Python English Version
Imported libraries
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import re  # used by constructFlowNetwork below
The flow network analysis algorithms
Import and visualize network data
Construct flow network from log file
demo data (each record is a [cookie, page] pair; records of the same cookie are consecutive and in click order):
dd=[['a',0],['a',1],['a',2],['b',1],['b',2],['c',1],['c',2],['c',3],['d',2],['d',3], ['e',0],['e',4],['f',0],['f',4],['g',0],['g',4],['g',5],['h',0],['h',5],['i',6]]
function definition:
def constructFlowNetwork(data):
    # Build a weighted, directed clickstream network from [cookie, page] records.
    # Consecutive pages of the same cookie form an edge; the last page of every
    # session is linked to the artificial node "sink".
    LastCookie = data[0][0]
    LastUrl = str(data[0][1])
    ed = {}
    for i in data[1:]:
        cookie = i[0]
        url = str(i[1])
        if cookie == LastCookie:
            eg = LastUrl + "\t" + url
        else:
            eg = LastUrl + "\t" + "sink"
        if eg in ed:
            ed[eg] += 1
        else:
            ed[eg] = 1
        LastCookie = cookie
        LastUrl = url
    fe = str(data[-1][1]) + "\t" + "sink"
    if fe in ed:
        ed[fe] += 1
    else:
        ed[fe] = 1
    G = nx.DiGraph()
    for i in ed.items():
        a = re.split("\t", i[0])
        G.add_edge(a[0], a[1], weight=i[1])
    return G
function usage:
G = constructFlowNetwork(dd)
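To inspect the result, the weighted edges can be printed. The snippet below is a sketch that is not part of the original page; it uses the Python 2 print syntax found elsewhere on this page.

for u, v, w in G.edges(data=True):
    print u, "->", v, w['weight']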
Visualize weighted networks
n = G.number_of_edges()
pos = nx.spring_layout(G)
colors = range(n)
nx.draw(G, pos, node_size=5, node_color="#A0CBE2", edge_color=colors,
        width=1, edge_cmap=plt.cm.Blues, with_labels=False)
#plt.savefig("E:/exampleNetwork.png")
The basic quantities of flow networks
Calculate the inflow and outflow of a node
outflow
def outflow(G, node):
    # Sum of the weights of the edges leaving the node
    n = 0
    for i in G.edges(node):
        n += G[i[0]][i[1]]['weight']
    return n
inflow
def inflow(G, node):
    # Inflow of a node is its outflow in the reversed network
    return outflow(G.reverse(), node)
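function usage (a sketch added here, not in the original; node labels are the stringified page ids produced by constructFlowNetwork):

print outflow(G, '0'), inflow(G, '0')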
Balance a flow network
function definition
def flowBalancing(G):
    # Add a "source" and a "sink" node so that, for every original node,
    # total inflow equals total outflow.
    RG = G.reverse()
    H = G.copy()
    def nodeBalancing(node):
        outw = 0
        for i in G.edges(node):
            outw += G[i[0]][i[1]]['weight']
        inw = 0
        for i in RG.edges(node):
            inw += RG[i[0]][i[1]]['weight']
        deltaflow = inw - outw
        if deltaflow > 0:
            H.add_edge(node, "sink", weight=deltaflow)
        elif deltaflow < 0:
            H.add_edge("source", node, weight=abs(deltaflow))
        else:
            pass
    for i in G.nodes():
        nodeBalancing(i)
    H.remove_edge("sink", "sink")  # drop the self-loop created when balancing the sink itself
    return H
function usage
H = flowBalancing(G)
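As a quick sanity check (a sketch, not part of the original page), every node of the balanced network except "source" and "sink" should now have equal inflow and outflow:

for node in H.nodes():
    if node not in ("source", "sink"):
        assert inflow(H, node) == outflow(H, node)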
Obtain the dissipation on each node
function definition
def networkDissipation(G):
    # For every node return [total flow, flow dissipated to the sink, flow injected from the source]
    d = []
    RG = G.reverse()
    def nodeDissipate(node):
        toSink = 0
        outw = 0
        for i in G.edges(node):
            outw += G[i[0]][i[1]]['weight']
            if i[1] == "sink":
                toSink += G[i[0]][i[1]]['weight']
        inw = 0
        for i in RG.edges(node):
            inw += RG[i[0]][i[1]]['weight']
        fromSource = outw - inw
        totalflow = max(inw, outw)
        d.append([totalflow, toSink, fromSource])
    for i in G.nodes():
        if i == "sink":
            continue
        nodeDissipate(i)
    return d
function usage
D = networkDissipation(G)
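The returned list is convenient to inspect as an array; the snippet below is a sketch (column order follows the definition above):

DA = np.array(D)
print DA[:, 0]  # total flow through each node
print DA[:, 1]  # flow dissipated directly to the sink
print DA[:, 2]  # flow injected from the source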
Derive fundamental matrix based on Markov analysis
import libraries
from numpy import linalg as LA
function definition
def getUmatrix(G):
    # Fundamental matrix U = (I - M)^-1 of the Markov chain defined by the balanced
    # flow network, with the source row/column removed at the end. (The original
    # version referenced an undefined variable AI and an unused F1oJ; both are removed here.)
    H = flowBalancing(G)
    F1 = nx.to_numpy_matrix(H)
    sourcep = H.nodes().index('source')
    sinkp = H.nodes().index('sink')
    F1[sinkp, sinkp] = 1                 # make the sink an absorbing state
    M = F1 / F1.sum(axis=1)              # row-normalize to transition probabilities
    M = np.delete(M, sinkp, 0)
    M = np.delete(M, sinkp, 1)
    I = np.identity(len(M))
    U = LA.inv(I - M)
    if sinkp > sourcep:                  # index of 'source' after removing the sink row/column
        U = np.delete(U, sourcep, 0)
        U = np.delete(U, sourcep, 1)
    else:
        U = np.delete(U, sourcep - 1, 0)
        U = np.delete(U, sourcep - 1, 1)
    return U
function usage
U = getUmatrix(G)
Calculate the average flow length
function definition
def flowLength(G):
    # Average length of a flow path, derived from the fundamental matrix
    H = flowBalancing(G)
    F1 = nx.to_numpy_matrix(H)
    sp = H.nodes().index('sink')
    F1[sp, sp] = 1
    M = F1 / F1.sum(axis=1)
    M = np.delete(M, sp, 0)
    M = np.delete(M, sp, 1)
    I = np.identity(len(M))
    U = LA.inv(I - M)
    L = np.sum(U[0, ])
    return L
function usage
L = flowLength(G)
Obtain the balanced flow and the circulated flow controlled by each node
import libraries
from numpy import linalg as LA
function definition
def AICI(G):
    # For every node return AI (the balanced flow through it) and CI (the circulated flow it controls)
    H = flowBalancing(G)
    F1 = nx.to_numpy_matrix(H)
    sourcep = H.nodes().index('source')
    sinkp = H.nodes().index('sink')
    F1oJ = F1[sourcep, ]          # flows leaving the source
    AI = F1.sum(0)                # total inflow of every node
    if sinkp > sourcep:
        F1oJ = np.delete(F1oJ, sourcep, 1)
        F1oJ = np.delete(F1oJ, sinkp - 1, 1)
        AI = np.delete(AI, sourcep, 1)
        AI = np.delete(AI, sinkp - 1, 1)
        F1[sinkp, sinkp] = 1
        M = F1 / F1.sum(axis=1)
        M = np.delete(M, sinkp, 0)
        M = np.delete(M, sinkp, 1)
        I = np.identity(len(M))
        U = LA.inv(I - M)
        U = np.delete(U, sourcep, 0)
        U = np.delete(U, sourcep, 1)
    else:
        F1oJ = np.delete(F1oJ, sinkp, 1)
        F1oJ = np.delete(F1oJ, sourcep - 1, 1)
        AI = np.delete(AI, sinkp, 1)
        AI = np.delete(AI, sourcep - 1, 1)
        F1[sinkp, sinkp] = 1
        M = F1 / F1.sum(axis=1)
        M = np.delete(M, sinkp, 0)
        M = np.delete(M, sinkp, 1)
        I = np.identity(len(M))
        U = LA.inv(I - M)
        U = np.delete(U, sourcep - 1, 0)
        U = np.delete(U, sourcep - 1, 1)
    def calculateCi(i):
        Ci = sum(U[i, :]) * sum(np.dot(F1oJ, U[:, i])) / U[i, i]
        return Ci
    CI = [calculateCi(i) for i in range(len(U))]
    AI = AI.tolist()[0]
    return np.transpose(np.vstack((AI, CI)))
function usage
AICIData = AICI(G)
Network re-normalization techniques
Calculate the box fractal dimension of networks
import libraries
from math import *
from networkx import *
import random
from copy import deepcopy
import os, sys
import time
import re
import statsmodels.api as sm
function copyright
#------download address:lev[dot]ccny[dot]cuny[dot]edu/~hmakse/modules[dot]py-----------------#
'''
Comments:
If you have any questions or find a bug write Hernan Rozenfeld an email at
hernanrozenfeld at domain gmail with the usual "dot com" at the end.
Code in Python (written by Hernan Rozenfeld):
For this code to run you'll need to install Python and Networkx.
File "modules.py" contains algorithms MEMB, CBB, and random covering for
network renormalization.
'''
function definition
1 Compact box burning algorithm
def CBB(G, lb):  # This is the compact box burning algorithm.
    """
    It returns a dictionary with {box_id: subgraph_generated_by_the_nodes_in_this_box}
    The box_id is the center of the box.
    The subgraphs may be disconnected.
    """
    uncovered_nodes = G.nodes()
    uncovered_nodes = set(uncovered_nodes)
    covered_nodes = set([])
    boxes_subgraphs = {}
    adj = G.adj
    while uncovered_nodes:
        center = random.choice(list(uncovered_nodes))
        nodes_visited = {center: 0}
        search_queue = [center]
        d = 1
        while len(search_queue) > 0 and d <= lb - 1:
            next_depth = []
            extend = next_depth.extend
            for n in search_queue:
                l = [i for i in adj[n].iterkeys() if i not in nodes_visited]
                extend(l)
                for j in l:
                    nodes_visited[j] = d
            search_queue = next_depth
            d += 1
        new_covered_nodes = set(nodes_visited.keys())
        new_covered_nodes = new_covered_nodes.difference(covered_nodes)
        nodes_checked_as_centers = set([center])
        while len(nodes_checked_as_centers) < len(new_covered_nodes):
            secondary_center = random.choice(list(new_covered_nodes.difference(nodes_checked_as_centers)))
            nodes_checked_as_centers.add(secondary_center)
            nodes_visited = {secondary_center: 0}
            search_queue = [secondary_center]
            d = 1
            while len(search_queue) > 0 and d <= lb - 1:
                next_depth = []
                extend = next_depth.extend
                for n in search_queue:
                    l = [i for i in adj[n].iterkeys() if i not in nodes_visited]  # faster than has_key? yep
                    extend(l)
                    for j in l:
                        nodes_visited[j] = d
                search_queue = next_depth
                d += 1
            nodes_covered_by_secondary = set(nodes_visited.keys())
            new_covered_nodes = new_covered_nodes.intersection(nodes_covered_by_secondary)
        boxes_subgraphs[center] = subgraph(G, list(new_covered_nodes))
        uncovered_nodes = uncovered_nodes.difference(new_covered_nodes)
        covered_nodes = covered_nodes.union(new_covered_nodes)
    return boxes_subgraphs
2 Random box covering algorithm
def random_box_covering(G, rb):
    """
    It returns a dictionary with {box_id: subgraph_generated_by_the_nodes_in_this_box}
    The box_id is the center of the box.
    """
    H = deepcopy(G)
    burned_nodes = []
    unburned_nodes = G.nodes()
    boxes_subgraphs = {}
    adj = H.adj
    while unburned_nodes:
        center_node = random.choice(unburned_nodes)
        nodes_visited = [center_node]
        search_queue = [center_node]
        d = 1
        while search_queue and d <= rb:
            next_depth = []
            extend = next_depth.extend
            for n in search_queue:
                l = [i for i in adj[n].iterkeys() if i not in nodes_visited]
                extend(l)
                nodes_visited.extend(l)
            search_queue = next_depth
            d += 1
        new_burned_nodes = nodes_visited  # .keys()
        H.remove_nodes_from(new_burned_nodes)
        boxes_subgraphs[center_node] = subgraph(G, new_burned_nodes)
        unburned_nodes = list(set(unburned_nodes) - set(new_burned_nodes))
    return boxes_subgraphs
3 MEMB algorithm
def MEMB(G, rb, cycle=0):
    """
    It returns a dictionary with {box_id: subgraph_generated_by_the_nodes_in_this_box}
    The box_id is the center of the box.
    cycle: Ignore this parameter. Use the default cycle=0.
    """
    adj = G.adj
    number_of_nodes = G.number_of_nodes()
    covered_nodes = set()
    center_nodes = set()
    non_center_nodes = G.nodes()
    center_node_found = 0
    boxes = {}  # this will be "box_id:[nodes in box]"
    central_distance_of_node = {}  # "node:central_distance"
    node_box_id = {}  # "node:box_id"
    nodes_sorted_by_central_distance = {}  # Dict with {central_distance:[nodes]}
    excluded_mass_of_non_centers_rb = {}  # This contains [(node:excluded_mass)] for rb
    excluded_mass_of_non_centers_rb2 = {}  # This contains [(node:excluded_mass)] for rb+1
    rb2 = rb + 1
    for node in non_center_nodes:
        #if node in [5000,10000,20000,30000]: print "node", node
        level = 0  # the current level
        nextlevel = {node: 1}  # list of nodes to check at next level
        paths_rb = None
        paths_rb2 = {node: [node]}  # paths dictionary (paths to key from source)
        while nextlevel:
            paths_rb = deepcopy(paths_rb2)
            thislevel = nextlevel
            nextlevel = {}
            for v in thislevel:
                for w in G.neighbors(v):
                    if not paths_rb2.has_key(w):
                        paths_rb2[w] = paths_rb2[v] + [w]
                        nextlevel[w] = 1
            level = level + 1
            if (rb2 <= level):
                break
        excluded_mass_of_node = len(paths_rb2)
        try:
            excluded_mass_of_non_centers_rb2[excluded_mass_of_node].append(node)
        except KeyError:
            excluded_mass_of_non_centers_rb2[excluded_mass_of_node] = [node]
        excluded_mass_of_node = len(paths_rb)
        try:
            excluded_mass_of_non_centers_rb[excluded_mass_of_node].append(node)
        except KeyError:
            excluded_mass_of_non_centers_rb[excluded_mass_of_node] = [node]
    maximum_excluded_mass = 0
    nodes_with_maximum_excluded_mass = []
    new_covered_nodes = {}
    center_node_and_mass = []
    cycle_index = 0
    while len(covered_nodes) < number_of_nodes:
        #print len(covered_nodes),number_of_nodes
        cycle_index += 1
        if cycle_index == cycle:
            rb2 = rb + 1
            cycle_index = 0
        else:
            rb2 = rb
        while 1:
            if rb2 == rb + 1:
                #t1=time.time()
                while 1:
                    maximum_key = max(excluded_mass_of_non_centers_rb2.keys())
                    node = random.choice(excluded_mass_of_non_centers_rb2[maximum_key])
                    if node in center_nodes:
                        excluded_mass_of_non_centers_rb2[maximum_key].remove(node)
                        if not excluded_mass_of_non_centers_rb2[maximum_key]:
                            del excluded_mass_of_non_centers_rb2[maximum_key]
                    else:
                        break
                nodes_visited = {}
                bfs = single_source_shortest_path(G, node, cutoff=rb2)
                for i in bfs:
                    nodes_visited[i] = len(bfs[i]) - 1
                excluded_mass_of_node = len(set(nodes_visited.keys()).difference(covered_nodes))
                if excluded_mass_of_node == maximum_key:
                    center_node_and_mass = (node, maximum_key)
                    excluded_mass_of_non_centers_rb2[maximum_key].remove(node)
                    if not excluded_mass_of_non_centers_rb2[maximum_key]:
                        del excluded_mass_of_non_centers_rb2[maximum_key]
                    new_covered_nodes = nodes_visited
                    break
                else:
                    excluded_mass_of_non_centers_rb2[maximum_key].remove(node)
                    if not excluded_mass_of_non_centers_rb2[maximum_key]:
                        del excluded_mass_of_non_centers_rb2[maximum_key]
                    try:
                        excluded_mass_of_non_centers_rb2[excluded_mass_of_node].append(node)
                    except KeyError:
                        excluded_mass_of_non_centers_rb2[excluded_mass_of_node] = [node]
                #print "time", time.time()-t1
            else:
                #t1=time.time()
                while 1:
                    maximum_key = max(excluded_mass_of_non_centers_rb.keys())
                    node = random.choice(excluded_mass_of_non_centers_rb[maximum_key])
                    if node in center_nodes:
                        excluded_mass_of_non_centers_rb[maximum_key].remove(node)
                        if not excluded_mass_of_non_centers_rb[maximum_key]:
                            del excluded_mass_of_non_centers_rb[maximum_key]
                    else:
                        break
                nodes_visited = {}
                bfs = single_source_shortest_path(G, node, cutoff=rb)
                for i in bfs:
                    nodes_visited[i] = len(bfs[i]) - 1
                excluded_mass_of_node = len(set(nodes_visited.keys()).difference(covered_nodes))
                if excluded_mass_of_node == maximum_key:
                    center_node_and_mass = (node, maximum_key)
                    excluded_mass_of_non_centers_rb[maximum_key].remove(node)
                    if not excluded_mass_of_non_centers_rb[maximum_key]:
                        del excluded_mass_of_non_centers_rb[maximum_key]
                    new_covered_nodes = nodes_visited
                    break
                else:
                    excluded_mass_of_non_centers_rb[maximum_key].remove(node)
                    if not excluded_mass_of_non_centers_rb[maximum_key]:
                        del excluded_mass_of_non_centers_rb[maximum_key]
                    try:
                        excluded_mass_of_non_centers_rb[excluded_mass_of_node].append(node)
                    except KeyError:
                        excluded_mass_of_non_centers_rb[excluded_mass_of_node] = [node]
                #print "time", time.time()-t1
        center_node_found = center_node_and_mass[0]
        boxes[center_node_found] = [center_node_found]
        node_box_id[center_node_found] = center_node_found
        non_center_nodes.remove(center_node_found)
        center_nodes.add(center_node_found)
        covered_nodes = covered_nodes.union(set(new_covered_nodes.keys()))
        #print len(covered_nodes)
        for i in new_covered_nodes:
            try:
                if central_distance_of_node[i] > new_covered_nodes[i]:
                    nodes_sorted_by_central_distance[central_distance_of_node[i]].remove(i)
                    if not nodes_sorted_by_central_distance[central_distance_of_node[i]]:
                        del nodes_sorted_by_central_distance[central_distance_of_node[i]]
                    try:
                        nodes_sorted_by_central_distance[new_covered_nodes[i]].append(i)
                    except KeyError:
                        nodes_sorted_by_central_distance[new_covered_nodes[i]] = [i]
                    central_distance_of_node[i] = new_covered_nodes[i]
            except KeyError:
                central_distance_of_node[i] = new_covered_nodes[i]
                try:
                    nodes_sorted_by_central_distance[new_covered_nodes[i]].append(i)
                except:
                    nodes_sorted_by_central_distance[new_covered_nodes[i]] = [i]
    max_distance = max(nodes_sorted_by_central_distance.keys())
    for i in range(1, max_distance + 1):
        for j in nodes_sorted_by_central_distance[i]:
            targets = list(set(adj[j].iterkeys()).intersection(set(nodes_sorted_by_central_distance[i - 1])))
            node_box_id[j] = node_box_id[random.choice(targets)]
            boxes[node_box_id[j]].append(j)
    boxes_subgraphs = {}
    for i in boxes:
        boxes_subgraphs[i] = subgraph(G, boxes[i])
    return boxes_subgraphs
function usage (note that fractalDimension below calls box_counting_dim, which is defined in subsection 4)
def alloRegressPlot(xdata, ydata, col, xlab, ylab):
    x = xdata; y = ydata
    xx = sm.add_constant(x, prepend=True)
    res = sm.OLS(y, xx).fit()
    constant = res.params[0]; beta = res.params[1]
    r2 = res.rsquared
    plt.plot(xdata, ydata, "o", color=col, markersize=5)
    plt.plot(xdata, constant + xdata * beta, "red")
    plt.text(min(xdata) + (max(xdata) - min(xdata)) / 10,
             min(ydata) + (max(ydata) - min(ydata)) / 2,
             str(np.round(beta, 2)))
    plt.xlabel(xlab)
    plt.ylabel(ylab)
def fractalDimension(G):
    data = box_counting_dim(G, lmax=20, num_covers=1)
    alloRegressPlot(np.log(np.array(data.keys())), np.log(np.array(data.values())),
                    "b", "l", "N")
d = fractalDimension(G)
4 Random box by BEDWARDS
function copyright
#------download address:networkx[dot]lanl[dot]gov/trac/ticket/359-----------------#
"""Return the fractal box counting dimension

Returns the box counting dimension given by:
"...the network is covered with Nb boxes, where all vertices in each of them
are connected by a minimum distance smaller than lb. Nb and lb are found to be
related by Nb proportional lb^-db where db is the fractal box dimension of the
network."

Note that Nb should be the minimal cover (ie inf(all Nb)); however, [2] note
that "...we find that minimization is not relevant and any covering gives rise
to the same exponent." It is unclear in the paper whether this is only for
networks that exhibit a scale free property, or for all networks, so use with
caution. We provide a value to calculate multiple random covers for each l.
It will then take the minimum.

Parameters
----------
G : graph
    The graph used to compute the fractal cluster dimension.
lmax : int
    Max l to calculate Nb
num_covers : int
    Number of covers to calculate for each l

Returns
-------
Nb(l) : dict of ints
    The number of boxes needed to cover the nodes given boxes of size l

References
----------
.. [1] L. da F. Costa, F.A. Rodrigues, G. Travieso, P.R. Villas Boas,
   "Characterization of complex networks: A survey of measurements",
   Advances in Physics, Volume 56, Issue 1, Jan 2007, pages 167-242
.. [2] C. Song, S. Havlin and H.A. Makse, Nature 433, 392 (2005).
"""
function definition
def size_box_cover(G, l):
    Nbl = 0
    not_covered = set(G.nodes())
    while not_covered:
        n = random.choice(list(not_covered))
        sp = nx.single_source_shortest_path_length(G, n, cutoff=l)
        box_nodes = set(sp.keys())
        not_covered.difference_update(box_nodes)
        Nbl = Nbl + 1
    return Nbl
def box_counting_dim(G, lmax=None, num_covers=1):
    if lmax is None:
        lmax = nx.diameter(G)
    Nb = {}
    for l in range(1, lmax + 1):
        Nblmin = G.order()
        for n in range(num_covers):
            Nbl = size_box_cover(G, l)
            if Nbl < Nblmin:
                Nblmin = Nbl
        Nb[l] = Nblmin
    return Nb
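function usage (a sketch added here, not in the original; it assumes an undirected copy of the demo flow network and keeps lmax small because that network is tiny):

UG = G.to_undirected()
Nb = box_counting_dim(UG, lmax=4, num_covers=3)
print Nb  # {box size l: smallest number of covering boxes found}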
Calculate the cluster fractal dimension of networks
1. by BEDWARDS
function copyright
#------download address:networkx[dot]lanl[dot]gov/trac/ticket/359-----------------#
"""Return the fractal cluster dimension

Returns the cluster dimension for a network. Given by
"...a seed vertex is chosen at random and a cluster is formed by vertices
distance at most l from the seed. This process is repeated many times and the
average mass of the resulting clusters is calculated as a function of l,
resulting in the relation <Mc> proportional l^df where the average mass <Mc>
is defined as the number of vertices in the cluster and df is the fractal
cluster dimension" [1]

Parameters
----------
G : graph
    The graph used to compute the fractal cluster dimension.
nsamples : int
    Number of random samples to use for each l
lmax : int
    Largest l to calculate dimension for

Returns
-------
Mc(l) : dict of floats
    The average mass for each l given in a dictionary keyed by l.

References
----------
.. [1] L. da F. Costa, F.A. Rodrigues, G. Travieso, P.R. Villas Boas,
   "Characterization of complex networks: A survey of measurements",
   Advances in Physics, Volume 56, Issue 1, Jan 2007, pages 167-242
"""
function definition
def average_cluster_mass(G, l, nsamples=10):
    try:
        nodes = random.sample(G.nodes(), nsamples)
    except ValueError:
        raise nx.NetworkXError("nsamples larger than number of nodes")
    mass = 0.0
    for n in nodes:
        sp = nx.single_source_shortest_path_length(G, n, cutoff=l)
        mass += len(sp) - 1
    return float(mass) / len(nodes)
def cluster_dim(G, nsamples=10, lmax=None):
    if lmax is None:
        lmax = nx.diameter(G)
    Mc = {}
    for l in range(1, lmax + 1):
        Mc[l] = average_cluster_mass(G, l, nsamples)
    return Mc
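function usage (a sketch added here, not in the original; nsamples must not exceed the number of nodes of the demo network, and the log-log slope can be read off with alloRegressPlot defined above):

UG = G.to_undirected()
Mc = cluster_dim(UG, nsamples=5, lmax=4)
alloRegressPlot(np.log(np.array(Mc.keys())), np.log(np.array(Mc.values())), "b", "l", "Mc")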
Consider the direction and weights of edges in network re-normalization
consider the direction of edges
Use G = nx.DiGraph() to keep the direction of edges. If the direction should not be considered, use H = nx.Graph() or H = G.to_undirected() instead.
consider weights
To take edge weights into account, replace
sp = nx.single_source_shortest_path_length(G,n,cutoff=l)
with
sp = nx.single_source_dijkstra_path_length(G,n,cutoff=l,weight='weight')
The change of basic flow network quantities in re-normalization
import libraries
import statsmodels.api as sm
function definition
def renormalizeMapping(G, l):
    replaceDic = {}
    not_covered = set(G.nodes())
    while not_covered:
        n = random.choice(list(not_covered))
        sp = nx.single_source_shortest_path_length(G, n, cutoff=l)
        #sp = nx.single_source_dijkstra_path_length(G, source=n, cutoff=l, weight='weight')
        box_nodes = set(sp.keys())
        for i in list(box_nodes):
            replaceDic[i] = n
        not_covered.difference_update(box_nodes)
    return replaceDic
calculate quantities of interest in each step of re-normalization
def renormalizeStat(G, l):
    mapping = renormalizeMapping(G, l)
    G = nx.relabel_nodes(G, mapping)
    G.remove_edges_from(G.selfloop_edges())
    n = nx.number_of_nodes(G)
    m = nx.number_of_edges(G)
    w = G.size(weight='weight')
    return [n, m, w]
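function usage (a sketch added here, not in the original): track how the node count, edge count, and total flow shrink as the box size l grows.

for l in range(1, 4):
    print l, renormalizeStat(G, l)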
The re-normalization of flow networks containing several components
def renormalizeStat(G, l):
    if "sink" in G.nodes():
        G.remove_node("sink")
    g = nx.weakly_connected_component_subgraphs(G)[0]
    mapping = renormalizeMapping(g, l)
    g = nx.relabel_nodes(g, mapping)
    g.remove_edges_from(g.selfloop_edges())
    n = nx.number_of_nodes(g)
    m = nx.number_of_edges(g)
    h = flowBalancing(g)
    uv = outflow(h, "source")
    pv = h.size(weight='weight') - uv
    return [n, m, uv, pv]
The growth network models
Fractal-Small world networks by Song et al.
def fractal_model(generation, m, x, e):
    """
    Returns the fractal model introduced by Song, Havlin, Makse in
    Nature Physics 2, 275.
    generation = number of generations
    m = number of offspring per node
    x = number of connections between offsprings
    e = probability that hubs stay connected
    1-e = probability that x offsprings connect.
    If e=1 we are in MODE 1 (pure small-world).
    If e=0 we are in MODE 2 (pure fractal).
    """
    G = Graph()
    G.add_edge(0, 1)  # This is the seed for the network (generation 0)
    node_index = 2
    for n in range(1, generation + 1):
        all_links = G.edges()
        while all_links:
            link = all_links.pop()
            new_nodes_a = range(node_index, node_index + m)
            #random.shuffle(new_nodes_a)
            node_index += m
            new_nodes_b = range(node_index, node_index + m)
            #random.shuffle(new_nodes_b)
            node_index += m
            G.add_edges_from([(link[0], node) for node in new_nodes_a])
            G.add_edges_from([(link[1], node) for node in new_nodes_b])
            repulsive_links = zip(new_nodes_a, new_nodes_b)
            G.add_edges_from([repulsive_links.pop() for i in range(x - 1)])
            if random.random() > e:
                G.remove_edge(*link)
                G.add_edge(*repulsive_links.pop())
    return G
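function usage (a sketch added here, not in the original; the parameter values are only for illustration): generate a small pure-fractal network (e=0) and, if desired, estimate its box dimension with the functions above.

FG = fractal_model(generation=3, m=2, x=2, e=0)
print FG.number_of_nodes(), FG.number_of_edges()
Nb = box_counting_dim(FG, lmax=6)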
Data analysis tools
Pearson correlation
import libraries
from scipy.stats.stats import pearsonr
function usage
pearsonr(x, y)
Uni-variate ordinary least square regression
import libraries
import statsmodels.api as sm
fit parameters
def OLSRegressFit(x, y):
    xx = sm.add_constant(x, prepend=True)
    res = sm.OLS(y, xx).fit()
    constant, beta = res.params
    r2 = res.rsquared
    return [constant, beta, r2]
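function usage (a sketch added here, not in the original; the data are made up for illustration):

x = np.arange(10.0)
y = 2 * x + 1 + np.random.normal(0, 0.1, 10)
constant, beta, r2 = OLSRegressFit(x, y)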
plot data points and the regression line
def OLSRegressPlot(x, y, col, xlab, ylab):
    xx = sm.add_constant(x, prepend=True)
    res = sm.OLS(y, xx).fit()
    constant, beta = res.params
    r2 = res.rsquared
    plt.plot(x, y, "o", color=col, markersize=5)
    plt.plot(x, constant + x * beta, "red")
    plt.text(min(x) + (max(x) - min(x)) / 10,
             min(y) + (max(y) - min(y)) / 2,
             str(np.round(beta, 2)))
    plt.xlabel(xlab)
    plt.ylabel(ylab)
Uni-variate orthogonal regression
import libraries
import scipy.odr
fit parameters
def othorgonalRegressFit(x, y):
    def f(p, x):
        return p[0] + p[1] * x
    myModel = scipy.odr.Model(f)
    myData = scipy.odr.Data(x, y)
    myOdr = scipy.odr.ODR(myData, myModel, beta0=[1, 1])
    myOdr.set_job(fit_type=0)
    out = myOdr.run()
    constant, beta = out.beta
    SSE = sum((constant + x * beta - y) ** 2)
    SST = SSE + sum((y - np.mean(y)) ** 2)
    r2 = 1 - SSE / SST
    return [constant, beta, r2]
plot data points and the regression line
def othorgonalRegressPlot(x, y, col, xlab, ylab):
    def f(p, x):
        return p[0] + p[1] * x
    myModel = scipy.odr.Model(f)
    myData = scipy.odr.Data(x, y)
    myOdr = scipy.odr.ODR(myData, myModel, beta0=[1, 1])
    myOdr.set_job(fit_type=0)
    out = myOdr.run()
    constant, beta = out.beta
    plt.plot(x, y, "o", color=col, markersize=5)
    plt.plot(x, constant + x * beta, "red")
    plt.text(min(x) + (max(x) - min(x)) / 10,
             min(y) + (max(y) - min(y)) / 2,
             str(np.round(beta, 2)))
    plt.xlabel(xlab)
    plt.ylabel(ylab)
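function usage (a sketch added here, not in the original; the values are made up, and x and y should be numpy arrays so the residuals vectorize):

x = np.arange(20.0)
y = 0.5 * x + np.random.normal(0, 0.2, 20)
constant, beta, r2 = othorgonalRegressFit(x, y)
othorgonalRegressPlot(x, y, "b", "x", "y")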
Multi-variate ordinary least square regression
import libraries
import statsmodels.api as sm
fit parameters
def OLSMRegressFit(x1, x2, y):
    x = np.column_stack((x1, x2))
    x = sm.add_constant(x, prepend=True)
    res = sm.OLS(y, x).fit()
    constant, beta1, beta2 = res.params
    return [constant, beta1, beta2]
log-binned data
def binData(x, y, binScale):  # 0 < binScale < 1
    data = np.transpose(np.vstack((np.array(x), np.array(y))))
    data = np.log(data + 1)
    rs = np.arange(np.floor(min(data[:, 0])), np.ceil(max(data[:, 0])), binScale)
    rs = np.transpose(np.vstack((rs[:-1], rs[1:])))
    ps = []
    def binf(rang):
        try:
            d1 = data[data[:, 0] >= rang[0]]
            d2 = d1[d1[:, 0] < rang[1]]
            if np.mean(d2[:, 0]) > 0 and np.mean(d2[:, 1]) > 0:
                ps.append([np.mean(d2[:, 0]), np.mean(d2[:, 1])])
        except:
            pass
    for rang in rs:
        binf(rang)
    ps = np.array(ps)
    return ps
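function usage (a sketch added here, not in the original; the heavy-tailed data are made up): binData returns the log-binned (log x, log y) averages, which can then be fitted with OLSRegressFit above.

x = np.random.pareto(2, 1000)
y = x ** 1.5 * np.exp(np.random.normal(0, 0.3, 1000))
ps = binData(x, y, 0.2)
constant, beta, r2 = OLSRegressFit(ps[:, 0], ps[:, 1])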
Fit the Discrete Generalized Beta Distribution
import libraries
import statsmodels.api as sm
fit parameters
def DGBDfFit(data):
    t = np.array(sorted(data, key=lambda x: -x))  # values in decreasing order
    r = np.array(range(1, len(data) + 1))         # ranks
    y = np.log(t)
    x1 = np.log(max(r) + 1 - r)
    x2 = np.log(r)
    x = np.column_stack((x1, x2))
    x = sm.add_constant(x, prepend=True)
    res = sm.OLS(y, x).fit()
    [A, b, a] = res.params
    return [A, b, a]
plot data points and the fitted curve
def DGBDPlot(data):
    t = np.array(sorted(data, key=lambda x: -x))
    r = np.array(range(1, len(data) + 1))
    y = np.log(t)
    x1 = np.log(max(r) + 1 - r)
    x2 = np.log(r)
    x = np.column_stack((x1, x2))
    x = sm.add_constant(x, prepend=True)
    res = sm.OLS(y, x).fit()
    [A, b, a] = res.params
    plt.plot(r, t, "o", color="b")
    plt.plot(r, np.exp(A) * (max(r) + 1 - r) ** b * r ** a, "r-")
    plt.yscale('log')
    plt.text(max(r) / 2, max(t) / 100, "b=" + str(round(b, 2)) + ", a=" + str(round(a, 2)))
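function usage (a sketch added here, not in the original; any positive-valued sample works, the lognormal draw below is only for illustration):

data = np.random.lognormal(3, 1, 500)
A, b, a = DGBDfFit(data)
DGBDPlot(data)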
Fit the Power law distribution with exponential cut-off
import libraries
import statsmodels.api as sm import random
fit parameters
def powerLawExponentialCutOffFit(data):
    t = np.array(sorted(data, key=lambda x: -x))  # values in decreasing order
    r = np.array(range(len(data))) + 1
    r = r / float(np.max(r))                      # cumulative probability
    y = np.log(r)
    x1 = np.log(t)
    x2 = t
    x = np.column_stack((x1, x2))
    x = sm.add_constant(x, prepend=True)
    res = sm.OLS(y, x).fit()
    L, alpha, lam = res.params                    # 'lambda' is a reserved word in Python
    return [L, alpha, lam]
plot data points and the fitted curve
def powerLawExponentialCutOffPlot(data):
    t = np.array(sorted(data, key=lambda x: -x))
    r = np.array(range(len(data))) + 1
    r = r / float(np.max(r))
    y = np.log(r)
    x1 = np.log(t)
    x2 = t
    x = np.column_stack((x1, x2))
    x = sm.add_constant(x, prepend=True)
    res = sm.OLS(y, x).fit()
    L, alpha, lam = res.params
    plt.plot(t, r, ".", color="SteelBlue", alpha=0.75, markersize=3)
    plt.plot(t, np.exp(L) * t ** alpha * np.exp(lam * t), "r-")
    plt.xscale('log'); plt.yscale('log')
    plt.xlabel(r'value')
    plt.ylabel(r'cumulative probability')
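function usage (a sketch added here, not in the original; the sample below is made up and only needs to be positive):

data = np.random.exponential(5, 1000) + 1
L, alpha, lam = powerLawExponentialCutOffFit(data)
powerLawExponentialCutOffPlot(data)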
Tricks of Python programming
Import file separated by "\t" symbol
data = []
f = open('E:\xxx', 'rb')
for line in f.readlines():
    line = line.strip().split("\t")
    # add a filter here if needed, such as: if line[0] == 1
    data.append(line)
f.close()
Export file separated by "\t" symbol
f = open("E:/xxx", "wb") for i in data: f.write("\t".join(map(str,i)) + "\n") f.close()
Select rows with all elements greater than 0 in an array
d = d[d.all(1)]
Encoding/decoding
s = s.decode('gb2312').encode('utf-8')
Obtain a list of continuous days
import libraries
import datetime
function definition
def getDays(year1, month1, day1, year2, month2, day2):
    dt = datetime.datetime(year1, month1, day1)
    end = datetime.datetime(year2, month2, day2)
    step = datetime.timedelta(days=1)
    days = []
    while dt < end:
        days.append(dt.strftime('%Y%m%d'))
        dt += step
    return days
function usage
days = getDays(2013, 2, 27, 2013, 4, 27)
Plot a histogram
function definition
def histogram(data, intervals, color, xlab):
    weights = np.ones_like(data) / len(data)
    plt.hist(data, intervals, facecolor=color, weights=weights, alpha=0.75)
    plt.xlabel(xlab)
function usage
histogram(x, 30, 'green', r'$\theta$')
plot sub-figures
plt.figure(num=None, figsize=(12, 4), dpi=80, facecolor='w', edgecolor='k')
plt.subplot(131)
plt.plot(1, 1, "bo")
plt.subplot(132)
plt.plot(1, 1, "ro")
plt.subplot(133)
plt.plot(1, 1, "mo")
plt.tight_layout(pad=0.4, w_pad=0, h_pad=1.0)
Check whether all characters in a string "s" are numeric
s.isdigit()