forked from RichieJu520/Co-occurrence_Network_Analysis
-
Notifications
You must be signed in to change notification settings - Fork 1
/
3.Random_vs_observed_cooccurrence.py
170 lines (147 loc) · 5.37 KB
/
3.Random_vs_observed_cooccurrence.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
# -*- coding: utf-8 -*-
"""
@author: Feng Ju
@email: [email protected]
The script was written and tested in python 2.7
###@cite Ju F, Xia Y, Guo F, Wang ZP, Zhang T. 2014.
###@Taxonomic relatedness shapes bacterial assembly in activated sludge of globally distributed wastewater treatment plants.
###@Environmental Microbiology. 16(8):2421-2432
"""
print 'This script is written for adding attribute to each NODE in GML files before imported into Gehpi'
print 'This script also calculate the random and observed incidences of co-occurrence between differnt types of network nodes!'
print 'The map file is a tab-delimited file with node ID (col 1) and type of node (col 2)
print 'The gml file is the gml-format network file generated from the R scripts'
while True:
Parameters=raw_input("Enter parameters [map file](.map) and [GML file] sepeated by Space: ")
try:
P1=Parameters.strip().split(' ')[0]
P2=Parameters.strip().split(' ')[1]
break
except:
print 'errors: invalid input format or not enough input !'
continue
a={}
for line in open(P1,'r'):
try:
a[line.rstrip().split('\t')[0]]=line.rstrip().split('\t')[1]
except:
print 'Pls provdied a tab-delimited txt file!'
f=open(P2.replace('.gml','') +'.modified.gml','w')
f1=open(P1 +'.map','w')
b=[]
i=0
for line in open(P2,'r'):
if 'label' not in line:
f.write(line)
else:
i+=1
name = line.strip().split('"')[1]
f.write(line)
try:
f1.write(name+'\t'+str(a[name])+'\n')
f.write(' '+'order'+' '+str(a[name])+'\n')
except KeyError:
print name
print i,'items were added into the node!'
print 'OK, work finished!'
f.close()
f1.close()
X1 = P1 +'.map'
X2 = P2.replace('.gml','') +'.modified.gml'
f=open(X2.replace('.gml','')+'_Observed_VS_Random.xls','w')
f1=open(X2.replace('.gml','')+'_edge_properties.xls','w')
f2=open(X2.replace('.gml','')+'_node_properties.xls','w')
a1,a2={},{}
lis=[]
for line in open(X1,'r'):
a1[line.strip().split('\t')[0]]=line.strip().split('\t')[1]
lis.append(line.strip().split('\t')[1])
print len(a1), 'Node-affiliation pairs!'
dic1={}
lis_U=list(set(lis))
for item in lis_U:
dic1[item]=lis.count(item)
f2.write('\t'.join(['id','nmae','phylum','degree'])+'\n')
b,c,d,e,g = [],[],[],[],[]
for line in open(X2,'r'):
if ' id ' in line:
f2.write(line.strip().split(' ')[1]+'\t')
b.append(line.strip().split(' ')[1])
elif ' name' in line:
f2.write(line.strip().split(' ')[1].replace('"','')+'\t')
c.append(line.strip().split(' ')[1].replace('"',''))
f2.write(a1[line.strip().split(' ')[1].replace('"','')]+'\t')
elif ' degree' in line:
f2.write(line.strip().split(' ')[1]+'\n')
elif 'source' in line:
d.append(line.strip().split(' ')[1])
elif 'target' in line:
e.append(line.strip().split(' ')[1])
elif 'weight' in line:
g.append(line.strip().split(' ')[1])
else:
continue
print len(b), len(c),'nodes!'
print len(d), len(e),'edges!'
for i in range(len(b)):
a2[b[i]]=c[i]
j,k=0,0
n=[]
p=[]
f1.write('Source'+'\t'+'Phylum'+'\t'+'Target'+'\t'+'Phylum'+'\t'+'Weight'+'\n')
for m in range(len(g)):
if a1[a2[d[m]]]==a1[a2[e[m]]]:
f1.write(str(a2[d[m]])+'\t'+str(a1[a2[d[m]]])+'\t'+str(a2[e[m]])+'\t'+str(a1[a2[e[m]]])+'\t'+str(g[m])+'\n')
j+=1
n.append(a1[a2[d[m]]]+'__'+a1[a2[e[m]]])
else:
f1.write(str(a2[d[m]])+'\t'+str(a1[a2[d[m]]])+'\t'+str(a2[e[m]])+'\t'+str(a1[a2[e[m]]])+'\t'+str(g[m])+'\n')
k+=1
p.append(a1[a2[d[m]]]+'__'+a1[a2[e[m]]])
print j, 'Internal-type cooccurence!'
print k, 'External-type cooccurence!'
n1=list(set(n))
n1.sort()
dic2={}
for item in n1:
dic2[item]=str(n.count(item))
p1=list(set(p))
p1.sort()
for item in p1:
i1=p.count(item)
i2=p.count(item.split('__')[1]+'__'+item.split('__')[0])
dic2[item]=i1+i2
if i2!=0:
p1.remove(item.split('__')[1]+'__'+item.split('__')[0])
else:
continue
f.write(str(len(c))+' '+'nodes and'+' '+ str(len(e))+' '+'edges in the network!'+'\n')
f.write(str(j)+' '+'edges with internal-type cooccurence!'+'\n')
f.write(str(k)+' '+'edges with external-type cooccurence!'+'\n')
f.write('N1__N2'+'\t'+'N1-freq'+'\t'+'N2-freq'+'\t'+'Edges'+'\t'+'Random'+'\t'+'Observed'+'\t'+'O/R-ratio'+'\n')
for key in n1:
i1=dic1[key.split('__')[0]]
i2=dic1[key.split('__')[1]]
if key.split('__')[0]==key.split('__')[1]:
random=100*float(i1*(i2-1))/(len(c)*(len(c)-1))
else:
random=2*100*float(i1*i2)/(len(c)*(len(c)-1))
observed=100*float(dic2[key])/len(e)
ratio=observed/random
f.write('\t'.join([key, str(i1), str(i2), str(dic2[key]), str(random), str(observed), str(ratio)])+'\n')
for key in p1:
i1=dic1[key.split('__')[0]]
i2=dic1[key.split('__')[1]]
if key.split('__')[0]==key.split('__')[1]:
random=100*float(i1*(i2-1))/(len(c)*(len(c)-1))
else:
random=2*100*float(i1*i2)/(len(c)*(len(c)-1))
observed=100*float(dic2[key])/len(e)
ratio=observed/random
f.write('\t'.join([key, str(i1), str(i2), str(dic2[key]), str(random), str(observed), str(ratio)])+'\n')
f.write('\n')
f.write('Node-affiliation'+'\t'+'Nodes_freq'+'\n')
list_s = sorted(dic1, key=dic1.__getitem__, reverse = True)
for key in list_s:
f.write(key+'\t'+str(dic1[key])+'\n')
print 'OK, finished!'