masterscript.py
#!/usr/bin/env python
## Jeff Law, March 2022
## Master script for running the ToxCast pipeline
print("Importing libraries")
from optparse import OptionParser,OptionGroup
from src.utils import file_utils as utils
from src import toxcast_utils as t_utils
from src import interactome_utils as i_utils
# script to hold the different versions and settings and such
from src import toxcast_settings as t_settings
# used for running edgelinker in a parallel manner
from src import run_edgelinker
import os
import sys
import subprocess
import re
#import src.map_chemicals as map_chemicals
# Add /data/jeff-law/tools/tqdm to your python path to give the progress bar functionality.
from tqdm import tqdm
# for checking to see if this script is being run on baobab
import socket
TOXCAST_DATA = None
# map from each chemical to its assayed accessions, receptors, and TFs; loaded on demand
CHEMICAL_MAP = None
chemIDtoName, chemNametoID = t_utils.getChemicalNameMaps()
def main(args):
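    """ Parse the command-line options and, for each requested interactome
    version, run the selected pipeline steps: input generation, response
    networks, random networks, statistical significance, figures, and overlap.
    """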
global VERSION, DATADIR, INTERACTOMES
global RESULTSPREFIX, INPUTSPREFIX, REC_TFS_FILE
global PATHLINKERDATAVERSIONS, TOXCAST_DATA
global REC_TFS_PENALTY
## parse arguments
opts = parseArguments(args)
for version in opts.version:
VERSION = version
# settings are now stored in the src/toxcast_settings.py script
INPUTSPREFIX, RESULTSPREFIX, interactome = t_settings.set_version(VERSION)
REC_TFS_PENALTY = t_settings.REC_TFS_PENALTY
EDGE_PENALTY = t_settings.EDGE_PENALTY
DATADIR = t_settings.DATADIR
        # these are used below when running the netpath/kegg pathway-specific versions
        PATHLINKERDATAVERSIONS = t_settings.PATHLINKERDATAVERSIONS
        INTERACTOMES = t_settings.INTERACTOMES
REC_TFS_FILE = t_settings.REC_TFS_FILE
# create the directories if they don't exist
t_utils.checkDir(RESULTSPREFIX)
t_utils.checkDir(INPUTSPREFIX)
# check if the inputs are there
if opts.forceinputs or not os.path.isdir("%s/rec-tfs" % INPUTSPREFIX):
# load the toxcast data
TOXCAST_DATA = t_utils.loadToxcastData(t_settings.INTERACTOMES[VERSION])
if VERSION in t_settings.ZSCORE_PENALTY:
print("Adding a zscore cost to the sources/targets")
include_zscore_weight = True
else:
include_zscore_weight = False
# generate the perturbed rec and tf files as well as the chemicals.txt file
TOXCAST_DATA.write_chemical_perturbed_rec_tfs(
"%s/chemicals.txt" % INPUTSPREFIX, "%s/rec-tfs" % INPUTSPREFIX,
include_zscore_weight=include_zscore_weight)
else:
TOXCAST_DATA = t_utils.loadToxcastData(t_settings.INTERACTOMES[VERSION])
        # these chemicals have at least 1 hit rec and tf
        # make this a list so it can be indexed and extended under Python 3
        chemicals = list(TOXCAST_DATA.chemical_rec.keys())
# If we're penalizing the undirected edges, add that penalty now
if VERSION in t_settings.UNDIR_PENALTIES and not os.path.isfile(interactome):
i_utils.addUndirPenalty(
t_settings.INTERACTOMES[VERSION], interactome, penalty=t_settings.UNDIR_PENALTIES[VERSION])
        # run only one or a few chemicals, e.g. as a test run before starting everything
        if opts.single_run:
#chemicals = [chemicals[0]]
#chemicals = [chemNametoID['Tiratricol']]
chemicals = sorted(opts.single_run)
# if the user specified to use the 'single_chem.txt' file, use that
if 'file' in opts.single_run:
del chemicals[chemicals.index('file')]
single_chem_file = "%s/single_chem.txt" % INPUTSPREFIX
if os.path.isfile(single_chem_file):
print("Using %d chemicals plus those listed in '%s'" % (len(chemicals), single_chem_file))
chemicals += sorted(utils.readItemList(single_chem_file))
else:
print("file '%s' not found." % (single_chem_file))
print("Using version '%s' with %d chemicals" % (VERSION, len(chemicals)))
if opts.response_networks:
generateResponseNetworks(chemicals, interactome, max_k=opts.edgelinker_k, pathlinker=opts.pathlinker,
edge_penalty=EDGE_PENALTY, rec_tfs_penalty=REC_TFS_PENALTY, forced=opts.forcealg)
# if opts.graphspace:
# call_post_to_graphspace(chemicals, interactome, opts)
scopes = opts.scope
        if scopes is None:
            # still make a single pass with scope=None so the options that
            # don't require a scope (e.g. --overlap) will run
            scopes = [None]
for scope in scopes:
            if opts.random_networks:
                if scope is None:
                    sys.exit("ERROR: --scope must be specified with --random-networks")
                if 'permute' in scope:
                    print("Permuting the interactome (%d edge-swap iterations each), and then running edgelinker on each of the permuted networks" % (10))
                else:
                    print("Permuting the set of receptors and TFs for each chemical, then running edgelinker on the original interactome")
print("Using scope '%s'" % (scope))
edgelinkerPermuteNetworks(interactome, scope, opts.num_random_networks, opts.edgelinker_k, opts.k_to_test, opts.forcealg,
opts.parallel_processes, opts.super_computer, num_iterations=10,
printonly=opts.printonly, cleanup=opts.cleanup, single_chem=opts.single_run,
send_email=opts.send_email)
# TODO update this option for the new statistical test
if opts.stat_sig:
print("Using scope '%s'" % (scope))
compute_stat_sig(scope, opts.num_random_networks, opts.k_to_test, forced=opts.forcesig)
if opts.gen_figs:
gen_figs(scope, opts.k_to_test, forced=opts.forcesig)
# keep track of this for later
total_num_chemicals = len(chemicals)
curr_chemicals = chemicals
# get only the chemicals that have a significant response network
if opts.sig_cutoff:
k = 150
                if opts.k_to_test and len(opts.k_to_test) == 1:
                    k = opts.k_to_test[0]
sig_chemicals = t_utils.getSigChemicals(RESULTSPREFIX, scope, sig_cutoff_type=opts.sig_cutoff_type, sig_cutoff=opts.sig_cutoff, k=k)
nonsig_chemicals = set(chemicals).difference(sig_chemicals)
# write these to a file
sig_chemicals_file = "%s/sig-chemicals.txt" % (INPUTSPREFIX)
print("Writing the current set of significant chemicals to %s" % (sig_chemicals_file))
with open(sig_chemicals_file, 'w') as out:
header = "# chemicals from k: %s, scope: '%s', sig_cutoff_type: '%s', sig_cutoff: '%s'\n" % (k, scope, opts.sig_cutoff_type, opts.sig_cutoff)
out.write(header)
out.write('\n'.join(sig_chemicals))
nonsig_chemicals_file = "%s/nonsig-chemicals.txt" % (INPUTSPREFIX)
print("Writing the current set of non-significant chemicals to %s" % (nonsig_chemicals_file))
with open(nonsig_chemicals_file, 'w') as out:
header = "# chemicals from scope: '%s', sig_cutoff_type: '%s', sig_cutoff: '%s'\n" % (scope, opts.sig_cutoff_type, opts.sig_cutoff)
out.write(header)
out.write('\n'.join(nonsig_chemicals))
# use the sig chemicals as the set of chemicals for the rest of the scripts
curr_chemicals = sig_chemicals
# add a prefix to the output files showing which significance settings were used for this set of chemicals
sig_pref = 'all'
if opts.sig_cutoff:
sig_pref = "%s-%s-%s" % (scope, opts.sig_cutoff_type, str(opts.sig_cutoff).replace('.','_'))
if opts.overlap:
# TODO allow generating both with a single call to the master script
if sig_pref == "all":
write_overlap(chemicals_file="%s/chemicals.txt" % (INPUTSPREFIX), sig_pref=sig_pref, forced=opts.forcesig)
else:
# write both the sig and non-sig chemicals overlap
write_overlap(chemicals_file="%s/sig-chemicals.txt" % (INPUTSPREFIX), sig_pref=sig_pref, forced=opts.forcesig)
nonsig_pref = "nonsig-%s-%s-%s" % (scope, opts.sig_cutoff_type, str(opts.sig_cutoff).replace('.','_'))
write_overlap(chemicals_file="%s/nonsig-chemicals.txt" % (INPUTSPREFIX), sig_pref=nonsig_pref, forced=opts.forcesig)
if opts.overlap_graph:
                # this also uses the sig-chemicals file; the function handles reading it
post_overlap_graph_to_graphspace(
interactome, opts.overlap_graph, total_num_chemicals,
num_sig_chemicals=len(curr_chemicals), sig_pref=sig_pref, forced=opts.forcesig
)
# --------------------------------------------------
# -------------------- END MAIN --------------------
def generateResponseNetworks(chemicals, interactome, max_k=1000, pathlinker=False, k=500,
edge_penalty=None, rec_tfs_penalty=False, forced=False):
""" Generate a response network for each chemical by running EdgeLinker (or PathLinker) using the set of responsive receptors and TFs as the sources and targets
*chemicals*: the chemicals to generate response networks for
*max_k*: the total number of paths to write for edgelinker output
*pathlinker*: run pathlinker instead of edgelinker
*k*: number of paths to output for pathlinker
*forced*: overwrite the output files if they already exist
"""
# run pathlinker on every chemical,
print("Creating response networks for %d chemicals using the interactome '%s'" % (len(chemicals), interactome))
    # the group run below reads the interactome (the bottleneck) only once; not yet implemented for pathlinker
#t_utils.checkDir(out_dir)
#run_edgelinker.runGroupEdgeLinker(interactome, rec_tfs_file, "%s/%s"%(out_dir, chemical), max_k)
if pathlinker:
print("Using PathLinker to generate the response networks")
else:
print("Using EdgeLinker to generate the response networks")
if edge_penalty is not None:
print("\tedge penalty: %s" % (str(edge_penalty)))
if rec_tfs_penalty is True:
print("\tapplying penalty to sources and targets (receptors and tfs)")
out_dir = "%s/edgelinker"%(RESULTSPREFIX)
t_utils.checkDir(out_dir)
out_pref = "%s/" % out_dir
edgelinker_in_files, edgelinker_out_files = run_edgelinker.createGroupEdgeLinker(chemicals, INPUTSPREFIX, out_pref)
print("Running edgelinker on %d chemicals: %s, %s" % (len(chemicals), edgelinker_in_files, edgelinker_out_files))
run_edgelinker.runEdgeLinker(interactome, edgelinker_in_files, edgelinker_out_files, max_k,
edge_penalty=edge_penalty, rec_tfs_penalty=rec_tfs_penalty, multi_run=True)
# check to see if any of them failed, and output the failed runs
for chemical in chemicals:
results_file = "%s/%s-paths.txt"%(out_dir, chemical)
if not os.path.isfile(results_file) or os.path.getsize(results_file) < 50:
sys.stderr.write("ERROR: failed to generate response network for chemical: %s\n" % (chemical))
# cleanup the errored out files
results_file2 = "%s/%s-ranked-edges.txt"%(out_dir, chemical)
os.remove(results_file)
os.remove(results_file2)
# the rest is for running pathlinker so stop here
return
for chemical in tqdm(chemicals):
# if version is netpath or kegg, use the different type of input file
if VERSION in PATHLINKERDATAVERSIONS:
interactome = INTERACTOMES[VERSION] % chemical
#print("using interactome %s" % interactome)
#chemical = chemicals[i]
#print("\n----- %s %s %d -----"%(CHEMICAL_MAP.chemIDtoName[chemical], chemical, i))
#interactomes = [opts.interactome]
# chemical_weighted_interactomes = []
#if opts.change_weight:
# pass
# for change_weight in opts.change_weight:
# #for change_weight in [0.3,0.1]:
# increase_weight = 1 + change_weight
# decrease_weight = 1 - change_weight
# TODO change the dir to be in the inputs
# #ids_file = "results/chemicals/change_weights/%s-intermediate-acc.txt"%chemical
# t_utils.checkDir("results/chemicals/chemical_weights")
# interactome_name = opts.interactome.split('/')[-1]
# altered_interactome = "results/chemicals/chemical_weights/%s-%s-%s"%(chemical, str(increase_weight), interactome_name)
# write_altered_interactome(CHEMICAL_MAP, chemical, opts.interactome, altered_interactome, increase_weight, decrease_weight)
# chemical_weighted_interactomes.append((altered_interactome, change_weight))
#else:
# chemical_weighted_interactomes = [(interactome, '')]
#
# # loop through the change weight options
# for interactome, change_weight in chemical_weighted_interactomes:
#
# if change_weight != '':
# # setup the output dir to be something like "weighted-1_10"
# change_weight = str(change_weight)[-1] + '_10'
#
# #rec_tfs_file = "results/chemicals/pathlinker-inputs/%s-rec-tfs.txt"%chemical
rec_tfs_file = REC_TFS_FILE % (INPUTSPREFIX, chemical)
if pathlinker:
out_dir = "%s/pathlinker"%(RESULTSPREFIX)
t_utils.checkDir(out_dir)
pathlinker_output_file = "%s/%sk_%s-ranked-edges.txt"%(out_dir, chemical, k)
if not os.path.isfile(pathlinker_output_file) or forced:
#runPathlinker(opts.interactome, rec_tfs_file, opts.k, out_dir+'/'+chemical, log_file='results/chemicals/log_files/%s.log'%chemical, ids=ids_file, cost_change=opts.change_weight)
#runPathlinker(interactome, rec_tfs_file, opts.k, out_dir+'/'+chemical, log_file='', ids=ids_file, cost_change=opts.change_weight)
runPathlinker(interactome, rec_tfs_file, k, out_dir+'/'+chemical, log_file='')
#if opts.edgelinker:
else:
out_dir = "%s/edgelinker"%(RESULTSPREFIX)
t_utils.checkDir(out_dir)
edgelinker_output_file = "%s/%s-ranked-edges.txt"%(out_dir, chemical)
edgelinker_paths_file = "%s/%s-paths.txt"%(out_dir, chemical)
            if not os.path.isfile(edgelinker_output_file) \
                    or not os.path.isfile(edgelinker_paths_file) \
                    or os.path.getsize(edgelinker_paths_file) < 50 \
                    or forced:
run_edgelinker.runEdgeLinker(interactome, rec_tfs_file, "%s/%s"%(out_dir, chemical), max_k)
def compute_stat_sig(scope, num_random_networks, k_to_test, forced=False):
    """ After the random networks have been generated, run the statistical significance
    script to compute p-values for each of the chemicals.
    Example call:
    python src/compute_stat_sig.py --compute-score-pval -n 1 1000 -k 10 -k 25 -k 50 -k 75 -k 100 -k 150 -k 200 -k 500 --out-dir outputs/2017_01-toxcast-signal-family-w0_15-p1_5-c1-no-bioseek/weighted/stats/stat-sig-permute-dir-undir/
    """
cmd = "python src/stat_sig/compute_stat_sig.py " + \
" --chemicals %s/chemicals.txt " % (INPUTSPREFIX) + \
" --compute-score-pval " + \
" --num-random %d %d " % (num_random_networks[0], num_random_networks[1]) + \
" -k %s " % (' -k '.join([str(k) for k in k_to_test])) + \
" --out-dir %s/stats/stat-sig-%s/" % (RESULTSPREFIX, scope)
if forced:
cmd += " --forced "
# TODO add parallelization to compute_stat_sig
t_utils.runCommand(cmd)
def gen_figs(scope, k_to_test, forced=False):
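    """ Generate summary figures for this version (edge weight/cost histograms
    and network summary plots), comparing across versions. """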
# TODO check to make sure the necessary files (such as the statistical significance) have been generated first
commands_to_run = []
# create some plots comparing the p-values to some other statistics
cmd = "python src/plot/network/edge-weight-and-cost-hist.py " + \
" --version %s " % (VERSION) + \
" --compare-versions "
commands_to_run.append(cmd)
cmd = "python src/plot/version_plots/network_summary_plots.py " + \
" --version %s " % (VERSION) + \
" --compare-versions "
commands_to_run.append(cmd)
# cmd = "python src/plot/version_plots/k-type-hist.py " + \
# " --version %s " % (VERSION) + \
# " --compare-versions " + \
# " -k %s " % (' -k '.join([str(k) for k in k_to_test]))
# commands_to_run.append(cmd)
#
# cmd = "python src/plot/version_plots/sig_plots.py " + \
# " --version %s " % (VERSION) + \
# " --compare-versions " + \
# " --scope %s " % (scope)
# commands_to_run.append(cmd)
#
# # TODO fix this script
# cmd = "python src/plot/version_plots/plot_degree.py " + \
# " --version %s " % (VERSION) + \
# " --compare-versions " + \
# " --scope %s " % (scope)
# #commands_to_run.append(cmd)
for cmd in commands_to_run:
if forced:
cmd += " --forced "
print("\n")
print("-"*40)
t_utils.runCommand(cmd)
def write_overlap(chemicals_file, sig_pref="all", forced=False):
    """ Run the overlap script to write and plot how often each protein and edge
    appears across the response networks of the given set of chemicals. """
command = "python src/plot/overlap/plot_overlap_heatmap.py " + \
" --chemicals %s " % (chemicals_file) + \
" --out-dir %s/stats/overlap/%s/ " % (RESULTSPREFIX, sig_pref) + \
" --paths-dir %s/edgelinker/ " % (RESULTSPREFIX) + \
" --write-overlap " + \
" --plot-overlap " + \
" --plot-hits-overlap "
#" --inputs-dir %s " % (INPUTSPREFIX)
# " --overlap-pvals "
out_file = "%s/stats/overlap/%s/overlapping-proteins.txt" % (RESULTSPREFIX, sig_pref)
if forced is True or not os.path.isfile(out_file):
t_utils.runCommand(command)
else:
print("\t%s already exists. Use --forcesig to overwrite it" % (out_file))
# also run it with the family nodes split
command += " --split-family-nodes "
out_file = "%s/stats/overlap/%s/split-overlapping-proteins.txt" % (RESULTSPREFIX, sig_pref)
if forced is True or not os.path.isfile(out_file):
t_utils.runCommand(command)
else:
print("\t%s already exists. Use --forcesig to overwrite it" % (out_file))
def post_overlap_graph_to_graphspace(interactome, overlap_cutoff, num_chemicals, num_sig_chemicals=None, sig_pref="all", forced=False):
"""
Post the graph containing the nodes and edges common to > overlap_cutoff of the networks
*chemicals_file*:
*overlap_cutoff*: Number between 0 and 1. Generally 0.5
*sig_pref*: scope-sig_cutoff_type-sig_cutoff. Used to get the overlap of the significant networks. Default "all" is the overlap of all networks
*num_sig*: number of significant chemicals
"""
if sig_pref == "all":
chemicals_file = "%s/chemicals.txt" % (INPUTSPREFIX)
graph_name = "overlap-graph-%s-0_5-color-%s" % (num_chemicals, VERSION)
params = [(chemicals_file, sig_pref, graph_name)]
else:
# also post the non-significant overlap graph
sig_graph_name = "overlap-graph-sig%s-0_5-color-%s" % (num_sig_chemicals, VERSION)
nonsig_graph_name = "overlap-graph-nonsig%s-0_5-color-%s" % (num_chemicals-num_sig_chemicals, VERSION)
params = [("%s/sig-chemicals.txt"%(INPUTSPREFIX), sig_pref, sig_graph_name),
("%s/nonsig-chemicals.txt"%(INPUTSPREFIX), "nonsig-"+sig_pref, nonsig_graph_name)]
ev_version = t_utils.get_ev_version(VERSION)
# TODO add this file to toxcast_settings.py(?)
ctd_support_file = "/data/jeff-law/data/CTD/CTD_chem_gene_ixns.tsv"
for chemicals_file, sig_pref, graph_name in params:
node_overlap_file = "%s/stats/overlap/%s/overlapping-proteins.txt" % (RESULTSPREFIX, sig_pref)
edge_overlap_file = "%s/stats/overlap/%s/overlapping-edges.txt" % (RESULTSPREFIX, sig_pref)
# TODO update the evidence version
command = "python src/graphspace/post_to_graphspace_overlap.py " + \
" --ppi %s " % (interactome) + \
" --version %s" % (ev_version)+ \
" --sourcetarget %s/rec-tfs/all-rec-tfs.txt" % (INPUTSPREFIX) + \
" --edge-overlap %s" % (edge_overlap_file) + \
" --node-overlap %s" % (node_overlap_file) + \
" --sig-chemicals %s" % (chemicals_file) + \
" --overlap-cutoff %s" % (str(overlap_cutoff)) + \
" --graph-name %s" % (graph_name) + \
" --tag %s --tag overlap-graph --tag %s" % (VERSION, sig_pref) + \
" --ctd-support %s" % (ctd_support_file)
# TODO
# "--group ToxCast2 " + \
if VERSION in t_settings.EVIDENCE_FILES:
evidence_file = t_settings.EVIDENCE_FILES[VERSION]
command += " --ev-file %s" % (evidence_file)
# This file is written by the script. If it already exists, then skip posting to graphspace
ctd_support_amount = "%s-ctd-support.txt" % (node_overlap_file)
if forced is True or not os.path.isfile(ctd_support_amount):
t_utils.runCommand(command)
else:
print("\t%s already exists. Use --forcesig to post the overlap graph to graphspace anyway" % (ctd_support_amount))
def write_altered_interactome(chemical, interactome, altered_interactome, increase_weight, decrease_weight):
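    """ Write a copy of the interactome with edge weights altered for this chemical:
    if either endpoint of an edge is a non-hit protein, the weight is scaled by
    decrease_weight; otherwise, if either endpoint is a hit (that is not a rec or tf),
    the weight is scaled by increase_weight and capped at 0.9.
    Requires the global CHEMICAL_MAP to be loaded first. """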
out = open(altered_interactome, 'w')
hits = []
non_hits = []
for acc in CHEMICAL_MAP.chemicals[chemical]['accs']:
if acc not in CHEMICAL_MAP.chemicals[chemical]['rec'] and acc not in CHEMICAL_MAP.chemicals[chemical]['tfs']:
if CHEMICAL_MAP.chemicals[chemical]['accs'][acc]['hit'] > 0:
hits.append(acc)
elif CHEMICAL_MAP.chemicals[chemical]['accs'][acc]['hit'] == 0:
non_hits.append(acc)
with open(interactome, 'r') as edges:
for line in edges:
if line[0] == '#':
out.write(line)
else:
edge = line.rstrip().split('\t')
u = edge[0]
v = edge[1]
w = float(edge[2])
evidence = edge[3]
if u in non_hits or v in non_hits:
w = w * decrease_weight
elif u in hits or v in hits:
w = w * increase_weight
if w > 0.9:
w = 0.9
out_line = '\t'.join([u,v,str(w),evidence]) + '\n'
out.write(out_line)
out.close()
def runPathlinker(interactome, rec_tfs_file, k, output_prefix, log_file='', ids='', cost_change=''):
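    """ Run PathLinker on the given interactome with the receptors and TFs in
    *rec_tfs_file* as the sources and targets, writing the top *k* paths and
    ranked edges to files starting with *output_prefix*. """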
#sys.stdout.write("Runnin pathlinker on %s\n"%rec_tfs_file)
sys.stdout.write("Runnin pathlinker on %s\n"%output_prefix)
command = "python /home/jeffl/git-workspace/PathLinker/PathLinker.py %s %s -k %s -o %s --allow-mult-targets --allow-mult-sources --write-paths "%(interactome, rec_tfs_file, k, output_prefix)
if ids and cost_change:
command += "--tested-assays %s --increase-weight %0.1f --decrease-weight %0.1f"%(ids, 1+cost_change, 1-cost_change)
if log_file:
command += "> %s 2>&1"%(log_file)
t_utils.runCommand(command, error_message="ERROR: Pathlinker failed on %s!\n"%rec_tfs_file)
def edgelinkerPermuteNetworks(interactome, scope, num_random_networks, max_k, k_to_test, forcealg,
parallel_processes, super_computer, num_iterations=10,
printonly=False, cleanup=False, single_chem=[], send_email=None):
"""
*num_iterations*: # edge swaps to perform (num_iterations * # edges in interactome)
"""
out_dir = "%s/random-%s" %(RESULTSPREFIX, scope)
t_utils.checkDir(out_dir)
orig_interactome = interactome
# just use the full interactome for now
if VERSION in ["netpath-pathlinker-signaling-children-reg", "kegg-pathlinker-signaling-children-reg"]:
# TODO use each pathway's interactome?
interactome = INTERACTOMES["2015pathlinker"]
# directory and file template for the stats of the generated random networks
stats_dir = "%s/stats/stat-sig-%s" % (RESULTSPREFIX, scope)
rand_scores_k = "%s/rand-networks/rand-%%d-med-scores-k.txt" % (stats_dir)
# check to make sure the chemical-k-median-scores.txt file used for summarizing the random networks exists
# UPDATE 2017-01-04: use the median path score of the k+ties paths rather than the score of the kth path
chemical_k_scores = "%s/chemical-k-median-scores.txt" % (stats_dir)
# modify the paths a bit if this is running on baobab
if super_computer:
# TODO if the interactome has already been copied, don't need to copy it again
# for now, just use the same chemical_k_scores
if forcealg or not os.path.isfile(chemical_k_scores):
# Copy the interactome to the baobab compute nodes
t_utils.copyToBaobabNodes(interactome)
else:
print("Assuming interactome already copied to baobab compute nodes: %s\n\tIf not, use --forcealg to ensure it gets copied" % (interactome))
t_utils.checkDir("%s/qsub_files" % out_dir)
# keep the log files on /data
qsub_out_dir = os.path.abspath(out_dir)
# if cleanup is specified, write the results to the baobab nodes, and then delete them there
# otherwise the results will be written to their original location
if cleanup:
# TODO this is a temporary fix for writing output to baobab nodes
# 2016-11-15: move the file-writing and reading to the /localdisk partition on the baobab nodes (baobab-1, baobab-2, etc)
# all of the reading and writing is slowing down /data (holmes) a lot
out_dir = re.sub("^/data", "/localdisk", os.path.abspath(out_dir))
interactome = re.sub("^/data", "/localdisk", os.path.abspath(interactome))
else:
print("\nTip: Use the --cleanup to write the files to individual baobab nodes rather than /data (holmes) offering a big speedup")
# check to make sure the chemical-k-median-scores.txt file used for summarizing the random networks exists
if forcealg or not os.path.isfile(chemical_k_scores):
# Run the command to write the file if it does not exist
# Whatever k are specified here will be used for all of the random networks (by parsing the header line of the file)
# the user should select which k the significance values will be computed at
# I am now using the seven values -k 25 -k 50 -k 75 -k 100 -k 150 -k 200 -k 500
print("%s does not exist. Running compute_stat_sig.py with the --write-counts option to write it." % (chemical_k_scores))
cmd = "python src/stat_sig/compute_stat_sig.py " + \
" --chemicals %s/chemicals.txt " % (INPUTSPREFIX) + \
" --k-limit %d " % (max_k) + \
" --group-by-prob " + \
" --write-counts " + \
" --paths-dir %s/edgelinker " % (RESULTSPREFIX) + \
" --out-dir %s " % (stats_dir) + \
" -k %s " % (' -k '.join([str(k) for k in k_to_test]))
#" -k 20 -k 50 -k 200 -k 500 "
t_utils.runCommand(cmd)
# command for permute
command_template = "time python src/permute/permute_and_edgelinker.py \
--version %s \
--interactome %s \
--inputs-dir %s \
--out-dir %s --k %d \
--num-iterations %d "\
% (VERSION, interactome, INPUTSPREFIX, out_dir, max_k, num_iterations)
command_template += " --write-score-counts %s " % (stats_dir)
if scope == 'permute-dir':
# if no options are specified, swapping will be all directed
pass
if scope == 'permute-undir':
command_template += " --undirected "
if scope == 'permute-dir-undir' or scope == 'permute-dir-undir-wb':
command_template += " --swap-phys-sig-sep "
if scope == 'permute-dir-undir-wb':
command_template += " --split-by-weight 0.6 "
if scope == 'local':
all_rec_tfs_file = "%s/rec-tfs/all-rec-tfs.txt" % (INPUTSPREFIX)
command_template += " --permute-rec-tfs %s " % (all_rec_tfs_file)
if scope == 'global':
all_rec_tfs_file = "%s/random-%s/all-rec-tfs.txt" % (INPUTSPREFIX, scope)
global CHEMICAL_MAP
# load the chemicals map
if CHEMICAL_MAP is None:
CHEMICAL_MAP = t_utils.loadChemicalMap(orig_interactome)
# write the set of all human rec and tfs
CHEMICAL_MAP.write_global_rec_tfs(
"%s/inputs/pathlinker-data/human-rec-tfs.txt" % (os.getcwd()),
all_rec_tfs_file)
command_template += " --permute-rec-tfs %s " % (all_rec_tfs_file)
if forcealg:
command_template += " --forced "
if cleanup:
command_template += " --cleanup "
if single_chem is not None:
command_template += " ".join([" --single-chem %s" % (chem) for chem in single_chem])
# split up the jobs to run by the number of parallel processes
# for example, running 1 to 100 jobs with 24 cores would be range(1,101,24) which returns [1, 25, 49, 73, 97]
# 2016-11-17 Rather than loop through every set of indexes, just check if the desired output file exists or not
jobs_to_run = []
for i in range(num_random_networks[0], num_random_networks[1]+1):
# if the stats for this index do not already exist, add this index to the job list
if forcealg or not os.path.isfile(rand_scores_k % i):
jobs_to_run.append(i)
    # parallel_processes*jobs_in_one_qsub jobs will be put in a single qsub script
    # these if/else statements attempt to keep the number of jobs submitted to baobab to a minimum
    # while also using the resources effectively
    if len(jobs_to_run) > 3000:
        # there are only 6 baobab nodes, so if we had fewer than 600 jobs,
        # we would not be using the resources effectively
        jobs_in_one_qsub = 50
    elif len(jobs_to_run) > 600:
        # if parallel_processes is 10, this puts 10 permute_and_edgelinker.py jobs, each with 10 random networks to make, in one qsub,
        # effectively putting 100 runs in one qsub script
        jobs_in_one_qsub = 10
    elif len(jobs_to_run) > 300:
        # up to 50 runs in one qsub script
        jobs_in_one_qsub = 5
    else:
        # up to 10 runs in one qsub script
        jobs_in_one_qsub = 1
# first source the virtual environment
# I'm not sure how to get this path automatically, so I'll just hard code it for now
virtual_env = "source /data/jeff-law/tools/anaconda3/bin/activate toxcast"
# maybe I could just copy the entire current path(?)
jobs = []
if super_computer:
jobs = [virtual_env]
start_index = 0
for i in range(0, len(jobs_to_run), parallel_processes):
# add the indexes of the jobs to run to the template
end_index = i+parallel_processes if i+parallel_processes < len(jobs_to_run) else len(jobs_to_run)
jobs.append(command_template + ''.join([" -r %d"%jobs_to_run[index] for index in range(i, end_index)]))
if send_email and i+parallel_processes >= len(jobs_to_run):
# if this is the last job, send an email indicating everything is finished
# TODO also send an email if something fails?
jobs.append(t_utils.setupEmailCommand(send_email, subject="ToxCast Permute Networks",
message="Finished `date`! Version: %s, scope: %s" % (VERSION, scope),
baobab=super_computer))
        # trick to group more jobs into the same qsub script:
        # if this is not the last job and the current command has fewer jobs than we want, keep adding on to it
        if len(jobs) < jobs_in_one_qsub and end_index < len(jobs_to_run):
pass
else:
# skip running if printonly is specified
if printonly:
print(jobs)
continue
if super_computer:
# run it on the super computer!
qsub_pref = "%s/qsub_files/%d-%d" % (qsub_out_dir, jobs_to_run[start_index], jobs_to_run[end_index-1])
#submit_jobs_super_computer([command], qsub_pref, ppn=parallel_processes)
# 2016-12-01: Right now I'm running 10 jobs on a single node with 24 cores because
# running more than 12 at the same time started causing jobs to quit with a -9 exit code (memory limit exceeded)
submit_jobs_super_computer(jobs, qsub_pref, ppn=24)
else:
# run the command locally. It will start child processes equal to the specified number of parallel processes
# TODO add local parallelization
for job in jobs:
t_utils.runCommand(job)
#subprocess.check_call(job.split())
# start the list of jobs over
jobs = []
if super_computer:
jobs = [virtual_env]
# the new start index is the current end_index
start_index = end_index
def submit_jobs_super_computer(jobs, qsub_pref, job_template='', nodes=1, ppn=24, walltime='15:00:00'):
""" Function to write and submit a qsub bash script to the baobab cluster.
Should only be run on the baobab cluster.
This wiki page has the info for getting setup on baobab: http://wiki.cs.vt.edu/wiki/Cbb/Baobab
*jobs* a list or set of commands/jobs that would normally be called with subprocess.check_output
*qsub_pref* prefix to qsub file and logs (-out.txt and -err.txt)
*job_template* a template of a job that has bash variables in it to be written to a loop. jobs should then be a list of (string) numbers or something
*nodes* = total number of nodes you need
*ppn* = processors per node that you will need
*walltime* = amount of time your job will be allowed before being forcefully removed. 'HH:MM:SS'
"""
qsub_file = "%s-qsub.sh" % qsub_pref
name = qsub_pref.split('/')[-1] + '-' + VERSION
# start the qsub file
with open(qsub_file, 'w') as out:
out.write('#PBS -l nodes=%d:ppn=%d,walltime=%s\n' % (nodes, ppn, walltime))
# set the job name
out.write('#PBS -N %s\n' % (name))
out.write('#PBS -o %s-out.txt\n' % (qsub_pref))
out.write('#PBS -e %s-err.txt\n' % (qsub_pref))
out.write('#####################################################################################\n')
# edgelinker only runs in the directory the src code is in for some reason
out.write('cd %s\n' % (os.getcwd()))
# make sure the class path is set for the java command line parser
out.write('export CLASSPATH="/home/jeffl/git-workspace/EdgeLinker-fork/commons-cli-1.3.1.jar:$CLASSPATH"\n')
out.write('echo "Job Started at: `date`"\n')
if not job_template:
# write the jobs, as well as an echo (print) in a bash for loop to the qsub file
out.write('\n'.join(['echo "%s"\n%s' % (cmd, cmd) for cmd in jobs]) + '\n')
else:
for_loop = 'COUNTER="%s"\n' % (' '.join(jobs))
for_loop += 'for i in $COUNTER; do\n'
for_loop += '\techo %s\n' % (job_template)
for_loop += '\t%s\n' % (job_template)
for_loop += 'done\n'
out.write(for_loop)
out.write('echo "Job Ended at: `date`"\n')
# TODO some kind of email or notification if any of the jobs failed
#command = "qsub -v interactome=%s,rec_tfs_file=%s,output_prefix=%s,max_k=%s src/run_edgelinker_baobab.sh" % (interactome, rec_tfs_file, out_dir+"/%d-random"%i, max_k)
cmd = "qsub %s" % (qsub_file)
# run the qsub command from the system
# if this is on baobab, then run as normal
if 'baobab' in socket.gethostname():
t_utils.runCommand(cmd)
else:
# otherwise run over ssh
# have to give the path to the executable because the PATH variable isn't loaded when running over ssh
t_utils.runCommandOnBaobab(cmd)
def runCommandParallel(processes, max_processes, command, hide_output=False):
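    """ Start *command* in the background and add it to the *processes* set.
    If *max_processes* or more are running, wait for one to finish before returning.
    Returns the updated set of running processes. """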
# TODO Popen doesn't let you check the success/failure of the command.
# How do you know if it was successful or not?
if hide_output:
processes.add(subprocess.Popen(command.split(), stdout=open('/dev/null', 'w'), stderr=open('/dev/null', 'w')))
else:
processes.add(subprocess.Popen(command.split()))
#print(len(processes), max_processes)
if len(processes) >= max_processes:
os.wait()
processes.difference_update([
p for p in processes if p.poll() is not None])
return processes
def parseArguments(args):
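    """ Build the option parser for the master script, parse *args*, validate
    the --version and --scope options, and return the parsed options. """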
    usage = 'masterscript.py [options]\n'
parser = OptionParser(usage=usage)
## Common options
parser.add_option('','--version',type='string', action='append',
help="Version of the PPI to run. Can specify multiple versions and they will run one after the other. Options are: %s." % (', '.join(t_settings.ALLOWEDVERSIONS)))
parser.add_option('','--response-networks',action='store_true', default=False,
help='Generate the response networks by running edgelinker with the set of perturbed receptors and tfs for each chemical. Use --pathlinker to run pathlinker instead')
parser.add_option('-S','--single-run', type='string', action='append',
help='Run only a single chemical. Can specify multiple chemicals by listing this option multiple times.\n\tUse "file" to read the list of chemicals from the inputs/version/single_chem.txt file')
parser.add_option('','--forceinputs',action='store_true', default=False,
help='Force overwriting the parsed input files')
group = OptionGroup(parser,'Algorithms')
group.add_option('-P','--pathlinker',action='store_true', default=False,
help='Run PathLinker instead of edgelinker on the set of chemicals using perturbed receptors and tfs')
#group.add_option('','--k',type='int',default='200',
# help='k to use when running pathlinker\n\tDefault = 200')
group.add_option('','--edgelinker-k',type='int',default='1000',
help='maximum number of paths to keep when running edgelinker. Useful to not generate too much data (~2TB if all paths of each random network are kept)\n\tDefault = 1000')
group.add_option('-f','--forcealg',action='store_true', default=False,
help='Force the algorithm to run even if the output is there already. Also works for forcing random network generation')
parser.add_option_group(group)
# group = OptionGroup(parser,'GraphSpace')
# group.add_option('-g','--graphspace',action='store_true', default=False,
# help='post response networks to graphspace')
# group.add_option('','--k-to-post', type='int', default=200,
# help='Value of k to test for significance. Multiple k values can be given.')
# group.add_option('','--forcepost',action='store_true', default=False,
# help='Force the network to be posted to graphspace even if json file already exists')
# group.add_option('-p','--postfix',action='store',
# help='add the postfix to the end of the chemical name to compare different options')
# group.add_option('', '--revigo-colors', action='store_true', default=False,
# help="add colors to the nodes according to which REVIGO function/goterm it belongs to. GO terms must be manually selected based on REVIGO results")
# parser.add_option_group(group)
group = OptionGroup(parser, 'Statistics')
group.add_option('','--stat-sig', action='store_true',
help='Compute the statistical significance for given k values (specified by --k-to-test).\n\tUses the --scope and --num-random-networks options')
group.add_option('','--sig-cutoff', type='float',
help='Significance (p-value) cutoff for selecting chemicals. Uses the --scope option.')
group.add_option('','--sig-cutoff-type', type='string', default='FDR',
        help='Type of significance value to use. Default="FDR".\n\tValid options: "BF" (Bonferroni-corrected), "FDR" (False discovery rate q-value)')
group.add_option('','--overlap', action='store_true', default=False,
help='Write the # of times each protein is found in the response networks. Can be used with --sig-cutoff to get the overlap of the significant networks')
group.add_option('','--overlap-graph', action='store', type="float",
help='Post the graph containing the nodes and edges common to > overlap_graph (cutoff) of the networks.' +
'Can be used with --sig-cutoff to get the overlap of the significant networks')
group.add_option('','--gen-figs',action='store_true', default=False,
help='Generate figures about various aspects of the networks including edge types, weights, statistical significance, and more')
group.add_option('','--forcesig',action='store_true', default=False,
help='Force the scripts to compute statistical significance even if the output is there already')
parser.add_option_group(group)
group = OptionGroup(parser,'Random Network Generation')
#group.add_option('-R','--permute_network',action='store_true',
# help='Run edgelinker on a permuted network using the receptors and tfs for each chemical. Will be used to compute statistical significance of networks')
group.add_option('-r','--random-networks',action='store_true', default=False,
help='Run pathlinker on random networks or random sets of receptors and tfs for each chemical. Will be used to compute statistical significance of networks')
group.add_option('','--scope',action='append',
help="type of permutation test to run. options are: 'local', 'global' and 'permute'." +
"\n\t'local' is the set of assayed rec and tfs. \n\t'global' is the entire set of human rec and tfs. " +
"\n\t'permute-undir' will run edgelinker on a permuted network where all edges are undirected using the receptors and tfs for each chemical." +
"\n\t'permute-dir' will run edgelinker on a permuted network where all edges are directed using the receptors and tfs for each chemical." +
"\n\t'permute-dir-undir' will permute the signaling and physical interactions (directed and undirected edges) separately." +
"\n\t'permute-dir-undir-wb' will also permute the edges above and below a weight of 0.6 separately (wb = weighted bins).")
group.add_option('-k','--k-to-test', type='int', action='append',
help='Value of k to test for significance. Multiple k values can be given. Suggested: 25, 50, 75, 100, 150, 200, 500')
group.add_option('-n','--num-random-networks',type='int',default=(1,100),nargs=2,
help="# of random networks to generate and run pathlinker on. Specify 2 integers (separated by a space): start_index end_index. \n\tDefault = 1 100")
#group.add_option('','--chemical-range',action='store', type='int', nargs=2,
# help='range of chemicals to run the random sets on. Requires two integers. For example: --chemical-range 100 200. Reverse list also supported (i.e. 200 100). Useful for generating random networks in parallel.')
group.add_option('','--parallel-processes',action='store', type='int', default=1,
help='# of processes to run in the background simultaneously. Default = 1')
group.add_option('','--super-computer',action='store_true', default=False,
help='Generate random networks using the baobab cluster. Can only be run on a lab computer or from baobab (ssh baobab.cbb.lan)')
group.add_option('','--cleanup',action='store_true', default=False,
        help='Delete the intermediate edgelinker output files that are generated. Also writes the files to individual baobab nodes rather than /data (holmes), offering a big speedup (holmes gets really slow with all of the I/O). \n\tCurrently only implemented for random network generation')
# Was used for the 'local' and 'global' settings when random sets of receptors and tfs were chosen for each chemical
group.add_option('','--group-num-rectfs',action='store_true', default=False,
help='For the \'local\' and \'global\' scopes, group the random network generation by the # of rec and tfs ' + \
'rather than random networks for each chemical. Cuts data generation ~1/2 for 464 chemicals.')
group.add_option('','--send-email',action='store', type='string',
help='Send an email (to specified recipient, from [email protected]) after the random networks have finished generating')
parser.add_option_group(group)
group = OptionGroup(parser, 'Other Options')
group.add_option('','--printonly',action='store_true', default=False,
help='Just print the commands that would be run. Currently only implemented for random network generation')
parser.add_option_group(group)
# previous options
# parser.add_option('-o','--out-dir',action='store', default="results/chemicals/pathlinker-outputs",
# help='output dir to store pathlinker results for each chemical')
# parser.add_option('','--change-weight',action='append', type='float',
# help='Change the weight of edges out of and into acc by this amount')
# parser.add_option('','--overlap',action='store_true', \
# help='compute the pval of overlap of chemical A to B')
# parser.add_option('','--compute-prec-rec',action='store_true', \
# help='Compute the precision recall values for the chemicals')
#parser.add_option('','--write-paths',action='store_true',default=False, help='')
# parse the command line arguments
(opts, args) = parser.parse_args()
#if not opts.chemicals and not opts.random_networks and not opts.graphspace and not opts.overlap:
# print("Must specify to run pathlinker on the set of chemicals or the random set of receptors")
# sys.exit(1)
#if opts.chemicals and not (opts.pathlinker or opts.edgelinker):
    if not opts.version:
        print("ERROR: must specify at least one --version")
        sys.exit(1)
    for version in opts.version:
if version not in t_settings.ALLOWEDVERSIONS:
print("ERROR: version '%s' not an allowed version. Options are: %s." % (version, ', '.join(t_settings.ALLOWEDVERSIONS)))
            sys.exit(1)
if opts.scope is not None:
for scope in opts.scope:
if scope not in ['local', 'global', 'permute-dir', 'permute-dir-undir', 'permute-dir-undir-wb', 'permute-undir']:
print("ERROR: %s not a valid option for scope. Valid options are: %s" %(scope, ', '.join(['local', 'global', 'permute-dir', 'permute-dir-undir', 'permute-dir-undir-wb', 'permute-undir'])))
sys.exit(1)
# This could be fine if the user wanted to generate them in reverse order (i.e. 464 -> 1) to try to eliminate problems of running edgelinker on the same chemicals
# if opts.chemical_range:
# if opts.chemical_range[0] > opts.chemical_range[1] or opts.chemical_range[0] < 0:
# print("ERROR: chemical range is invalid. First number must be < second number. Given: %d, %d"%(opts.chemical_range[0],opts.chemical_range[1]))
# sys.exit(1)
return opts
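# Example invocations (the version name here is a placeholder; see
# t_settings.ALLOWEDVERSIONS in src/toxcast_settings.py for the real options):
#   python masterscript.py --version <version> --response-networks
#   python masterscript.py --version <version> --random-networks --scope permute-dir-undir -n 1 100
#   python masterscript.py --version <version> --stat-sig --scope permute-dir-undir -k 150 --sig-cutoff 0.05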
if __name__ == '__main__':
main(sys.argv)