-
Notifications
You must be signed in to change notification settings - Fork 5
/
run.py
128 lines (100 loc) · 5.01 KB
/
run.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#!/usr/bin/env python
# encoding: utf-8
'''
Created by Joan Smith
on 2018-9-17.
Copyright (c) 2018. All rights reserved.
'''
import argparse
import urllib
import os
import sys
import pandas as pd
sys.path.append('common')
from common import utilities as util
from mutation_analysis import zscores_for_mutants
from copy_number import process_copy_numbers_to_genes
from copy_number import zscores_for_copy_number
# Cancer-type codes. Each code is combined with a fixed suffix to build the
# per-type clinical, mutation and copy-number file names used below.
CANCER_TYPES = ['BLCA', 'BRCA', 'COADREAD', 'GBMLGG', 'HNSC', 'KIPAN', 'LIHC', 'LUAD', 'LUSC',
                'OV', 'PAAD', 'PRAD', 'SARC', 'SKCM', 'STES', 'UCEC']
# Base URL that every remote file path is appended to when downloading.
BUCKET_URL = 'http://storage.googleapis.com/public-smith-sheltzer-cancer-analysis/'
# Appended to a cancer-type code to form that type's copy-number file name.
COPY_NUMBER_SUFFIX = '.snp__genome_wide_snp_6__broad_mit_edu__Level_3__segmented_scna_minus_germline_cnv_hg19__seg.seg.txt'
def get_options():
    """Parse the command-line options of this script.

    Reads ``sys.argv`` through ``argparse``.

    Returns:
        A ``(parallel_workers, output_directory)`` tuple. ``parallel_workers``
        is the integer given with ``-p`` (0 when omitted) and
        ``output_directory`` is the string given with ``-o`` ('.' when
        omitted).
    """
    parser = argparse.ArgumentParser(description='Download data and run analysis')
    parser.add_argument(
        '-p', action='store', dest='parallel_workers', type=int, default=0,
        help='number of parallel workers handed to each analysis step '
             '(default: %(default)s)')
    parser.add_argument(
        '-o', action='store', dest='output_directory', default='.',
        help='directory that receives the downloaded input-data/ tree and '
             'the result directories (default: %(default)s)')
    ns = parser.parse_args()
    return (ns.parallel_workers, ns.output_directory)
def maybe_create_dir(d):
    """Create directory ``d`` (and any missing parents) if it does not exist.

    An already-existing directory is not an error. Any other failure, such as
    a permission problem or ``d`` naming a regular file, is re-raised so that
    it is reported here rather than surfacing later as a confusing write
    error.

    Raises:
        OSError: if ``d`` could not be created and is not an existing
            directory.
    """
    try:
        os.makedirs(d)
    except OSError:
        # Only the "directory is already there" case is safe to ignore.
        if not os.path.isdir(d):
            raise
def download_data(output_directory):
    """Download every input file into ``<output_directory>/input-data``.

    For each entry of ``CANCER_TYPES`` the clinical, copy-number and mutation
    files are fetched from ``BUCKET_URL`` into the ``TCGA-clinical``,
    ``copy-number-data`` and ``mutation-data`` sub-directories respectively.
    The gene annotation CSV is fetched once, directly into ``input-data``.

    Args:
        output_directory: root directory under which ``input-data`` is
            created.
    """
    # urlretrieve lives in ``urllib`` on Python 2 and ``urllib.request`` on
    # Python 3; resolve it here so the function works on either interpreter.
    try:
        from urllib import urlretrieve
    except ImportError:
        from urllib.request import urlretrieve

    input_dir = os.path.join(output_directory, 'input-data')
    for subdir in ('TCGA-clinical', 'mutation-data', 'copy-number-data'):
        maybe_create_dir(os.path.join(input_dir, subdir))

    for c in CANCER_TYPES:
        print('Downloading %s data...' % c)
        # (remote folder, local folder, file name); note that the clinical
        # files use a different folder name remotely than locally.
        downloads = (
            ('TCGA-clinical-files', 'TCGA-clinical', c + '.clin.merged.txt'),
            ('copy-number-data', 'copy-number-data', c + COPY_NUMBER_SUFFIX),
            ('mutation-data', 'mutation-data', c + '.txt'),
        )
        for remote_dir, local_dir, file_name in downloads:
            urlretrieve(BUCKET_URL + remote_dir + '/' + file_name,
                        os.path.join(input_dir, local_dir, file_name))

    annotation_file = 'HUGO_Gene_Nomenclature_Committee_Annotations.csv'
    urlretrieve(BUCKET_URL + annotation_file,
                os.path.join(input_dir, annotation_file))
def single_file_errors(cancer_type, file_path, fkind):
    """Report whether one downloaded tab-separated file looks unusable.

    The first five data rows of ``file_path`` are read with pandas. The file
    is considered bad when it cannot be opened, when pandas cannot parse it
    (for example a zero-byte download), or when fewer than five rows are
    available. A message naming the problem is printed for a bad file.

    Args:
        cancer_type: label printed at the start of the message.
        file_path: path of the file to inspect.
        fkind: description of the file kind, printed in the message.

    Returns:
        True if a problem was found, False if the file passed the check.
    """
    try:
        head = pd.read_csv(file_path, nrows=5, sep='\t')
    except IOError:
        print('%s %s file was not found at %s' % (cancer_type, fkind, file_path))
        return True
    except ValueError:
        # pandas signals empty or malformed input with EmptyDataError /
        # ParserError, both ValueError subclasses; treat that as a bad
        # download instead of letting the whole check crash.
        print('%s %s file not downloaded correctly at %s' % (cancer_type, fkind, file_path))
        return True
    if len(head) != 5:
        print('%s %s file not downloaded correctly at %s' % (cancer_type, fkind, file_path))
        return True
    return False
def check_data(input_directory):
    """Verify the downloaded files of every cancer type.

    For each entry of ``CANCER_TYPES`` the clinical, mutation and copy-number
    files under ``input_directory`` are passed to ``single_file_errors``.
    Every file is inspected; if any check reports a problem the process exits
    with status 1 after all of them have run.

    Args:
        input_directory: directory holding the ``TCGA-clinical``,
            ``mutation-data`` and ``copy-number-data`` sub-directories.
    """
    print('Checking Downloads...')
    found_problem = False
    for cancer in CANCER_TYPES:
        expected = (
            ('TCGA-clinical', cancer + '.clin.merged.txt', 'clinical'),
            ('mutation-data', cancer + '.txt', 'mutation data'),
            ('copy-number-data', cancer + COPY_NUMBER_SUFFIX, 'copy number data'),
        )
        for folder, file_name, kind in expected:
            path = os.path.join(input_directory, folder, file_name)
            # The check is evaluated first so it always runs, even once an
            # earlier file has already failed.
            found_problem = single_file_errors(cancer, path, kind) or found_problem
    if found_problem:
        print('File download errors, please fix and rerun')
        sys.exit(1)
    print('Downloads OK...')
def run_univariate_mutation(parallel_workers, output_directory):
    """Run the mutation z-score step over the downloaded data.

    Creates ``<output_directory>/mutation-zscores`` if needed, then calls
    ``zscores_for_mutants.all_cancer_types`` with the mutation-data
    directory, the clinical directory and that output directory.

    Args:
        parallel_workers: forwarded as the ``parallel_workers`` keyword.
        output_directory: root directory containing ``input-data``.
    """
    input_root = os.path.join(output_directory, 'input-data')
    zscore_dir = os.path.join(output_directory, 'mutation-zscores')
    maybe_create_dir(zscore_dir)
    zscores_for_mutants.all_cancer_types(
        os.path.join(input_root, 'mutation-data'),
        os.path.join(input_root, 'TCGA-clinical'),
        zscore_dir,
        parallel_workers=parallel_workers)
def run_univariate_copy_number(parallel_workers, output_directory):
    """Run the two copy-number steps over the downloaded data.

    Creates the ``copy-number-by-gene`` and ``copy-number-zscores``
    directories under ``output_directory`` if needed. It then calls
    ``process_copy_numbers_to_genes.all_cancer_types`` (copy-number data and
    annotation CSV in, ``copy-number-by-gene`` out) followed by
    ``zscores_for_copy_number.all_cancer_types`` (``copy-number-by-gene`` and
    clinical data in, ``copy-number-zscores`` out).

    Args:
        parallel_workers: forwarded as the ``parallel_workers`` keyword to
            both calls.
        output_directory: root directory containing ``input-data``.
    """
    input_root = os.path.join(output_directory, 'input-data')
    by_gene_dir = os.path.join(output_directory, 'copy-number-by-gene')
    zscore_dir = os.path.join(output_directory, 'copy-number-zscores')
    maybe_create_dir(by_gene_dir)
    maybe_create_dir(zscore_dir)

    process_copy_numbers_to_genes.all_cancer_types(
        os.path.join(input_root, 'copy-number-data'),
        os.path.join(input_root, 'HUGO_Gene_Nomenclature_Committee_Annotations.csv'),
        by_gene_dir,
        parallel_workers=parallel_workers)

    zscores_for_copy_number.all_cancer_types(
        by_gene_dir,
        os.path.join(input_root, 'TCGA-clinical'),
        zscore_dir,
        parallel_workers=parallel_workers)
def main():
    """Run the whole pipeline: parse options, download, check, analyse.

    The steps run in a fixed order: downloads, the download check, the
    mutation analysis and finally the copy-number analysis.
    """
    workers, out_dir = get_options()
    download_data(out_dir)
    input_dir = os.path.join(out_dir, 'input-data')
    check_data(input_dir)
    run_univariate_mutation(workers, out_dir)
    run_univariate_copy_number(workers, out_dir)
# Run the pipeline only when this file is executed as a script, not when it
# is imported.
if __name__ == "__main__":
    main()