From 934294a5a779f2c30fae895829d8b46467507a22 Mon Sep 17 00:00:00 2001 From: thomasyu888 Date: Sat, 26 Jan 2019 19:10:20 -0800 Subject: [PATCH 01/27] Change t_depth to STRING, add in removal of floats for maf processing --- analyses/genomicData/MAFinBED.R | 3 ++- analyses/mergeFlag/mergeCheck.R | 1 + genie/maf.py | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/analyses/genomicData/MAFinBED.R b/analyses/genomicData/MAFinBED.R index 29333068..a72188f2 100644 --- a/analyses/genomicData/MAFinBED.R +++ b/analyses/genomicData/MAFinBED.R @@ -52,7 +52,8 @@ print(nrow(genieMutData)) originalCols = colnames(genieMutData) -# records with count data that preclude a VAF estimate - set VAF to 100% (1/1, alt/depth) +# records with count data that preclude a VAF estimate - set VAF to 100% (1/1, alt/depth) genieMutData$t_depth <- as.numeric(genieMutData$t_depth) +genieMutData$t_depth <- as.numeric(genieMutData$t_depth) noVAF.idx = which((genieMutData$t_depth==0)|is.na(genieMutData$t_depth)) #keeps the order if factors exist #genieMutData$t_alt_count_num = as.numeric(levels(genieMutData$t_alt_count))[genieMutData$t_alt_count] diff --git a/analyses/mergeFlag/mergeCheck.R b/analyses/mergeFlag/mergeCheck.R index 99ecf791..505088ee 100644 --- a/analyses/mergeFlag/mergeCheck.R +++ b/analyses/mergeFlag/mergeCheck.R @@ -127,6 +127,7 @@ for (center in centers) { } # records with count data that preclude a VAF estimate - set VAF to 100% (1/1, alt/depth) + genieMutData$t_depth <- as.numeric(genieMutData$t_depth) noVAF.idx = which((genieMutData$t_depth==0)|is.na(genieMutData$t_depth)) genieMutData$t_alt_count_num = as.numeric(levels(genieMutData$t_alt_count))[genieMutData$t_alt_count] genieMutData$t_alt_count_num[noVAF.idx] = 1 diff --git a/genie/maf.py b/genie/maf.py index de4a549e..e2f714bf 100644 --- a/genie/maf.py +++ b/genie/maf.py @@ -36,6 +36,7 @@ def createFinalMaf(self, mafDf, filePath, maf=False): mafSet = mafDf.to_csv(sep="\t", index=False, header=None) writeOrAppend = "w" if maf else "a" with open(filePath, writeOrAppend) as maf: + mafSet = process_functions.removeStringFloat(mafSet) maf.write(mafSet) #There is a isNarrow option, but note that the number of rows of the maf file From 015d534d6c8429c5a01f43ae5a3df0cbe2da5a1e Mon Sep 17 00:00:00 2001 From: thomasyu888 Date: Sat, 26 Jan 2019 19:19:47 -0800 Subject: [PATCH 02/27] use check_call --- genie/maf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/genie/maf.py b/genie/maf.py index e2f714bf..c9591994 100644 --- a/genie/maf.py +++ b/genie/maf.py @@ -59,7 +59,7 @@ def process_helper(self, filePath, path_to_GENIE, mafSynId, centerMafSynId, narrowMafColumns = [col['name'] for col in self.syn.getTableColumns(mafSynId) if col['name'] != 'inBED'] #Strips out windows indentations \r command = ['dos2unix',filePath] - subprocess.call(command) + subprocess.check_call(command) tempdir = os.path.join(path_to_GENIE, self.center) commandCall = ["perl",os.path.join(vcf2mafPath,"maf2maf.pl"), "--input-maf",filePath, @@ -72,7 +72,7 @@ def process_helper(self, filePath, path_to_GENIE, mafSynId, centerMafSynId, "--custom-enst", os.path.join(vcf2mafPath,"data/isoform_overrides_uniprot")] if reference is not None: commandCall.extend(["--ref-fasta",reference]) - maf = subprocess.call(commandCall) + maf = subprocess.check_call(commandCall) process_functions.rmFiles(tempdir, recursive=False) open(narrowMafPath,"w").close() From f309fbef2cc38257b4336c0fd67c35830c4211df Mon Sep 17 00:00:00 2001 From: thomasyu888 Date: Sat, 26 Jan 2019 
19:25:54 -0800 Subject: [PATCH 03/27] Use subprocess check call --- genie/vcf.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/genie/vcf.py b/genie/vcf.py index f7aca93e..0e1162c7 100644 --- a/genie/vcf.py +++ b/genie/vcf.py @@ -52,16 +52,16 @@ def process_helper(self, vcffiles, path_to_GENIE, mafSynId, centerMafSynId, newVCFPath = os.path.join(centerInputFolder, vcfName) #remove chr from each row command = ["sed", "'s/^chr//'", path, ">", newVCFPath] - subprocess.call(" ".join(command), shell=True) + subprocess.check_call(" ".join(command), shell=True) #Empty spaces must be replaced with a period command = ["sed", '-i', "'s/\t\t/\t.\t/g'", newVCFPath] - subprocess.call(" ".join(command), shell=True) + subprocess.check_call(" ".join(command), shell=True) #All INFO/HGVS values have a whitespace, which is not allowed in VCF specs. Replace that with a comma command = ['sed', '-i', "'s/ p\./,p./'", newVCFPath] - subprocess.call(" ".join(command), shell=True) + subprocess.check_call(" ".join(command), shell=True) #Strips out windows indentations \r command = ['dos2unix',newVCFPath] - subprocess.call(command) + subprocess.check_call(command) vcfCols = ["#CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT"] with open(newVCFPath,"r") as f: for line in f: @@ -107,7 +107,7 @@ def process_helper(self, vcffiles, path_to_GENIE, mafSynId, centerMafSynId, '--custom-enst', os.path.join(vcf2mafPath, 'data/isoform_overrides_uniprot')] if reference is not None: command.extend(["--ref-fasta",reference]) - subprocess.call(command) + subprocess.check_call(command) if (os.path.isfile(newMAFPath)): mafFiles.append(newMAFPath) From 994b4d25ad61961da9b33ebbe898c0282d23b93e Mon Sep 17 00:00:00 2001 From: thomasyu888 Date: Sun, 27 Jan 2019 16:17:50 -0800 Subject: [PATCH 04/27] Fix when site submits both TUMOR and NORMAL column. 
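Patches 02 and 03 swap subprocess.call for subprocess.check_call throughout maf.py and vcf.py. A minimal sketch of the difference (the filename below is hypothetical): call() merely returns the exit status, so a failed dos2unix, sed, or vcf2maf step can go unnoticed, while check_call() raises CalledProcessError on any nonzero exit and halts processing before a half-converted file is used downstream.

import subprocess

# call() reports the exit status, but execution continues on failure
status = subprocess.call(["dos2unix", "example.vcf"])

# check_call() raises on nonzero exit, stopping the pipeline early
try:
    subprocess.check_call(["dos2unix", "example.vcf"])
except subprocess.CalledProcessError as err:
    print("dos2unix failed with exit code", err.returncode)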
Sample id is not TUMOR --- genie/vcf.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/genie/vcf.py b/genie/vcf.py index 0e1162c7..3cb2c1b4 100644 --- a/genie/vcf.py +++ b/genie/vcf.py @@ -74,22 +74,23 @@ def process_helper(self, vcffiles, path_to_GENIE, mafSynId, centerMafSynId, samples = [i for i in cols if i not in vcfCols] - tumorName = vcfName.replace(".vcf","") + #tumorName = vcfName.replace(".vcf","") if len(samples) == 1: tumor = samples[0] normal = "NORMAL" - ### If the tumor name isn't TUMOR, set the sample id to be the tumor name - if tumor != "TUMOR": - tumorName = tumor elif len(samples) == 2: - #Tumor is always first, normal is second + #Assumes that Tumor is always first, normal is second tumor = samples[0] normal = samples[1] - tumorName = tumor else: tumor = "TUMOR" normal = "NORMAL" + # ### If the tumor name isn't TUMOR, set the sample id to be the tumor name + if tumor != "TUMOR": + tumorName = vcfName.replace(".vcf","") + else: + tumorName = tumor newMAFPath = newVCFPath + ".maf" if os.path.isfile(newMAFPath): mafFiles.append(newMAFPath) From e115f3884db224a231c87b346f04d6edbf66d6bb Mon Sep 17 00:00:00 2001 From: thomasyu888 Date: Sun, 27 Jan 2019 16:28:16 -0800 Subject: [PATCH 05/27] Fix logic --- genie/vcf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/genie/vcf.py b/genie/vcf.py index 3cb2c1b4..b98b52ff 100644 --- a/genie/vcf.py +++ b/genie/vcf.py @@ -87,7 +87,7 @@ def process_helper(self, vcffiles, path_to_GENIE, mafSynId, centerMafSynId, tumor = "TUMOR" normal = "NORMAL" # ### If the tumor name isn't TUMOR, set the sample id to be the tumor name - if tumor != "TUMOR": + if tumor == "TUMOR": tumorName = vcfName.replace(".vcf","") else: tumorName = tumor From 187ea2c590adff3f60e8cc122922c20f4e5e76a0 Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Tue, 29 Jan 2019 17:24:57 -0800 Subject: [PATCH 06/27] Write out mafinbed to file then upload --- analyses/genomicData/MAFinBED.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/analyses/genomicData/MAFinBED.R b/analyses/genomicData/MAFinBED.R index a72188f2..6a7eba7c 100644 --- a/analyses/genomicData/MAFinBED.R +++ b/analyses/genomicData/MAFinBED.R @@ -113,5 +113,7 @@ genieMutData$t_alt_count_num <- NULL updateMutData = genieMutData[genieMutData$inBED != oldInBed,] if (nrow(updateMutData) > 0) { - synStore(Table(mafSynId, updateMutData[c("ROW_ID","ROW_VERSION","inBED")])) + write.csv(updateMutData[c("ROW_ID","ROW_VERSION","inBED")],"update_inbed.csv",row.names = F) + #synStore(Table(mafSynId, updateMutData[c("ROW_ID","ROW_VERSION","inBED")])) + synStore(Table(mafSynId, "update_inbed.csv")) } \ No newline at end of file From b2595fa841bfb3c51fbacb94ade3b4e272bb98e1 Mon Sep 17 00:00:00 2001 From: thomasyu888 Date: Thu, 31 Jan 2019 02:41:57 -0800 Subject: [PATCH 07/27] Chunk the upload of maf --- analyses/genomicData/MAFinBED.R | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/analyses/genomicData/MAFinBED.R b/analyses/genomicData/MAFinBED.R index 6a7eba7c..276995c0 100644 --- a/analyses/genomicData/MAFinBED.R +++ b/analyses/genomicData/MAFinBED.R @@ -113,7 +113,18 @@ genieMutData$t_alt_count_num <- NULL updateMutData = genieMutData[genieMutData$inBED != oldInBed,] if (nrow(updateMutData) > 0) { - write.csv(updateMutData[c("ROW_ID","ROW_VERSION","inBED")],"update_inbed.csv",row.names = F) + #write.csv(updateMutData[c("ROW_ID","ROW_VERSION","inBED")],"update_inbed.csv",row.names = F) + #Need to chunk the upload + chunk = 
100000 + rows = 0 + while (rows*chunk < nrow(updateMutData)) { + if ((rows + 1)* chunk > nrow(updateMutData)) { + to_update = updateMutData[((rows*chunk)+1):nrow(updateMutData),c("ROW_ID","ROW_VERSION","inBED")] + } else{ + to_update = updateMutData[((rows*chunk)+1):((rows+1)*chunk),c("ROW_ID","ROW_VERSION","inBED")] + } + rows = rows+1 + } #synStore(Table(mafSynId, updateMutData[c("ROW_ID","ROW_VERSION","inBED")])) - synStore(Table(mafSynId, "update_inbed.csv")) + #synStore(Table(mafSynId, "update_inbed.csv")) } \ No newline at end of file From e6eb8a6dcada1194c19bc295f4f7f9a8e8bd6a0f Mon Sep 17 00:00:00 2001 From: thomasyu888 Date: Thu, 31 Jan 2019 02:44:39 -0800 Subject: [PATCH 08/27] update maf table in chunks --- analyses/genomicData/MAFinBED.R | 1 + 1 file changed, 1 insertion(+) diff --git a/analyses/genomicData/MAFinBED.R b/analyses/genomicData/MAFinBED.R index 276995c0..b5c9530c 100644 --- a/analyses/genomicData/MAFinBED.R +++ b/analyses/genomicData/MAFinBED.R @@ -125,6 +125,7 @@ if (nrow(updateMutData) > 0) { } rows = rows+1 } + synStore(Table(mafSynId, to_update)) #synStore(Table(mafSynId, updateMutData[c("ROW_ID","ROW_VERSION","inBED")])) #synStore(Table(mafSynId, "update_inbed.csv")) } \ No newline at end of file From 96b5a2079c99076a945f38a8e7d42756b5f3861f Mon Sep 17 00:00:00 2001 From: thomasyu888 Date: Thu, 31 Jan 2019 02:45:56 -0800 Subject: [PATCH 09/27] Move synstore inside of while --- analyses/genomicData/MAFinBED.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/analyses/genomicData/MAFinBED.R b/analyses/genomicData/MAFinBED.R index b5c9530c..a970a65b 100644 --- a/analyses/genomicData/MAFinBED.R +++ b/analyses/genomicData/MAFinBED.R @@ -123,9 +123,9 @@ if (nrow(updateMutData) > 0) { } else{ to_update = updateMutData[((rows*chunk)+1):((rows+1)*chunk),c("ROW_ID","ROW_VERSION","inBED")] } + synStore(Table(mafSynId, to_update)) rows = rows+1 } - synStore(Table(mafSynId, to_update)) #synStore(Table(mafSynId, updateMutData[c("ROW_ID","ROW_VERSION","inBED")])) #synStore(Table(mafSynId, "update_inbed.csv")) } \ No newline at end of file From 42c145fe9eb6a7e51b04dbf6b7e8488599ee969b Mon Sep 17 00:00:00 2001 From: thomasyu888 Date: Thu, 31 Jan 2019 12:00:45 -0800 Subject: [PATCH 10/27] Write out mafinbed file instead of using tables --- analyses/genomicData/MAFinBED.R | 44 ++++++++++++++++++--------------- genie/database_to_staging.py | 11 ++++++--- 2 files changed, 31 insertions(+), 24 deletions(-) diff --git a/analyses/genomicData/MAFinBED.R b/analyses/genomicData/MAFinBED.R index a970a65b..c0f6ab48 100644 --- a/analyses/genomicData/MAFinBED.R +++ b/analyses/genomicData/MAFinBED.R @@ -4,8 +4,8 @@ library(synapser) library(VariantAnnotation) args = commandArgs(trailingOnly=TRUE) -if (length(args) != 1) { - stop("Must supply a boolean value") +if (length(args) != 2) { + stop("Must supply a boolean value and a filepath to write variants not in bed") } # SAGE login tryCatch({ @@ -108,24 +108,28 @@ for (panelName in seq_assays) { } genieMutData$t_depth_new <- NULL genieMutData$t_alt_count_num <- NULL +# Commented out below because of PFLM-4975 #Compare old inBED with new inBED column #If there are differences, update only the diffs -updateMutData = genieMutData[genieMutData$inBED != oldInBed,] +# updateMutData = genieMutData[genieMutData$inBED != oldInBed,] +# if (nrow(updateMutData) > 0) { +# #write.csv(updateMutData[c("ROW_ID","ROW_VERSION","inBED")],"update_inbed.csv",row.names = F) +# #Need to chunk the upload -if (nrow(updateMutData) > 0) { - 
#write.csv(updateMutData[c("ROW_ID","ROW_VERSION","inBED")],"update_inbed.csv",row.names = F) - #Need to chunk the upload - chunk = 100000 - rows = 0 - while (rows*chunk < nrow(updateMutData)) { - if ((rows + 1)* chunk > nrow(updateMutData)) { - to_update = updateMutData[((rows*chunk)+1):nrow(updateMutData),c("ROW_ID","ROW_VERSION","inBED")] - } else{ - to_update = updateMutData[((rows*chunk)+1):((rows+1)*chunk),c("ROW_ID","ROW_VERSION","inBED")] - } - synStore(Table(mafSynId, to_update)) - rows = rows+1 - } - #synStore(Table(mafSynId, updateMutData[c("ROW_ID","ROW_VERSION","inBED")])) - #synStore(Table(mafSynId, "update_inbed.csv")) -} \ No newline at end of file +# #it is an amazon problem, where amazon kills the connection while reading the csv from s3. +# chunk = 100000 +# rows = 0 +# while (rows*chunk < nrow(updateMutData)) { +# if ((rows + 1)* chunk > nrow(updateMutData)) { +# to_update = updateMutData[((rows*chunk)+1):nrow(updateMutData),c("ROW_ID","ROW_VERSION","inBED")] +# } else{ +# to_update = updateMutData[((rows*chunk)+1):((rows+1)*chunk),c("ROW_ID","ROW_VERSION","inBED")] +# } +# synStore(Table(mafSynId, to_update)) +# rows = rows+1 +# } +# #synStore(Table(mafSynId, updateMutData[c("ROW_ID","ROW_VERSION","inBED")])) +# #synStore(Table(mafSynId, "update_inbed.csv")) +# } +updateMutData = genieMutData[genieMutData$inBED == FALSE,c('Chromosome', 'Start_Position', 'End_Position', 'Reference_Allele', 'Tumor_Seq_Allele2', 'Tumor_Sample_Barcode', 'Center')] +write.csv(updateMutData,args[2],row.names = F) diff --git a/genie/database_to_staging.py b/genie/database_to_staging.py index cca5294d..08450160 100644 --- a/genie/database_to_staging.py +++ b/genie/database_to_staging.py @@ -151,12 +151,15 @@ def configureMafRow(rowArray, headers, keepSamples, remove_variants): #Run MAF in BED script, filter data and update MAFinBED database def runMAFinBED(syn, CENTER_MAPPING_DF, databaseSynIdMappingDf, test=False, genieVersion="test"): MAFinBED_script = os.path.join(os.path.dirname(os.path.abspath(__file__)),'../analyses/genomicData/MAFinBED.R') - command = ['Rscript', MAFinBED_script, str(test)] + notinbed_variant_file = os.path.join(os.path.dirname(os.path.abspath(__file__)),'../analyses/genomicData/notinbed.csv') + + command = ['Rscript', MAFinBED_script, str(test), notinbed_variant_file] subprocess.check_call(command) - mutationSynId = databaseSynIdMappingDf['Id'][databaseSynIdMappingDf['Database'] == "vcf2maf"][0] - removedVariants = syn.tableQuery("select Chromosome, Start_Position, End_Position, Reference_Allele, Tumor_Seq_Allele2, Tumor_Sample_Barcode, Center from %s where inBED is False and Center in ('%s')" % (mutationSynId,"','".join(CENTER_MAPPING_DF.center))) - removedVariantsDf = removedVariants.asDataFrame() + # mutationSynId = databaseSynIdMappingDf['Id'][databaseSynIdMappingDf['Database'] == "vcf2maf"][0] + # removedVariants = syn.tableQuery("select Chromosome, Start_Position, End_Position, Reference_Allele, Tumor_Seq_Allele2, Tumor_Sample_Barcode, Center from %s where inBED is False and Center in ('%s')" % (mutationSynId,"','".join(CENTER_MAPPING_DF.center))) + # removedVariantsDf = removedVariants.asDataFrame() + removedVariantsDf = pd.read_csv(notinbed_variant_file) removedVariantsDf['removeVariants'] = removedVariantsDf['Chromosome'].astype(str) + ' ' + removedVariantsDf['Start_Position'].astype(str) + ' ' + removedVariantsDf['End_Position'].astype(str) + ' ' + removedVariantsDf['Reference_Allele'].astype(str) + ' ' + removedVariantsDf['Tumor_Seq_Allele2'].astype(str) + ' 
' + removedVariantsDf['Tumor_Sample_Barcode'].astype(str)
     #Store filtered variants
     for center in removedVariantsDf['Center'].unique():

From aa8a968fbff5b799fde5aa1fd94cc5eabc96eea7 Mon Sep 17 00:00:00 2001
From: thomasyu888
Date: Thu, 31 Jan 2019 12:43:37 -0800
Subject: [PATCH 11/27] Make sure code is in oncotree, if not, return null

---
 genie/database_to_staging.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/genie/database_to_staging.py b/genie/database_to_staging.py
index 08450160..c408ac74 100644
--- a/genie/database_to_staging.py
+++ b/genie/database_to_staging.py
@@ -283,10 +283,10 @@ def stagingToCbio(syn, processingDate, genieVersion, CENTER_MAPPING_DF, database
                         'CANCER_TYPE_DETAILED': 'UNKNOWN',
                         'ONCOTREE_PRIMARY_NODE': 'UNKNOWN',
                         'ONCOTREE_SECONDARY_NODE': 'UNKNOWN'}
-    clinicalDf['CANCER_TYPE'] = [oncotreeDict[code.upper()].get("CANCER_TYPE",float('nan')) for code in clinicalDf['ONCOTREE_CODE']]
-    clinicalDf['CANCER_TYPE_DETAILED'] = [oncotreeDict[code.upper()].get("CANCER_TYPE_DETAILED",float('nan')) for code in clinicalDf['ONCOTREE_CODE']]
-    clinicalDf['ONCOTREE_PRIMARY_NODE'] = [oncotreeDict[code.upper()].get("ONCOTREE_PRIMARY_NODE",float('nan')) for code in clinicalDf['ONCOTREE_CODE']]
-    clinicalDf['ONCOTREE_SECONDARY_NODE'] = [oncotreeDict[code.upper()].get("ONCOTREE_SECONDARY_NODE",float('nan')) for code in clinicalDf['ONCOTREE_CODE']]
+    clinicalDf['CANCER_TYPE'] = [oncotreeDict[code.upper()]["CANCER_TYPE"] if code.upper() in oncotreeDict.keys() else float('nan') for code in clinicalDf['ONCOTREE_CODE']]
+    clinicalDf['CANCER_TYPE_DETAILED'] = [oncotreeDict[code.upper()]["CANCER_TYPE_DETAILED"] if code.upper() in oncotreeDict.keys() else float('nan') for code in clinicalDf['ONCOTREE_CODE']]
+    clinicalDf['ONCOTREE_PRIMARY_NODE'] = [oncotreeDict[code.upper()]["ONCOTREE_PRIMARY_NODE"] if code.upper() in oncotreeDict.keys() else float('nan') for code in clinicalDf['ONCOTREE_CODE']]
+    clinicalDf['ONCOTREE_SECONDARY_NODE'] = [oncotreeDict[code.upper()]["ONCOTREE_SECONDARY_NODE"] if code.upper() in oncotreeDict.keys() else float('nan') for code in clinicalDf['ONCOTREE_CODE']]

     #CANCER TYPES are added which is why the clinical file is written out.
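# Aside: a minimal sketch of the lookup bug patch 11 fixes (the toy mapping
# below is hypothetical). The old .get() could not guard the lookup, because
# the KeyError comes from oncotreeDict[code.upper()] before .get() ever runs;
# checking membership first yields nan for retired or unmapped ONCOTREE codes.
oncotreeDict = {'LUAD': {'CANCER_TYPE': 'Non-Small Cell Lung Cancer'}}
codes = ['LUAD', 'RETIRED_CODE']
cancer_types = [oncotreeDict[c.upper()]['CANCER_TYPE']
                if c.upper() in oncotreeDict else float('nan')
                for c in codes]
# cancer_types == ['Non-Small Cell Lung Cancer', nan]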
#clinicalDf.to_csv(CLINCICAL_PATH, sep="\t", index=False) From 5590684e08bd921ce9d3e5dbaae3558364572fc0 Mon Sep 17 00:00:00 2001 From: thomasyu888 Date: Thu, 31 Jan 2019 13:48:42 -0800 Subject: [PATCH 12/27] Use lists --- genie/database_to_staging.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/genie/database_to_staging.py b/genie/database_to_staging.py index c408ac74..d709f88a 100644 --- a/genie/database_to_staging.py +++ b/genie/database_to_staging.py @@ -129,7 +129,8 @@ def configureMafRow(rowArray, headers, keepSamples, remove_variants): seq = str(rowArray[headers.index('Tumor_Seq_Allele2')]) sampleId = str(rowArray[headers.index('Tumor_Sample_Barcode')]) variant = chrom +' '+ start+ ' '+end +' '+ref + ' '+ seq+ ' ' + sampleId - if pd.Series(sampleId).isin(keepSamples).any() and not pd.Series(variant).isin(remove_variants).any(): + #if pd.Series(sampleId).isin(keepSamples).any() and not pd.Series(variant).isin(remove_variants).any(): + if sampleId in keepSamples.tolist() and not variant in remove_variants.tolist(): fillnas = ['t_depth','t_ref_count','t_alt_count','n_depth','n_ref_count','n_alt_count'] for i in fillnas: #mutationsDf[i] = mutationsDf[i].fillna("NA") From a9637f5f979445b0af2e55387c695ba247da00a5 Mon Sep 17 00:00:00 2001 From: thomasyu888 Date: Thu, 31 Jan 2019 16:44:45 -0800 Subject: [PATCH 13/27] Exclude another panel --- analyses/genomicData/MAFinBED.R | 4 ++++ genie/database_to_staging.py | 4 +++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/analyses/genomicData/MAFinBED.R b/analyses/genomicData/MAFinBED.R index c0f6ab48..4a5863a0 100644 --- a/analyses/genomicData/MAFinBED.R +++ b/analyses/genomicData/MAFinBED.R @@ -37,6 +37,10 @@ sampleData$AGE_AT_SEQ_REPORT_NUMERICAL <- NULL patientData$BIRTH_YEAR_NUMERICAL <- NULL patientData$CENTER <- NULL genieClinData <- merge.data.frame(patientData, sampleData, by="PATIENT_ID") + +#EXCLUDE PHS-TRISEQ-V1 SAMPLES +genieClinData <- genieClinData[genieClinData$SEQ_ASSAY_ID != "PHS-TRISEQ-V1",] + # read aggregated BED file data genieBed = synTableQuery(sprintf('SELECT * FROM %s', bedSynId),includeRowIdAndRowVersion=F) genieBedData = synapser::as.data.frame(genieBed) diff --git a/genie/database_to_staging.py b/genie/database_to_staging.py index d709f88a..0ca1feea 100644 --- a/genie/database_to_staging.py +++ b/genie/database_to_staging.py @@ -305,9 +305,11 @@ def stagingToCbio(syn, processingDate, genieVersion, CENTER_MAPPING_DF, database clinicalDf['AGE_AT_SEQ_REPORT'][clinicalDf['AGE_AT_SEQ_REPORT'] == "<6570"] = "<18" ############################################################ - #CENTER SPECIFIC CODE FOR RIGHT NOW (REMOVE UHN-555-V1) + #CENTER SPECIFIC CODE FOR RIGHT NOW (REMOVE UHN-555-V1, PHS-TRISEQ-V1) ############################################################ clinicalDf = clinicalDf[clinicalDf['SEQ_ASSAY_ID'] != "UHN-555-V1"] + clinicalDf = clinicalDf[clinicalDf['SEQ_ASSAY_ID'] != "PHS-TRISEQ-V1"] + #clinicalDf = clinicalDf[clinicalDf['CENTER'] != "WAKE"] #clinicalDf = clinicalDf[clinicalDf['CENTER'] != "CRUK"] ############################################################ From 2b7cd1c3b81e94de15e0ce127cb575c426564df3 Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Fri, 1 Feb 2019 12:53:05 -0800 Subject: [PATCH 14/27] Comment out maf for now, and fix clinical validation error --- genie/database_to_staging.py | 72 ++++++++++++++++++------------------ 1 file changed, 37 insertions(+), 35 deletions(-) diff --git a/genie/database_to_staging.py b/genie/database_to_staging.py index 
0ca1feea..0dc708b0 100644 --- a/genie/database_to_staging.py +++ b/genie/database_to_staging.py @@ -246,6 +246,8 @@ def stagingToCbio(syn, processingDate, genieVersion, CENTER_MAPPING_DF, database totalSample = ['PATIENT_ID'] totalSample.extend(sampleCols) sampleCols = totalSample + #Make sure to only grab samples that have patient information + sampleDf = sampleDf[sampleDf['PATIENT_ID'].isin(patientDf['PATIENT_ID'])] clinicalDf = sampleDf.merge(patientDf, on="PATIENT_ID",how="outer") #Remove patients without any sample or patient ids clinicalDf = clinicalDf[~clinicalDf['SAMPLE_ID'].isnull()] @@ -363,41 +365,41 @@ def stagingToCbio(syn, processingDate, genieVersion, CENTER_MAPPING_DF, database sequenced_samples = "#sequenced_samples: " + " ".join(samples) logger.info("FILTERING, STORING MUTATION FILES") - centerMafFileViewSynId = databaseSynIdMappingDf['Id'][databaseSynIdMappingDf['Database'] == "centerMafView"][0] - centerMafSynIds = syn.tableQuery("select id from %s " % centerMafFileViewSynId + "where name like '%mutation%'") - centerMafSynIdsDf = centerMafSynIds.asDataFrame() - with open(MUTATIONS_PATH, 'w') as f: - f.write(sequenced_samples + "\n") - for index, mafSynId in enumerate(centerMafSynIdsDf.id): - mafEnt = syn.get(mafSynId) - logger.info(mafEnt.path) - with open(mafEnt.path,"r") as mafFile: - header = mafFile.readline() - headers = header.replace("\n","").split("\t") - if index == 0: - with open(MUTATIONS_PATH, 'a') as f: - f.write(header) - #Create maf file per center for their staging directory - for center in clinicalDf['CENTER'].unique(): - with open(MUTATIONS_CENTER_PATH % center, 'w') as f: - f.write(header) - # with open(mafEnt.path,"r") as newMafFile: - # newMafFile.readline() - center = mafEnt.path.split("_")[3] - #Make sure to only write the centers that release = True - if center in CENTER_MAPPING_DF.center.tolist(): - for row in mafFile: - rowArray = row.replace("\n","").split("\t") - center = rowArray[headers.index('Center')] - newMergedRow = configureMafRow(rowArray, headers, keepForMergedConsortiumSamples, remove_mafInBed_variants) - if newMergedRow is not None: - with open(MUTATIONS_PATH, 'a') as f: - f.write(newMergedRow) - newCenterRow = configureMafRow(rowArray, headers, keepForCenterConsortiumSamples, remove_mafInBed_variants) - if newCenterRow is not None: - with open(MUTATIONS_CENTER_PATH % center, 'a') as f: - f.write(newCenterRow) - storeFile(syn, MUTATIONS_PATH, parent= consortiumReleaseSynId, genieVersion=genieVersion, name="data_mutations_extended.txt", staging=current_release_staging) + # centerMafFileViewSynId = databaseSynIdMappingDf['Id'][databaseSynIdMappingDf['Database'] == "centerMafView"][0] + # centerMafSynIds = syn.tableQuery("select id from %s " % centerMafFileViewSynId + "where name like '%mutation%'") + # centerMafSynIdsDf = centerMafSynIds.asDataFrame() + # with open(MUTATIONS_PATH, 'w') as f: + # f.write(sequenced_samples + "\n") + # for index, mafSynId in enumerate(centerMafSynIdsDf.id): + # mafEnt = syn.get(mafSynId) + # logger.info(mafEnt.path) + # with open(mafEnt.path,"r") as mafFile: + # header = mafFile.readline() + # headers = header.replace("\n","").split("\t") + # if index == 0: + # with open(MUTATIONS_PATH, 'a') as f: + # f.write(header) + # #Create maf file per center for their staging directory + # for center in clinicalDf['CENTER'].unique(): + # with open(MUTATIONS_CENTER_PATH % center, 'w') as f: + # f.write(header) + # # with open(mafEnt.path,"r") as newMafFile: + # # newMafFile.readline() + # center = 
mafEnt.path.split("_")[3] + # #Make sure to only write the centers that release = True + # if center in CENTER_MAPPING_DF.center.tolist(): + # for row in mafFile: + # rowArray = row.replace("\n","").split("\t") + # center = rowArray[headers.index('Center')] + # newMergedRow = configureMafRow(rowArray, headers, keepForMergedConsortiumSamples, remove_mafInBed_variants) + # if newMergedRow is not None: + # with open(MUTATIONS_PATH, 'a') as f: + # f.write(newMergedRow) + # newCenterRow = configureMafRow(rowArray, headers, keepForCenterConsortiumSamples, remove_mafInBed_variants) + # if newCenterRow is not None: + # with open(MUTATIONS_CENTER_PATH % center, 'a') as f: + # f.write(newCenterRow) + # storeFile(syn, MUTATIONS_PATH, parent= consortiumReleaseSynId, genieVersion=genieVersion, name="data_mutations_extended.txt", staging=current_release_staging) if not current_release_staging: for center in clinicalDf['CENTER'].unique(): storeFile(syn, MUTATIONS_CENTER_PATH % center, genieVersion=genieVersion, parent = CENTER_MAPPING_DF['stagingSynId'][CENTER_MAPPING_DF['center'] == center][0], centerStaging=True) From 525e8d988489528d528e42d2a839810c3ee444d2 Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Fri, 1 Feb 2019 12:53:58 -0800 Subject: [PATCH 15/27] comment out filter --- genie/database_to_staging.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/genie/database_to_staging.py b/genie/database_to_staging.py index 0dc708b0..57e9cce9 100644 --- a/genie/database_to_staging.py +++ b/genie/database_to_staging.py @@ -266,8 +266,8 @@ def stagingToCbio(syn, processingDate, genieVersion, CENTER_MAPPING_DF, database #########FILTERING######### logger.info("REMOVING PHI") clinicalDf = reAnnotatePHI(clinicalDf) - logger.info("MAF IN BED FILTER") - remove_mafInBed_variants = runMAFinBED(syn, CENTER_MAPPING_DF, databaseSynIdMappingDf, test, genieVersion=genieVersion) + #logger.info("MAF IN BED FILTER") + #remove_mafInBed_variants = runMAFinBED(syn, CENTER_MAPPING_DF, databaseSynIdMappingDf, test, genieVersion=genieVersion) logger.info("MUTATION IN CIS FILTER") remove_mutationInCis_samples = mutation_in_cis_filter(syn, skipMutationsInCis, test, variant_filtering_synId, CENTER_MAPPING_DF, genieVersion=genieVersion) logger.info("SEQ DATE FILTER") From abf0752cf722769fb71ec419b1c2c5a7e58a8367 Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Fri, 1 Feb 2019 12:56:38 -0800 Subject: [PATCH 16/27] Uncomment --- genie/database_to_staging.py | 74 ++++++++++++++++++------------------ 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/genie/database_to_staging.py b/genie/database_to_staging.py index 57e9cce9..0dc6ac43 100644 --- a/genie/database_to_staging.py +++ b/genie/database_to_staging.py @@ -266,8 +266,8 @@ def stagingToCbio(syn, processingDate, genieVersion, CENTER_MAPPING_DF, database #########FILTERING######### logger.info("REMOVING PHI") clinicalDf = reAnnotatePHI(clinicalDf) - #logger.info("MAF IN BED FILTER") - #remove_mafInBed_variants = runMAFinBED(syn, CENTER_MAPPING_DF, databaseSynIdMappingDf, test, genieVersion=genieVersion) + logger.info("MAF IN BED FILTER") + remove_mafInBed_variants = runMAFinBED(syn, CENTER_MAPPING_DF, databaseSynIdMappingDf, test, genieVersion=genieVersion) logger.info("MUTATION IN CIS FILTER") remove_mutationInCis_samples = mutation_in_cis_filter(syn, skipMutationsInCis, test, variant_filtering_synId, CENTER_MAPPING_DF, genieVersion=genieVersion) logger.info("SEQ DATE FILTER") @@ -365,41 +365,41 @@ def stagingToCbio(syn, processingDate, genieVersion, 
CENTER_MAPPING_DF, database sequenced_samples = "#sequenced_samples: " + " ".join(samples) logger.info("FILTERING, STORING MUTATION FILES") - # centerMafFileViewSynId = databaseSynIdMappingDf['Id'][databaseSynIdMappingDf['Database'] == "centerMafView"][0] - # centerMafSynIds = syn.tableQuery("select id from %s " % centerMafFileViewSynId + "where name like '%mutation%'") - # centerMafSynIdsDf = centerMafSynIds.asDataFrame() - # with open(MUTATIONS_PATH, 'w') as f: - # f.write(sequenced_samples + "\n") - # for index, mafSynId in enumerate(centerMafSynIdsDf.id): - # mafEnt = syn.get(mafSynId) - # logger.info(mafEnt.path) - # with open(mafEnt.path,"r") as mafFile: - # header = mafFile.readline() - # headers = header.replace("\n","").split("\t") - # if index == 0: - # with open(MUTATIONS_PATH, 'a') as f: - # f.write(header) - # #Create maf file per center for their staging directory - # for center in clinicalDf['CENTER'].unique(): - # with open(MUTATIONS_CENTER_PATH % center, 'w') as f: - # f.write(header) - # # with open(mafEnt.path,"r") as newMafFile: - # # newMafFile.readline() - # center = mafEnt.path.split("_")[3] - # #Make sure to only write the centers that release = True - # if center in CENTER_MAPPING_DF.center.tolist(): - # for row in mafFile: - # rowArray = row.replace("\n","").split("\t") - # center = rowArray[headers.index('Center')] - # newMergedRow = configureMafRow(rowArray, headers, keepForMergedConsortiumSamples, remove_mafInBed_variants) - # if newMergedRow is not None: - # with open(MUTATIONS_PATH, 'a') as f: - # f.write(newMergedRow) - # newCenterRow = configureMafRow(rowArray, headers, keepForCenterConsortiumSamples, remove_mafInBed_variants) - # if newCenterRow is not None: - # with open(MUTATIONS_CENTER_PATH % center, 'a') as f: - # f.write(newCenterRow) - # storeFile(syn, MUTATIONS_PATH, parent= consortiumReleaseSynId, genieVersion=genieVersion, name="data_mutations_extended.txt", staging=current_release_staging) + centerMafFileViewSynId = databaseSynIdMappingDf['Id'][databaseSynIdMappingDf['Database'] == "centerMafView"][0] + centerMafSynIds = syn.tableQuery("select id from %s " % centerMafFileViewSynId + "where name like '%mutation%'") + centerMafSynIdsDf = centerMafSynIds.asDataFrame() + with open(MUTATIONS_PATH, 'w') as f: + f.write(sequenced_samples + "\n") + for index, mafSynId in enumerate(centerMafSynIdsDf.id): + mafEnt = syn.get(mafSynId) + logger.info(mafEnt.path) + with open(mafEnt.path,"r") as mafFile: + header = mafFile.readline() + headers = header.replace("\n","").split("\t") + if index == 0: + with open(MUTATIONS_PATH, 'a') as f: + f.write(header) + #Create maf file per center for their staging directory + for center in clinicalDf['CENTER'].unique(): + with open(MUTATIONS_CENTER_PATH % center, 'w') as f: + f.write(header) + # with open(mafEnt.path,"r") as newMafFile: + # newMafFile.readline() + center = mafEnt.path.split("_")[3] + #Make sure to only write the centers that release = True + if center in CENTER_MAPPING_DF.center.tolist(): + for row in mafFile: + rowArray = row.replace("\n","").split("\t") + center = rowArray[headers.index('Center')] + newMergedRow = configureMafRow(rowArray, headers, keepForMergedConsortiumSamples, remove_mafInBed_variants) + if newMergedRow is not None: + with open(MUTATIONS_PATH, 'a') as f: + f.write(newMergedRow) + newCenterRow = configureMafRow(rowArray, headers, keepForCenterConsortiumSamples, remove_mafInBed_variants) + if newCenterRow is not None: + with open(MUTATIONS_CENTER_PATH % center, 'a') as f: + 
f.write(newCenterRow) + storeFile(syn, MUTATIONS_PATH, parent= consortiumReleaseSynId, genieVersion=genieVersion, name="data_mutations_extended.txt", staging=current_release_staging) if not current_release_staging: for center in clinicalDf['CENTER'].unique(): storeFile(syn, MUTATIONS_CENTER_PATH % center, genieVersion=genieVersion, parent = CENTER_MAPPING_DF['stagingSynId'][CENTER_MAPPING_DF['center'] == center][0], centerStaging=True) From 6dca94d1887e183338d44dbe39e527c4c40033c5 Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Fri, 1 Feb 2019 13:13:52 -0800 Subject: [PATCH 17/27] Comment out --- genie/database_to_staging.py | 74 ++++++++++++++++++------------------ 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/genie/database_to_staging.py b/genie/database_to_staging.py index 0dc6ac43..167af655 100644 --- a/genie/database_to_staging.py +++ b/genie/database_to_staging.py @@ -266,8 +266,8 @@ def stagingToCbio(syn, processingDate, genieVersion, CENTER_MAPPING_DF, database #########FILTERING######### logger.info("REMOVING PHI") clinicalDf = reAnnotatePHI(clinicalDf) - logger.info("MAF IN BED FILTER") - remove_mafInBed_variants = runMAFinBED(syn, CENTER_MAPPING_DF, databaseSynIdMappingDf, test, genieVersion=genieVersion) + #logger.info("MAF IN BED FILTER") + #remove_mafInBed_variants = runMAFinBED(syn, CENTER_MAPPING_DF, databaseSynIdMappingDf, test, genieVersion=genieVersion) logger.info("MUTATION IN CIS FILTER") remove_mutationInCis_samples = mutation_in_cis_filter(syn, skipMutationsInCis, test, variant_filtering_synId, CENTER_MAPPING_DF, genieVersion=genieVersion) logger.info("SEQ DATE FILTER") @@ -368,41 +368,41 @@ def stagingToCbio(syn, processingDate, genieVersion, CENTER_MAPPING_DF, database centerMafFileViewSynId = databaseSynIdMappingDf['Id'][databaseSynIdMappingDf['Database'] == "centerMafView"][0] centerMafSynIds = syn.tableQuery("select id from %s " % centerMafFileViewSynId + "where name like '%mutation%'") centerMafSynIdsDf = centerMafSynIds.asDataFrame() - with open(MUTATIONS_PATH, 'w') as f: - f.write(sequenced_samples + "\n") - for index, mafSynId in enumerate(centerMafSynIdsDf.id): - mafEnt = syn.get(mafSynId) - logger.info(mafEnt.path) - with open(mafEnt.path,"r") as mafFile: - header = mafFile.readline() - headers = header.replace("\n","").split("\t") - if index == 0: - with open(MUTATIONS_PATH, 'a') as f: - f.write(header) - #Create maf file per center for their staging directory - for center in clinicalDf['CENTER'].unique(): - with open(MUTATIONS_CENTER_PATH % center, 'w') as f: - f.write(header) - # with open(mafEnt.path,"r") as newMafFile: - # newMafFile.readline() - center = mafEnt.path.split("_")[3] - #Make sure to only write the centers that release = True - if center in CENTER_MAPPING_DF.center.tolist(): - for row in mafFile: - rowArray = row.replace("\n","").split("\t") - center = rowArray[headers.index('Center')] - newMergedRow = configureMafRow(rowArray, headers, keepForMergedConsortiumSamples, remove_mafInBed_variants) - if newMergedRow is not None: - with open(MUTATIONS_PATH, 'a') as f: - f.write(newMergedRow) - newCenterRow = configureMafRow(rowArray, headers, keepForCenterConsortiumSamples, remove_mafInBed_variants) - if newCenterRow is not None: - with open(MUTATIONS_CENTER_PATH % center, 'a') as f: - f.write(newCenterRow) - storeFile(syn, MUTATIONS_PATH, parent= consortiumReleaseSynId, genieVersion=genieVersion, name="data_mutations_extended.txt", staging=current_release_staging) - if not current_release_staging: - for center in 
clinicalDf['CENTER'].unique(): - storeFile(syn, MUTATIONS_CENTER_PATH % center, genieVersion=genieVersion, parent = CENTER_MAPPING_DF['stagingSynId'][CENTER_MAPPING_DF['center'] == center][0], centerStaging=True) + # with open(MUTATIONS_PATH, 'w') as f: + # f.write(sequenced_samples + "\n") + # for index, mafSynId in enumerate(centerMafSynIdsDf.id): + # mafEnt = syn.get(mafSynId) + # logger.info(mafEnt.path) + # with open(mafEnt.path,"r") as mafFile: + # header = mafFile.readline() + # headers = header.replace("\n","").split("\t") + # if index == 0: + # with open(MUTATIONS_PATH, 'a') as f: + # f.write(header) + # #Create maf file per center for their staging directory + # for center in clinicalDf['CENTER'].unique(): + # with open(MUTATIONS_CENTER_PATH % center, 'w') as f: + # f.write(header) + # # with open(mafEnt.path,"r") as newMafFile: + # # newMafFile.readline() + # center = mafEnt.path.split("_")[3] + # #Make sure to only write the centers that release = True + # if center in CENTER_MAPPING_DF.center.tolist(): + # for row in mafFile: + # rowArray = row.replace("\n","").split("\t") + # center = rowArray[headers.index('Center')] + # newMergedRow = configureMafRow(rowArray, headers, keepForMergedConsortiumSamples, remove_mafInBed_variants) + # if newMergedRow is not None: + # with open(MUTATIONS_PATH, 'a') as f: + # f.write(newMergedRow) + # newCenterRow = configureMafRow(rowArray, headers, keepForCenterConsortiumSamples, remove_mafInBed_variants) + # if newCenterRow is not None: + # with open(MUTATIONS_CENTER_PATH % center, 'a') as f: + # f.write(newCenterRow) + # storeFile(syn, MUTATIONS_PATH, parent= consortiumReleaseSynId, genieVersion=genieVersion, name="data_mutations_extended.txt", staging=current_release_staging) + # if not current_release_staging: + # for center in clinicalDf['CENTER'].unique(): + # storeFile(syn, MUTATIONS_CENTER_PATH % center, genieVersion=genieVersion, parent = CENTER_MAPPING_DF['stagingSynId'][CENTER_MAPPING_DF['center'] == center][0], centerStaging=True) #Only need to upload these files once #if filtering: From 04ddf9e411bd8af17d4de6c3b87ede0455b67430 Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Fri, 1 Feb 2019 13:16:10 -0800 Subject: [PATCH 18/27] uncomment --- genie/database_to_staging.py | 74 ++++++++++++++++++------------------ 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/genie/database_to_staging.py b/genie/database_to_staging.py index 167af655..0dc6ac43 100644 --- a/genie/database_to_staging.py +++ b/genie/database_to_staging.py @@ -266,8 +266,8 @@ def stagingToCbio(syn, processingDate, genieVersion, CENTER_MAPPING_DF, database #########FILTERING######### logger.info("REMOVING PHI") clinicalDf = reAnnotatePHI(clinicalDf) - #logger.info("MAF IN BED FILTER") - #remove_mafInBed_variants = runMAFinBED(syn, CENTER_MAPPING_DF, databaseSynIdMappingDf, test, genieVersion=genieVersion) + logger.info("MAF IN BED FILTER") + remove_mafInBed_variants = runMAFinBED(syn, CENTER_MAPPING_DF, databaseSynIdMappingDf, test, genieVersion=genieVersion) logger.info("MUTATION IN CIS FILTER") remove_mutationInCis_samples = mutation_in_cis_filter(syn, skipMutationsInCis, test, variant_filtering_synId, CENTER_MAPPING_DF, genieVersion=genieVersion) logger.info("SEQ DATE FILTER") @@ -368,41 +368,41 @@ def stagingToCbio(syn, processingDate, genieVersion, CENTER_MAPPING_DF, database centerMafFileViewSynId = databaseSynIdMappingDf['Id'][databaseSynIdMappingDf['Database'] == "centerMafView"][0] centerMafSynIds = syn.tableQuery("select id from %s " % 
centerMafFileViewSynId + "where name like '%mutation%'") centerMafSynIdsDf = centerMafSynIds.asDataFrame() - # with open(MUTATIONS_PATH, 'w') as f: - # f.write(sequenced_samples + "\n") - # for index, mafSynId in enumerate(centerMafSynIdsDf.id): - # mafEnt = syn.get(mafSynId) - # logger.info(mafEnt.path) - # with open(mafEnt.path,"r") as mafFile: - # header = mafFile.readline() - # headers = header.replace("\n","").split("\t") - # if index == 0: - # with open(MUTATIONS_PATH, 'a') as f: - # f.write(header) - # #Create maf file per center for their staging directory - # for center in clinicalDf['CENTER'].unique(): - # with open(MUTATIONS_CENTER_PATH % center, 'w') as f: - # f.write(header) - # # with open(mafEnt.path,"r") as newMafFile: - # # newMafFile.readline() - # center = mafEnt.path.split("_")[3] - # #Make sure to only write the centers that release = True - # if center in CENTER_MAPPING_DF.center.tolist(): - # for row in mafFile: - # rowArray = row.replace("\n","").split("\t") - # center = rowArray[headers.index('Center')] - # newMergedRow = configureMafRow(rowArray, headers, keepForMergedConsortiumSamples, remove_mafInBed_variants) - # if newMergedRow is not None: - # with open(MUTATIONS_PATH, 'a') as f: - # f.write(newMergedRow) - # newCenterRow = configureMafRow(rowArray, headers, keepForCenterConsortiumSamples, remove_mafInBed_variants) - # if newCenterRow is not None: - # with open(MUTATIONS_CENTER_PATH % center, 'a') as f: - # f.write(newCenterRow) - # storeFile(syn, MUTATIONS_PATH, parent= consortiumReleaseSynId, genieVersion=genieVersion, name="data_mutations_extended.txt", staging=current_release_staging) - # if not current_release_staging: - # for center in clinicalDf['CENTER'].unique(): - # storeFile(syn, MUTATIONS_CENTER_PATH % center, genieVersion=genieVersion, parent = CENTER_MAPPING_DF['stagingSynId'][CENTER_MAPPING_DF['center'] == center][0], centerStaging=True) + with open(MUTATIONS_PATH, 'w') as f: + f.write(sequenced_samples + "\n") + for index, mafSynId in enumerate(centerMafSynIdsDf.id): + mafEnt = syn.get(mafSynId) + logger.info(mafEnt.path) + with open(mafEnt.path,"r") as mafFile: + header = mafFile.readline() + headers = header.replace("\n","").split("\t") + if index == 0: + with open(MUTATIONS_PATH, 'a') as f: + f.write(header) + #Create maf file per center for their staging directory + for center in clinicalDf['CENTER'].unique(): + with open(MUTATIONS_CENTER_PATH % center, 'w') as f: + f.write(header) + # with open(mafEnt.path,"r") as newMafFile: + # newMafFile.readline() + center = mafEnt.path.split("_")[3] + #Make sure to only write the centers that release = True + if center in CENTER_MAPPING_DF.center.tolist(): + for row in mafFile: + rowArray = row.replace("\n","").split("\t") + center = rowArray[headers.index('Center')] + newMergedRow = configureMafRow(rowArray, headers, keepForMergedConsortiumSamples, remove_mafInBed_variants) + if newMergedRow is not None: + with open(MUTATIONS_PATH, 'a') as f: + f.write(newMergedRow) + newCenterRow = configureMafRow(rowArray, headers, keepForCenterConsortiumSamples, remove_mafInBed_variants) + if newCenterRow is not None: + with open(MUTATIONS_CENTER_PATH % center, 'a') as f: + f.write(newCenterRow) + storeFile(syn, MUTATIONS_PATH, parent= consortiumReleaseSynId, genieVersion=genieVersion, name="data_mutations_extended.txt", staging=current_release_staging) + if not current_release_staging: + for center in clinicalDf['CENTER'].unique(): + storeFile(syn, MUTATIONS_CENTER_PATH % center, genieVersion=genieVersion, 
parent = CENTER_MAPPING_DF['stagingSynId'][CENTER_MAPPING_DF['center'] == center][0], centerStaging=True) #Only need to upload these files once #if filtering: From f67ea50bb2cc3c31d4159997f65e60d309d2aa8a Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Fri, 1 Feb 2019 13:57:48 -0800 Subject: [PATCH 19/27] Comment --- genie/dashboard_table_updater.py | 2 +- genie/database_to_staging.py | 74 ++++++++++++++++---------------- 2 files changed, 38 insertions(+), 38 deletions(-) diff --git a/genie/dashboard_table_updater.py b/genie/dashboard_table_updater.py index 7d552fc4..b6ea22f4 100644 --- a/genie/dashboard_table_updater.py +++ b/genie/dashboard_table_updater.py @@ -256,7 +256,7 @@ def update_oncotree_code_tables(syn, database_mappingdf): oncotree_link_ent = syn.get(oncotree_link_synid) oncotree_link = oncotree_link_ent.externalURL oncotree_mapping = genie.process_functions.get_oncotree_code_mappings(oncotree_link) - clinicaldf['PRIMARY_CODES'] = [oncotree_mapping[i.upper()]['ONCOTREE_PRIMARY_NODE'] for i in clinicaldf.ONCOTREE_CODE] + clinicaldf['PRIMARY_CODES'] = [oncotree_mapping[i.upper()]['ONCOTREE_PRIMARY_NODE'] if i.upper() in oncotree_mapping.keys() else 'NOT_MAPPED' for i in clinicaldf.ONCOTREE_CODE] # ### DISTRIBUTION OF PRIMARY ONCOTREE CODE TABLE UPDATE primary_code_distributiondf = pd.DataFrame(columns=set(clinicaldf['CENTER']), index=set(clinicaldf['PRIMARY_CODES'])) diff --git a/genie/database_to_staging.py b/genie/database_to_staging.py index 0dc6ac43..167af655 100644 --- a/genie/database_to_staging.py +++ b/genie/database_to_staging.py @@ -266,8 +266,8 @@ def stagingToCbio(syn, processingDate, genieVersion, CENTER_MAPPING_DF, database #########FILTERING######### logger.info("REMOVING PHI") clinicalDf = reAnnotatePHI(clinicalDf) - logger.info("MAF IN BED FILTER") - remove_mafInBed_variants = runMAFinBED(syn, CENTER_MAPPING_DF, databaseSynIdMappingDf, test, genieVersion=genieVersion) + #logger.info("MAF IN BED FILTER") + #remove_mafInBed_variants = runMAFinBED(syn, CENTER_MAPPING_DF, databaseSynIdMappingDf, test, genieVersion=genieVersion) logger.info("MUTATION IN CIS FILTER") remove_mutationInCis_samples = mutation_in_cis_filter(syn, skipMutationsInCis, test, variant_filtering_synId, CENTER_MAPPING_DF, genieVersion=genieVersion) logger.info("SEQ DATE FILTER") @@ -368,41 +368,41 @@ def stagingToCbio(syn, processingDate, genieVersion, CENTER_MAPPING_DF, database centerMafFileViewSynId = databaseSynIdMappingDf['Id'][databaseSynIdMappingDf['Database'] == "centerMafView"][0] centerMafSynIds = syn.tableQuery("select id from %s " % centerMafFileViewSynId + "where name like '%mutation%'") centerMafSynIdsDf = centerMafSynIds.asDataFrame() - with open(MUTATIONS_PATH, 'w') as f: - f.write(sequenced_samples + "\n") - for index, mafSynId in enumerate(centerMafSynIdsDf.id): - mafEnt = syn.get(mafSynId) - logger.info(mafEnt.path) - with open(mafEnt.path,"r") as mafFile: - header = mafFile.readline() - headers = header.replace("\n","").split("\t") - if index == 0: - with open(MUTATIONS_PATH, 'a') as f: - f.write(header) - #Create maf file per center for their staging directory - for center in clinicalDf['CENTER'].unique(): - with open(MUTATIONS_CENTER_PATH % center, 'w') as f: - f.write(header) - # with open(mafEnt.path,"r") as newMafFile: - # newMafFile.readline() - center = mafEnt.path.split("_")[3] - #Make sure to only write the centers that release = True - if center in CENTER_MAPPING_DF.center.tolist(): - for row in mafFile: - rowArray = row.replace("\n","").split("\t") - center = 
rowArray[headers.index('Center')] - newMergedRow = configureMafRow(rowArray, headers, keepForMergedConsortiumSamples, remove_mafInBed_variants) - if newMergedRow is not None: - with open(MUTATIONS_PATH, 'a') as f: - f.write(newMergedRow) - newCenterRow = configureMafRow(rowArray, headers, keepForCenterConsortiumSamples, remove_mafInBed_variants) - if newCenterRow is not None: - with open(MUTATIONS_CENTER_PATH % center, 'a') as f: - f.write(newCenterRow) - storeFile(syn, MUTATIONS_PATH, parent= consortiumReleaseSynId, genieVersion=genieVersion, name="data_mutations_extended.txt", staging=current_release_staging) - if not current_release_staging: - for center in clinicalDf['CENTER'].unique(): - storeFile(syn, MUTATIONS_CENTER_PATH % center, genieVersion=genieVersion, parent = CENTER_MAPPING_DF['stagingSynId'][CENTER_MAPPING_DF['center'] == center][0], centerStaging=True) + # with open(MUTATIONS_PATH, 'w') as f: + # f.write(sequenced_samples + "\n") + # for index, mafSynId in enumerate(centerMafSynIdsDf.id): + # mafEnt = syn.get(mafSynId) + # logger.info(mafEnt.path) + # with open(mafEnt.path,"r") as mafFile: + # header = mafFile.readline() + # headers = header.replace("\n","").split("\t") + # if index == 0: + # with open(MUTATIONS_PATH, 'a') as f: + # f.write(header) + # #Create maf file per center for their staging directory + # for center in clinicalDf['CENTER'].unique(): + # with open(MUTATIONS_CENTER_PATH % center, 'w') as f: + # f.write(header) + # # with open(mafEnt.path,"r") as newMafFile: + # # newMafFile.readline() + # center = mafEnt.path.split("_")[3] + # #Make sure to only write the centers that release = True + # if center in CENTER_MAPPING_DF.center.tolist(): + # for row in mafFile: + # rowArray = row.replace("\n","").split("\t") + # center = rowArray[headers.index('Center')] + # newMergedRow = configureMafRow(rowArray, headers, keepForMergedConsortiumSamples, remove_mafInBed_variants) + # if newMergedRow is not None: + # with open(MUTATIONS_PATH, 'a') as f: + # f.write(newMergedRow) + # newCenterRow = configureMafRow(rowArray, headers, keepForCenterConsortiumSamples, remove_mafInBed_variants) + # if newCenterRow is not None: + # with open(MUTATIONS_CENTER_PATH % center, 'a') as f: + # f.write(newCenterRow) + # storeFile(syn, MUTATIONS_PATH, parent= consortiumReleaseSynId, genieVersion=genieVersion, name="data_mutations_extended.txt", staging=current_release_staging) + # if not current_release_staging: + # for center in clinicalDf['CENTER'].unique(): + # storeFile(syn, MUTATIONS_CENTER_PATH % center, genieVersion=genieVersion, parent = CENTER_MAPPING_DF['stagingSynId'][CENTER_MAPPING_DF['center'] == center][0], centerStaging=True) #Only need to upload these files once #if filtering: From bf0504761538a708098b703cda84f92f525e9c08 Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Fri, 1 Feb 2019 14:14:03 -0800 Subject: [PATCH 20/27] Fix oncotree mapping --- genie/dashboardTemplate.Rmd | 8 ++++++-- genie/dashboard_table_updater.py | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/genie/dashboardTemplate.Rmd b/genie/dashboardTemplate.Rmd index a13dfff8..87bd606f 100644 --- a/genie/dashboardTemplate.Rmd +++ b/genie/dashboardTemplate.Rmd @@ -104,7 +104,11 @@ plotPrimarySites <- function(clinical, oncotreeLink, release) { oncotree_json = data$TISSUE oncotreeDict = extract(oncotree_json, "", "") clinical$PRIMARY_CODES <- unlist(sapply(clinical$ONCOTREE_CODE, function(code) { - oncotreeDict[[toupper(code)]]["ONCOTREE_PRIMARY_NODE"] + if (toupper(code) %in% 
names(oncotreeDict)){ + oncotreeDict[[toupper(code)]]["ONCOTREE_PRIMARY_NODE"] + } else { + "DEPRECATED_CODE" + } })) }) clinical$CENTER = createCenterColumn(clinical) @@ -288,7 +292,7 @@ kable(samplesPerReleaseDf,row.names = F) #primary site distribution par(mar=c(10,3,3,1)) barplot(sort(log(table(this_mut$FILTER)),decreasing = T), main="Log Distribution of Mutation FILTERs",las=2) -plotPrimarySites(this_samples, "http://oncotree.mskcc.org/api/tumorTypes/tree?version=oncotree_2017_06_21","%s") +plotPrimarySites(this_samples, "http://oncotree.mskcc.org/api/tumorTypes/tree?version=oncotree_2018_06_01","%s") #Center X Race plotCenterXRace(this_patient) #Center X Ethnicity diff --git a/genie/dashboard_table_updater.py b/genie/dashboard_table_updater.py index b6ea22f4..285b0e6d 100644 --- a/genie/dashboard_table_updater.py +++ b/genie/dashboard_table_updater.py @@ -256,7 +256,7 @@ def update_oncotree_code_tables(syn, database_mappingdf): oncotree_link_ent = syn.get(oncotree_link_synid) oncotree_link = oncotree_link_ent.externalURL oncotree_mapping = genie.process_functions.get_oncotree_code_mappings(oncotree_link) - clinicaldf['PRIMARY_CODES'] = [oncotree_mapping[i.upper()]['ONCOTREE_PRIMARY_NODE'] if i.upper() in oncotree_mapping.keys() else 'NOT_MAPPED' for i in clinicaldf.ONCOTREE_CODE] + clinicaldf['PRIMARY_CODES'] = [oncotree_mapping[i.upper()]['ONCOTREE_PRIMARY_NODE'] if i.upper() in oncotree_mapping.keys() else 'DEPRECATED_CODE' for i in clinicaldf.ONCOTREE_CODE] # ### DISTRIBUTION OF PRIMARY ONCOTREE CODE TABLE UPDATE primary_code_distributiondf = pd.DataFrame(columns=set(clinicaldf['CENTER']), index=set(clinicaldf['PRIMARY_CODES'])) From afab94f40a65c3a030f3f351a5f87ce78a1e6749 Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Fri, 1 Feb 2019 14:17:06 -0800 Subject: [PATCH 21/27] uncomment --- genie/database_to_staging.py | 218 +++++++++++++++++------------------ 1 file changed, 109 insertions(+), 109 deletions(-) diff --git a/genie/database_to_staging.py b/genie/database_to_staging.py index 167af655..6ee6e5b7 100644 --- a/genie/database_to_staging.py +++ b/genie/database_to_staging.py @@ -266,8 +266,8 @@ def stagingToCbio(syn, processingDate, genieVersion, CENTER_MAPPING_DF, database #########FILTERING######### logger.info("REMOVING PHI") clinicalDf = reAnnotatePHI(clinicalDf) - #logger.info("MAF IN BED FILTER") - #remove_mafInBed_variants = runMAFinBED(syn, CENTER_MAPPING_DF, databaseSynIdMappingDf, test, genieVersion=genieVersion) + logger.info("MAF IN BED FILTER") + remove_mafInBed_variants = runMAFinBED(syn, CENTER_MAPPING_DF, databaseSynIdMappingDf, test, genieVersion=genieVersion) logger.info("MUTATION IN CIS FILTER") remove_mutationInCis_samples = mutation_in_cis_filter(syn, skipMutationsInCis, test, variant_filtering_synId, CENTER_MAPPING_DF, genieVersion=genieVersion) logger.info("SEQ DATE FILTER") @@ -368,41 +368,41 @@ def stagingToCbio(syn, processingDate, genieVersion, CENTER_MAPPING_DF, database centerMafFileViewSynId = databaseSynIdMappingDf['Id'][databaseSynIdMappingDf['Database'] == "centerMafView"][0] centerMafSynIds = syn.tableQuery("select id from %s " % centerMafFileViewSynId + "where name like '%mutation%'") centerMafSynIdsDf = centerMafSynIds.asDataFrame() - # with open(MUTATIONS_PATH, 'w') as f: - # f.write(sequenced_samples + "\n") - # for index, mafSynId in enumerate(centerMafSynIdsDf.id): - # mafEnt = syn.get(mafSynId) - # logger.info(mafEnt.path) - # with open(mafEnt.path,"r") as mafFile: - # header = mafFile.readline() - # headers = 
header.replace("\n","").split("\t") - # if index == 0: - # with open(MUTATIONS_PATH, 'a') as f: - # f.write(header) - # #Create maf file per center for their staging directory - # for center in clinicalDf['CENTER'].unique(): - # with open(MUTATIONS_CENTER_PATH % center, 'w') as f: - # f.write(header) - # # with open(mafEnt.path,"r") as newMafFile: - # # newMafFile.readline() - # center = mafEnt.path.split("_")[3] - # #Make sure to only write the centers that release = True - # if center in CENTER_MAPPING_DF.center.tolist(): - # for row in mafFile: - # rowArray = row.replace("\n","").split("\t") - # center = rowArray[headers.index('Center')] - # newMergedRow = configureMafRow(rowArray, headers, keepForMergedConsortiumSamples, remove_mafInBed_variants) - # if newMergedRow is not None: - # with open(MUTATIONS_PATH, 'a') as f: - # f.write(newMergedRow) - # newCenterRow = configureMafRow(rowArray, headers, keepForCenterConsortiumSamples, remove_mafInBed_variants) - # if newCenterRow is not None: - # with open(MUTATIONS_CENTER_PATH % center, 'a') as f: - # f.write(newCenterRow) - # storeFile(syn, MUTATIONS_PATH, parent= consortiumReleaseSynId, genieVersion=genieVersion, name="data_mutations_extended.txt", staging=current_release_staging) - # if not current_release_staging: - # for center in clinicalDf['CENTER'].unique(): - # storeFile(syn, MUTATIONS_CENTER_PATH % center, genieVersion=genieVersion, parent = CENTER_MAPPING_DF['stagingSynId'][CENTER_MAPPING_DF['center'] == center][0], centerStaging=True) + with open(MUTATIONS_PATH, 'w') as f: + f.write(sequenced_samples + "\n") + for index, mafSynId in enumerate(centerMafSynIdsDf.id): + mafEnt = syn.get(mafSynId) + logger.info(mafEnt.path) + with open(mafEnt.path,"r") as mafFile: + header = mafFile.readline() + headers = header.replace("\n","").split("\t") + if index == 0: + with open(MUTATIONS_PATH, 'a') as f: + f.write(header) + #Create maf file per center for their staging directory + for center in clinicalDf['CENTER'].unique(): + with open(MUTATIONS_CENTER_PATH % center, 'w') as f: + f.write(header) + # with open(mafEnt.path,"r") as newMafFile: + # newMafFile.readline() + center = mafEnt.path.split("_")[3] + #Make sure to only write the centers that release = True + if center in CENTER_MAPPING_DF.center.tolist(): + for row in mafFile: + rowArray = row.replace("\n","").split("\t") + center = rowArray[headers.index('Center')] + newMergedRow = configureMafRow(rowArray, headers, keepForMergedConsortiumSamples, remove_mafInBed_variants) + if newMergedRow is not None: + with open(MUTATIONS_PATH, 'a') as f: + f.write(newMergedRow) + newCenterRow = configureMafRow(rowArray, headers, keepForCenterConsortiumSamples, remove_mafInBed_variants) + if newCenterRow is not None: + with open(MUTATIONS_CENTER_PATH % center, 'a') as f: + f.write(newCenterRow) + storeFile(syn, MUTATIONS_PATH, parent= consortiumReleaseSynId, genieVersion=genieVersion, name="data_mutations_extended.txt", staging=current_release_staging) + if not current_release_staging: + for center in clinicalDf['CENTER'].unique(): + storeFile(syn, MUTATIONS_CENTER_PATH % center, genieVersion=genieVersion, parent = CENTER_MAPPING_DF['stagingSynId'][CENTER_MAPPING_DF['center'] == center][0], centerStaging=True) #Only need to upload these files once #if filtering: @@ -740,79 +740,79 @@ def main(): process.checkUrl(args.oncotreeLink) #get syn id of case list folder in consortium release - caseListSynId = findCaseListId(syn, consortiumSynId) - - if not args.test and not args.staging: - 
processTrackerSynId = databaseSynIdMappingDf['Id'][databaseSynIdMappingDf['Database'] == 'processTracker'].values[0] - processTracker = syn.tableQuery("SELECT timeStartProcessing FROM %s where center = 'SAGE' and processingType = 'dbToStage'" % processTrackerSynId) - processTrackerDf = processTracker.asDataFrame() - processTrackerDf['timeStartProcessing'][0] = str(int(time.time()*1000)) - syn.store(synapseclient.Table(processTrackerSynId,processTrackerDf)) - - syn.table_query_timeout = 50000 - centerMappingSynId = databaseSynIdMappingDf['Id'][databaseSynIdMappingDf['Database'] == 'centerMapping'].values[0] - #Only release files where release is true - CENTER_MAPPING = syn.tableQuery('SELECT * FROM %s where release is true' % centerMappingSynId) - CENTER_MAPPING_DF = CENTER_MAPPING.asDataFrame() - processingDate = datetime.datetime.strptime(args.processingDate, '%b-%Y') -####### - cbioValidatorPath = os.path.join(args.cbioportalPath,"core/src/main/scripts/importer/validateData.py") - assert os.path.exists(cbioValidatorPath), "Please specify correct cbioportalPath" - - logger.info("STAGING TO CONSORTIUM") - genePanelEntities = stagingToCbio(syn, processingDate, args.genieVersion, CENTER_MAPPING_DF, databaseSynIdMappingDf, oncotree_url=args.oncotreeLink, consortiumReleaseCutOff= args.consortiumReleaseCutOff, current_release_staging=args.staging, skipMutationsInCis=args.skipMutationsInCis, test=args.test) +# caseListSynId = findCaseListId(syn, consortiumSynId) + +# if not args.test and not args.staging: +# processTrackerSynId = databaseSynIdMappingDf['Id'][databaseSynIdMappingDf['Database'] == 'processTracker'].values[0] +# processTracker = syn.tableQuery("SELECT timeStartProcessing FROM %s where center = 'SAGE' and processingType = 'dbToStage'" % processTrackerSynId) +# processTrackerDf = processTracker.asDataFrame() +# processTrackerDf['timeStartProcessing'][0] = str(int(time.time()*1000)) +# syn.store(synapseclient.Table(processTrackerSynId,processTrackerDf)) + +# syn.table_query_timeout = 50000 +# centerMappingSynId = databaseSynIdMappingDf['Id'][databaseSynIdMappingDf['Database'] == 'centerMapping'].values[0] +# #Only release files where release is true +# CENTER_MAPPING = syn.tableQuery('SELECT * FROM %s where release is true' % centerMappingSynId) +# CENTER_MAPPING_DF = CENTER_MAPPING.asDataFrame() +# processingDate = datetime.datetime.strptime(args.processingDate, '%b-%Y') +# ####### +# cbioValidatorPath = os.path.join(args.cbioportalPath,"core/src/main/scripts/importer/validateData.py") +# assert os.path.exists(cbioValidatorPath), "Please specify correct cbioportalPath" + +# logger.info("STAGING TO CONSORTIUM") +# genePanelEntities = stagingToCbio(syn, processingDate, args.genieVersion, CENTER_MAPPING_DF, databaseSynIdMappingDf, oncotree_url=args.oncotreeLink, consortiumReleaseCutOff= args.consortiumReleaseCutOff, current_release_staging=args.staging, skipMutationsInCis=args.skipMutationsInCis, test=args.test) - #No need to run twice anymore - #stagingToCbio(syn, processingDate, args.genieVersion, CENTER_MAPPING_DF, databaseSynIdMappingDf, oncotree_url=args.oncotreeLink, consortiumReleaseCutOff= args.consortiumReleaseCutOff, filtering=True, current_release_staging=args.staging, test=args.test) - #Create case lists files - logger.info("CREATE CASE LIST FILES") - #Remove old caselists first - if not os.path.exists(CASE_LIST_PATH): - os.mkdir(CASE_LIST_PATH) - caselists = os.listdir(CASE_LIST_PATH) - [os.remove(os.path.join(CASE_LIST_PATH,caselist)) for caselist in caselists] - 
CLINICAL_PATH = os.path.join(GENIE_RELEASE_DIR,'data_clinical_%s.txt' % args.genieVersion) - GENE_MATRIX_PATH = os.path.join(GENIE_RELEASE_DIR,"data_gene_matrix_%s.txt" % args.genieVersion) - create_case_lists.create_case_lists(CLINICAL_PATH, GENE_MATRIX_PATH, CASE_LIST_PATH, "genie_private") - caseListFiles = os.listdir(CASE_LIST_PATH) - caseListEntities = [] - for casePath in caseListFiles: - casePath = os.path.join(CASE_LIST_PATH, casePath) - caseListEntities.append(storeFile(syn, casePath, parent=caseListSynId, staging=args.staging, caseLists=True, genieVersion=args.genieVersion)) - - logger.info("REMOVING UNNECESSARY FILES") - genie_files = os.listdir(GENIE_RELEASE_DIR) - #deletePatterns = ('data_clinical_supp_patient_','data_clinical_supp_sample_','data_CNA_','data_mutations_extended_','data_fusions_','genie_private_data_cna_hg19_') - #[os.remove(os.path.join(GENIE_RELEASE_DIR,genieFile)) for genieFile in genie_files if genieFile.startswith(deletePatterns)] - [os.remove(os.path.join(GENIE_RELEASE_DIR,genieFile)) for genieFile in genie_files if args.genieVersion not in genieFile and "meta" not in genieFile and "case_lists" not in genieFile] - os.remove(CLINICAL_PATH) -####### - logger.info("REVISE METADATA FILES") - command_reviseMetadataFiles(syn, args, databaseSynIdMappingDf) - logger.info("CBIO VALIDATION") - #Must be exit 0 because the validator sometimes fails, but we still want to capture the output - command = [cbioValidatorPath,'-s',GENIE_RELEASE_DIR,'-n','; exit 0'] - cbioOutput = subprocess.check_output(" ".join(command), shell=True) - logger.info(cbioOutput.decode("utf-8")) - if not args.test and not args.staging: - with open("cbioValidatorLogsConsortium_%s.txt" % args.genieVersion, "w") as cbioLog: - cbioLog.write(cbioOutput.decode("utf-8")) - syn.store(File("cbioValidatorLogsConsortium_%s.txt" % args.genieVersion, parentId = "syn10155804")) - os.remove("cbioValidatorLogsConsortium_%s.txt" % args.genieVersion) - logger.info("REMOVING OLD FILES") - - process.rmFiles(CASE_LIST_PATH) - if os.path.exists('%s/genie_private_meta_cna_hg19_seg.txt' % GENIE_RELEASE_DIR): - os.unlink('%s/genie_private_meta_cna_hg19_seg.txt' % GENIE_RELEASE_DIR) - logger.info("CREATING LINK VERSION") - createLinkVersion(syn, args.genieVersion, caseListEntities, genePanelEntities, databaseSynIdMappingDf) - - if not args.test and not args.staging: - processTracker = syn.tableQuery("SELECT timeEndProcessing FROM %s where center = 'SAGE' and processingType = 'dbToStage'" % processTrackerSynId) - processTrackerDf = processTracker.asDataFrame() - processTrackerDf['timeEndProcessing'][0] = str(int(time.time()*1000)) - syn.store(synapseclient.Table(processTrackerSynId,processTrackerDf)) - logger.info("COMPLETED DATABASE TO STAGING") +# #No need to run twice anymore +# #stagingToCbio(syn, processingDate, args.genieVersion, CENTER_MAPPING_DF, databaseSynIdMappingDf, oncotree_url=args.oncotreeLink, consortiumReleaseCutOff= args.consortiumReleaseCutOff, filtering=True, current_release_staging=args.staging, test=args.test) +# #Create case lists files +# logger.info("CREATE CASE LIST FILES") +# #Remove old caselists first +# if not os.path.exists(CASE_LIST_PATH): +# os.mkdir(CASE_LIST_PATH) +# caselists = os.listdir(CASE_LIST_PATH) +# [os.remove(os.path.join(CASE_LIST_PATH,caselist)) for caselist in caselists] +# CLINICAL_PATH = os.path.join(GENIE_RELEASE_DIR,'data_clinical_%s.txt' % args.genieVersion) +# GENE_MATRIX_PATH = os.path.join(GENIE_RELEASE_DIR,"data_gene_matrix_%s.txt" % args.genieVersion) +# 
create_case_lists.create_case_lists(CLINICAL_PATH, GENE_MATRIX_PATH, CASE_LIST_PATH, "genie_private") +# caseListFiles = os.listdir(CASE_LIST_PATH) +# caseListEntities = [] +# for casePath in caseListFiles: +# casePath = os.path.join(CASE_LIST_PATH, casePath) +# caseListEntities.append(storeFile(syn, casePath, parent=caseListSynId, staging=args.staging, caseLists=True, genieVersion=args.genieVersion)) + +# logger.info("REMOVING UNNECESSARY FILES") +# genie_files = os.listdir(GENIE_RELEASE_DIR) +# #deletePatterns = ('data_clinical_supp_patient_','data_clinical_supp_sample_','data_CNA_','data_mutations_extended_','data_fusions_','genie_private_data_cna_hg19_') +# #[os.remove(os.path.join(GENIE_RELEASE_DIR,genieFile)) for genieFile in genie_files if genieFile.startswith(deletePatterns)] +# [os.remove(os.path.join(GENIE_RELEASE_DIR,genieFile)) for genieFile in genie_files if args.genieVersion not in genieFile and "meta" not in genieFile and "case_lists" not in genieFile] +# os.remove(CLINICAL_PATH) +# ####### +# logger.info("REVISE METADATA FILES") +# command_reviseMetadataFiles(syn, args, databaseSynIdMappingDf) +# logger.info("CBIO VALIDATION") +# #Must be exit 0 because the validator sometimes fails, but we still want to capture the output +# command = [cbioValidatorPath,'-s',GENIE_RELEASE_DIR,'-n','; exit 0'] +# cbioOutput = subprocess.check_output(" ".join(command), shell=True) +# logger.info(cbioOutput.decode("utf-8")) +# if not args.test and not args.staging: +# with open("cbioValidatorLogsConsortium_%s.txt" % args.genieVersion, "w") as cbioLog: +# cbioLog.write(cbioOutput.decode("utf-8")) +# syn.store(File("cbioValidatorLogsConsortium_%s.txt" % args.genieVersion, parentId = "syn10155804")) +# os.remove("cbioValidatorLogsConsortium_%s.txt" % args.genieVersion) +# logger.info("REMOVING OLD FILES") + +# process.rmFiles(CASE_LIST_PATH) +# if os.path.exists('%s/genie_private_meta_cna_hg19_seg.txt' % GENIE_RELEASE_DIR): +# os.unlink('%s/genie_private_meta_cna_hg19_seg.txt' % GENIE_RELEASE_DIR) +# logger.info("CREATING LINK VERSION") +# createLinkVersion(syn, args.genieVersion, caseListEntities, genePanelEntities, databaseSynIdMappingDf) + +# if not args.test and not args.staging: +# processTracker = syn.tableQuery("SELECT timeEndProcessing FROM %s where center = 'SAGE' and processingType = 'dbToStage'" % processTrackerSynId) +# processTrackerDf = processTracker.asDataFrame() +# processTrackerDf['timeEndProcessing'][0] = str(int(time.time()*1000)) +# syn.store(synapseclient.Table(processTrackerSynId,processTrackerDf)) +# logger.info("COMPLETED DATABASE TO STAGING") if not args.test: logger.info("DASHBOARD UPDATE") From 51f801f66acd9b676fa7848cc0eb977eda687b2c Mon Sep 17 00:00:00 2001 From: thomasyu888 Date: Sun, 3 Feb 2019 21:23:53 -0800 Subject: [PATCH 22/27] Remove comments --- genie/database_to_staging.py | 144 +++++++++++++++++------------------ 1 file changed, 72 insertions(+), 72 deletions(-) diff --git a/genie/database_to_staging.py b/genie/database_to_staging.py index 6ee6e5b7..0dc6ac43 100644 --- a/genie/database_to_staging.py +++ b/genie/database_to_staging.py @@ -740,79 +740,79 @@ def main(): process.checkUrl(args.oncotreeLink) #get syn id of case list folder in consortium release -# caseListSynId = findCaseListId(syn, consortiumSynId) - -# if not args.test and not args.staging: -# processTrackerSynId = databaseSynIdMappingDf['Id'][databaseSynIdMappingDf['Database'] == 'processTracker'].values[0] -# processTracker = syn.tableQuery("SELECT timeStartProcessing FROM %s where center = 
'SAGE' and processingType = 'dbToStage'" % processTrackerSynId) -# processTrackerDf = processTracker.asDataFrame() -# processTrackerDf['timeStartProcessing'][0] = str(int(time.time()*1000)) -# syn.store(synapseclient.Table(processTrackerSynId,processTrackerDf)) - -# syn.table_query_timeout = 50000 -# centerMappingSynId = databaseSynIdMappingDf['Id'][databaseSynIdMappingDf['Database'] == 'centerMapping'].values[0] -# #Only release files where release is true -# CENTER_MAPPING = syn.tableQuery('SELECT * FROM %s where release is true' % centerMappingSynId) -# CENTER_MAPPING_DF = CENTER_MAPPING.asDataFrame() -# processingDate = datetime.datetime.strptime(args.processingDate, '%b-%Y') -# ####### -# cbioValidatorPath = os.path.join(args.cbioportalPath,"core/src/main/scripts/importer/validateData.py") -# assert os.path.exists(cbioValidatorPath), "Please specify correct cbioportalPath" - -# logger.info("STAGING TO CONSORTIUM") -# genePanelEntities = stagingToCbio(syn, processingDate, args.genieVersion, CENTER_MAPPING_DF, databaseSynIdMappingDf, oncotree_url=args.oncotreeLink, consortiumReleaseCutOff= args.consortiumReleaseCutOff, current_release_staging=args.staging, skipMutationsInCis=args.skipMutationsInCis, test=args.test) + caseListSynId = findCaseListId(syn, consortiumSynId) + + if not args.test and not args.staging: + processTrackerSynId = databaseSynIdMappingDf['Id'][databaseSynIdMappingDf['Database'] == 'processTracker'].values[0] + processTracker = syn.tableQuery("SELECT timeStartProcessing FROM %s where center = 'SAGE' and processingType = 'dbToStage'" % processTrackerSynId) + processTrackerDf = processTracker.asDataFrame() + processTrackerDf['timeStartProcessing'][0] = str(int(time.time()*1000)) + syn.store(synapseclient.Table(processTrackerSynId,processTrackerDf)) + + syn.table_query_timeout = 50000 + centerMappingSynId = databaseSynIdMappingDf['Id'][databaseSynIdMappingDf['Database'] == 'centerMapping'].values[0] + #Only release files where release is true + CENTER_MAPPING = syn.tableQuery('SELECT * FROM %s where release is true' % centerMappingSynId) + CENTER_MAPPING_DF = CENTER_MAPPING.asDataFrame() + processingDate = datetime.datetime.strptime(args.processingDate, '%b-%Y') +####### + cbioValidatorPath = os.path.join(args.cbioportalPath,"core/src/main/scripts/importer/validateData.py") + assert os.path.exists(cbioValidatorPath), "Please specify correct cbioportalPath" + + logger.info("STAGING TO CONSORTIUM") + genePanelEntities = stagingToCbio(syn, processingDate, args.genieVersion, CENTER_MAPPING_DF, databaseSynIdMappingDf, oncotree_url=args.oncotreeLink, consortiumReleaseCutOff= args.consortiumReleaseCutOff, current_release_staging=args.staging, skipMutationsInCis=args.skipMutationsInCis, test=args.test) -# #No need to run twice anymore -# #stagingToCbio(syn, processingDate, args.genieVersion, CENTER_MAPPING_DF, databaseSynIdMappingDf, oncotree_url=args.oncotreeLink, consortiumReleaseCutOff= args.consortiumReleaseCutOff, filtering=True, current_release_staging=args.staging, test=args.test) -# #Create case lists files -# logger.info("CREATE CASE LIST FILES") -# #Remove old caselists first -# if not os.path.exists(CASE_LIST_PATH): -# os.mkdir(CASE_LIST_PATH) -# caselists = os.listdir(CASE_LIST_PATH) -# [os.remove(os.path.join(CASE_LIST_PATH,caselist)) for caselist in caselists] -# CLINICAL_PATH = os.path.join(GENIE_RELEASE_DIR,'data_clinical_%s.txt' % args.genieVersion) -# GENE_MATRIX_PATH = os.path.join(GENIE_RELEASE_DIR,"data_gene_matrix_%s.txt" % args.genieVersion) -# 
create_case_lists.create_case_lists(CLINICAL_PATH, GENE_MATRIX_PATH, CASE_LIST_PATH, "genie_private") -# caseListFiles = os.listdir(CASE_LIST_PATH) -# caseListEntities = [] -# for casePath in caseListFiles: -# casePath = os.path.join(CASE_LIST_PATH, casePath) -# caseListEntities.append(storeFile(syn, casePath, parent=caseListSynId, staging=args.staging, caseLists=True, genieVersion=args.genieVersion)) - -# logger.info("REMOVING UNNECESSARY FILES") -# genie_files = os.listdir(GENIE_RELEASE_DIR) -# #deletePatterns = ('data_clinical_supp_patient_','data_clinical_supp_sample_','data_CNA_','data_mutations_extended_','data_fusions_','genie_private_data_cna_hg19_') -# #[os.remove(os.path.join(GENIE_RELEASE_DIR,genieFile)) for genieFile in genie_files if genieFile.startswith(deletePatterns)] -# [os.remove(os.path.join(GENIE_RELEASE_DIR,genieFile)) for genieFile in genie_files if args.genieVersion not in genieFile and "meta" not in genieFile and "case_lists" not in genieFile] -# os.remove(CLINICAL_PATH) -# ####### -# logger.info("REVISE METADATA FILES") -# command_reviseMetadataFiles(syn, args, databaseSynIdMappingDf) -# logger.info("CBIO VALIDATION") -# #Must be exit 0 because the validator sometimes fails, but we still want to capture the output -# command = [cbioValidatorPath,'-s',GENIE_RELEASE_DIR,'-n','; exit 0'] -# cbioOutput = subprocess.check_output(" ".join(command), shell=True) -# logger.info(cbioOutput.decode("utf-8")) -# if not args.test and not args.staging: -# with open("cbioValidatorLogsConsortium_%s.txt" % args.genieVersion, "w") as cbioLog: -# cbioLog.write(cbioOutput.decode("utf-8")) -# syn.store(File("cbioValidatorLogsConsortium_%s.txt" % args.genieVersion, parentId = "syn10155804")) -# os.remove("cbioValidatorLogsConsortium_%s.txt" % args.genieVersion) -# logger.info("REMOVING OLD FILES") - -# process.rmFiles(CASE_LIST_PATH) -# if os.path.exists('%s/genie_private_meta_cna_hg19_seg.txt' % GENIE_RELEASE_DIR): -# os.unlink('%s/genie_private_meta_cna_hg19_seg.txt' % GENIE_RELEASE_DIR) -# logger.info("CREATING LINK VERSION") -# createLinkVersion(syn, args.genieVersion, caseListEntities, genePanelEntities, databaseSynIdMappingDf) - -# if not args.test and not args.staging: -# processTracker = syn.tableQuery("SELECT timeEndProcessing FROM %s where center = 'SAGE' and processingType = 'dbToStage'" % processTrackerSynId) -# processTrackerDf = processTracker.asDataFrame() -# processTrackerDf['timeEndProcessing'][0] = str(int(time.time()*1000)) -# syn.store(synapseclient.Table(processTrackerSynId,processTrackerDf)) -# logger.info("COMPLETED DATABASE TO STAGING") + #No need to run twice anymore + #stagingToCbio(syn, processingDate, args.genieVersion, CENTER_MAPPING_DF, databaseSynIdMappingDf, oncotree_url=args.oncotreeLink, consortiumReleaseCutOff= args.consortiumReleaseCutOff, filtering=True, current_release_staging=args.staging, test=args.test) + #Create case lists files + logger.info("CREATE CASE LIST FILES") + #Remove old caselists first + if not os.path.exists(CASE_LIST_PATH): + os.mkdir(CASE_LIST_PATH) + caselists = os.listdir(CASE_LIST_PATH) + [os.remove(os.path.join(CASE_LIST_PATH,caselist)) for caselist in caselists] + CLINICAL_PATH = os.path.join(GENIE_RELEASE_DIR,'data_clinical_%s.txt' % args.genieVersion) + GENE_MATRIX_PATH = os.path.join(GENIE_RELEASE_DIR,"data_gene_matrix_%s.txt" % args.genieVersion) + create_case_lists.create_case_lists(CLINICAL_PATH, GENE_MATRIX_PATH, CASE_LIST_PATH, "genie_private") + caseListFiles = os.listdir(CASE_LIST_PATH) + caseListEntities = [] + for 
casePath in caseListFiles:
+        casePath = os.path.join(CASE_LIST_PATH, casePath)
+        caseListEntities.append(storeFile(syn, casePath, parent=caseListSynId, staging=args.staging, caseLists=True, genieVersion=args.genieVersion))
+
+    logger.info("REMOVING UNNECESSARY FILES")
+    genie_files = os.listdir(GENIE_RELEASE_DIR)
+    #deletePatterns = ('data_clinical_supp_patient_','data_clinical_supp_sample_','data_CNA_','data_mutations_extended_','data_fusions_','genie_private_data_cna_hg19_')
+    #[os.remove(os.path.join(GENIE_RELEASE_DIR,genieFile)) for genieFile in genie_files if genieFile.startswith(deletePatterns)]
+    [os.remove(os.path.join(GENIE_RELEASE_DIR,genieFile)) for genieFile in genie_files if args.genieVersion not in genieFile and "meta" not in genieFile and "case_lists" not in genieFile]
+    os.remove(CLINICAL_PATH)
+#######
+    logger.info("REVISE METADATA FILES")
+    command_reviseMetadataFiles(syn, args, databaseSynIdMappingDf)
+    logger.info("CBIO VALIDATION")
+    #Must be exit 0 because the validator sometimes fails, but we still want to capture the output
+    command = [cbioValidatorPath,'-s',GENIE_RELEASE_DIR,'-n','; exit 0']
+    cbioOutput = subprocess.check_output(" ".join(command), shell=True)
+    logger.info(cbioOutput.decode("utf-8"))
+    if not args.test and not args.staging:
+        with open("cbioValidatorLogsConsortium_%s.txt" % args.genieVersion, "w") as cbioLog:
+            cbioLog.write(cbioOutput.decode("utf-8"))
+        syn.store(File("cbioValidatorLogsConsortium_%s.txt" % args.genieVersion, parentId = "syn10155804"))
+        os.remove("cbioValidatorLogsConsortium_%s.txt" % args.genieVersion)
+    logger.info("REMOVING OLD FILES")
+
+    process.rmFiles(CASE_LIST_PATH)
+    if os.path.exists('%s/genie_private_meta_cna_hg19_seg.txt' % GENIE_RELEASE_DIR):
+        os.unlink('%s/genie_private_meta_cna_hg19_seg.txt' % GENIE_RELEASE_DIR)
+    logger.info("CREATING LINK VERSION")
+    createLinkVersion(syn, args.genieVersion, caseListEntities, genePanelEntities, databaseSynIdMappingDf)
+
+    if not args.test and not args.staging:
+        processTracker = syn.tableQuery("SELECT timeEndProcessing FROM %s where center = 'SAGE' and processingType = 'dbToStage'" % processTrackerSynId)
+        processTrackerDf = processTracker.asDataFrame()
+        processTrackerDf['timeEndProcessing'][0] = str(int(time.time()*1000))
+        syn.store(synapseclient.Table(processTrackerSynId,processTrackerDf))
+    logger.info("COMPLETED DATABASE TO STAGING")
 
     if not args.test:
         logger.info("DASHBOARD UPDATE")

From fc24af5894511f02ce406fdac0901582ae6a7501 Mon Sep 17 00:00:00 2001
From: thomasyu888
Date: Sun, 3 Feb 2019 22:35:54 -0800
Subject: [PATCH 23/27] Make additions

---
 genie/dashboard_table_updater.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/genie/dashboard_table_updater.py b/genie/dashboard_table_updater.py
index 285b0e6d..bb2ecb22 100644
--- a/genie/dashboard_table_updater.py
+++ b/genie/dashboard_table_updater.py
@@ -421,9 +421,14 @@ def check_column_decreases(currentdf, olderdf):
         new_counts = currentdf[col].value_counts()
         if olderdf.get(col) is not None:
             old_counts = olderdf[col].value_counts()
+            #Make sure any values that exist in the new counts but not the old get added to the old (as zeros) to show the decrease
             new_keys = pd.Series(index=new_counts.keys()[~new_counts.keys().isin(old_counts.keys())])
             old_counts = old_counts.add(new_keys,fill_value=0)
             old_counts.fillna(0,inplace=True)
+            #Make sure any values that exist in the old counts but not the new get added to the new (as zeros) to show the decrease
+            new_keys = pd.Series(index=old_counts.keys()[~old_counts.keys().isin(new_counts.keys())])
+            new_counts = new_counts.add(new_keys,fill_value=0)
+            new_counts.fillna(0,inplace=True)
             if any(new_counts - old_counts < 0):
                 logger.info("\tDECREASE IN COLUMN: %s" % col)
                 diff = new_counts[new_counts - old_counts < 0]
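The bookkeeping in PATCH 23 is effectively an outer join of the two value_counts() indexes: any value seen in only one release gets a zero count in the other, so disappearances show up as negative differences. A minimal sketch of the same idea using pandas' built-in align(); the current/older series here are invented stand-ins for the real clinical columns:

    import pandas as pd

    # Invented values for one clinical column in the current and previous release
    current = pd.Series(["LUAD", "LUAD", "BRCA"])
    older = pd.Series(["LUAD", "BRCA", "BRCA", "GBM"])

    # align() outer-joins the two count indexes; fill_value=0 plays the same role
    # as the manual new_keys/fillna bookkeeping in check_column_decreases
    new_counts, old_counts = current.value_counts().align(older.value_counts(), fill_value=0)

    decreased = (new_counts - old_counts) < 0
    print(new_counts[decreased])  # BRCA dropped to 1 and GBM dropped to 0

Series.align with fill_value=0 collapses the two explicit reindexing steps into one call; the patch's hand-rolled version does the same union-of-values alignment.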
From 4aec88bef16d8bf2d3757fa60aca75a6a702226b Mon Sep 17 00:00:00 2001
From: thomasyu888
Date: Sun, 3 Feb 2019 22:47:08 -0800
Subject: [PATCH 24/27] Add more print

---
 genie/dashboard_table_updater.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/genie/dashboard_table_updater.py b/genie/dashboard_table_updater.py
index bb2ecb22..37aeda2a 100644
--- a/genie/dashboard_table_updater.py
+++ b/genie/dashboard_table_updater.py
@@ -432,6 +432,8 @@ def check_column_decreases(currentdf, olderdf):
             if any(new_counts - old_counts < 0):
                 logger.info("\tDECREASE IN COLUMN: %s" % col)
                 diff = new_counts[new_counts - old_counts < 0]
+                diffs = new_counts-old_counts
+                logger.info("\t" + ",".join(diffs[diffs<0].index))
                 diff_map[col] = True
             else:
                 diff_map[col] = False

From 1fcdc029e6c3b6d36a4aeb9b0521b99dff3c8583 Mon Sep 17 00:00:00 2001
From: thomasyu888
Date: Sun, 3 Feb 2019 22:49:24 -0800
Subject: [PATCH 25/27] Change index to string so it can be joined

---
 genie/dashboard_table_updater.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/genie/dashboard_table_updater.py b/genie/dashboard_table_updater.py
index 37aeda2a..15fba6a0 100644
--- a/genie/dashboard_table_updater.py
+++ b/genie/dashboard_table_updater.py
@@ -433,7 +433,7 @@ def check_column_decreases(currentdf, olderdf):
                 logger.info("\tDECREASE IN COLUMN: %s" % col)
                 diff = new_counts[new_counts - old_counts < 0]
                 diffs = new_counts-old_counts
-                logger.info("\t" + ",".join(diffs[diffs<0].index))
+                logger.info("\t" + ",".join(diffs[diffs<0].index.astype(str)))
                 diff_map[col] = True
             else:
                 diff_map[col] = False

From f0c272a22882174e143efd76dfdf4d52b3c2a902 Mon Sep 17 00:00:00 2001
From: thomasyu888
Date: Mon, 4 Feb 2019 11:51:03 -0800
Subject: [PATCH 26/27] Should compare samples that existed in the previous release

---
 genie/dashboard_table_updater.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/genie/dashboard_table_updater.py b/genie/dashboard_table_updater.py
index 15fba6a0..610ec75a 100644
--- a/genie/dashboard_table_updater.py
+++ b/genie/dashboard_table_updater.py
@@ -470,7 +470,8 @@ def print_clinical_values_difference_table(syn, database_mappingdf):
     older_sampledf = pd.read_csv(older_sample_ent.path,sep="\t",comment="#")
     older_sampledf['CENTER'] = [patient.split("-")[1] for patient in older_sampledf['PATIENT_ID']]
 
-    current_sampledf = current_sampledf[current_sampledf['CENTER'].isin(older_sampledf['CENTER'].unique())]
+    #Rather than take the CENTER, must take the SAMPLE_ID to compare
+    current_sampledf = current_sampledf[current_sampledf['SAMPLE_ID'].isin(older_sampledf['SAMPLE_ID'].unique())]
 
     logger.info("SAMPLE CLINICAL VALUE DECREASES")
     center_decrease_mapping = dict()
@@ -484,7 +484,8 @@
     older_patient_ent = syn.get(older_clinical_synids['data_clinical_patient.txt'], followLink=True)
     current_patientdf = pd.read_csv(current_patient_ent.path,sep="\t",comment="#")
     older_patientdf = pd.read_csv(older_patient_ent.path,sep="\t",comment="#")
-    current_patientdf = current_patientdf[current_patientdf['CENTER'].isin(older_patientdf['CENTER'].unique())]
+    #Rather than take the CENTER, must take the PATIENT_ID to compare
+    current_patientdf = current_patientdf[current_patientdf['PATIENT_ID'].isin(older_patientdf['PATIENT_ID'].unique())]
     logger.info("PATIENT CLINICAL VALUE DECREASES")
 
     for center in older_patientdf['CENTER'].unique():
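PATCH 26 changes the cohort, not the counting: the current release is first restricted with isin() to the SAMPLE_IDs (and PATIENT_IDs) that already existed in the previous release, so records added since then cannot mask a real decrease. A small sketch under invented toy frames; only the SAMPLE_ID and ONCOTREE_CODE columns are assumed here:

    import pandas as pd

    # Invented release snapshots
    current_sampledf = pd.DataFrame({
        "SAMPLE_ID": ["S1", "S2", "S3"],          # S3 is new this release
        "ONCOTREE_CODE": ["LUAD", "BRCA", "GBM"]})
    older_sampledf = pd.DataFrame({
        "SAMPLE_ID": ["S1", "S2"],
        "ONCOTREE_CODE": ["LUAD", "BRCA"]})

    # Same isin() restriction as the patch: keep only samples that existed in
    # the previous release, so value counts are taken over a comparable cohort
    in_both = current_sampledf["SAMPLE_ID"].isin(older_sampledf["SAMPLE_ID"].unique())
    comparable = current_sampledf[in_both]
    print(comparable)  # S1 and S2 only; S3 is excluded from the comparison

Filtering on CENTER, as before the patch, only dropped whole centers absent from the prior release; filtering on the record identifier aligns the two releases row for row.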
logger.info("PATIENT CLINICAL VALUE DECREASES") for center in older_patientdf['CENTER'].unique(): From 52b41a063f96f5356e25b1510de099e3f7b2fb50 Mon Sep 17 00:00:00 2001 From: thomasyu888 Date: Mon, 4 Feb 2019 12:54:21 -0800 Subject: [PATCH 27/27] Add values of diff --- genie/dashboard_table_updater.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/genie/dashboard_table_updater.py b/genie/dashboard_table_updater.py index 610ec75a..9dba3fb9 100644 --- a/genie/dashboard_table_updater.py +++ b/genie/dashboard_table_updater.py @@ -433,7 +433,7 @@ def check_column_decreases(currentdf, olderdf): logger.info("\tDECREASE IN COLUMN: %s" % col) diff = new_counts[new_counts - old_counts < 0] diffs = new_counts-old_counts - logger.info("\t" + ",".join(diffs[diffs<0].index.astype(str))) + logger.info("\t" + diffs[diffs<0].to_csv().replace("\n","; ")) diff_map[col] = True else: diff_map[col] = False