Commit

Merge pull request #65 from Sage-Bionetworks/maf
Change t_depth to STRING, add in removal of floats for maf processing, exclude panel
thomasyu888 authored Feb 4, 2019
2 parents a209ae5 + 52b41a0 commit 9e886be
Showing 7 changed files with 82 additions and 35 deletions.
37 changes: 30 additions & 7 deletions analyses/genomicData/MAFinBED.R
@@ -4,8 +4,8 @@ library(synapser)
library(VariantAnnotation)

args = commandArgs(trailingOnly=TRUE)
if (length(args) != 1) {
stop("Must supply a boolean value")
if (length(args) != 2) {
stop("Must supply a boolean value and a filepath to write variants not in bed")
}
# SAGE login
tryCatch({
@@ -37,6 +37,10 @@ sampleData$AGE_AT_SEQ_REPORT_NUMERICAL <- NULL
patientData$BIRTH_YEAR_NUMERICAL <- NULL
patientData$CENTER <- NULL
genieClinData <- merge.data.frame(patientData, sampleData, by="PATIENT_ID")

#EXCLUDE PHS-TRISEQ-V1 SAMPLES
genieClinData <- genieClinData[genieClinData$SEQ_ASSAY_ID != "PHS-TRISEQ-V1",]

# read aggregated BED file data
genieBed = synTableQuery(sprintf('SELECT * FROM %s', bedSynId),includeRowIdAndRowVersion=F)
genieBedData = synapser::as.data.frame(genieBed)
@@ -52,7 +56,8 @@ print(nrow(genieMutData))


originalCols = colnames(genieMutData)
# records with count data that preclude a VAF estimate - set VAF to 100% (1/1, alt/depth)
genieMutData$t_depth <- as.numeric(genieMutData$t_depth)
noVAF.idx = which((genieMutData$t_depth==0)|is.na(genieMutData$t_depth))
#keeps the order if factors exist
#genieMutData$t_alt_count_num = as.numeric(levels(genieMutData$t_alt_count))[genieMutData$t_alt_count]
@@ -107,10 +112,28 @@ for (panelName in seq_assays) {
}
genieMutData$t_depth_new <- NULL
genieMutData$t_alt_count_num <- NULL
# Commented out below because of PFLM-4975
#Compare old inBED with new inBED column
#If there are differences, update only the diffs
updateMutData = genieMutData[genieMutData$inBED != oldInBed,]
# updateMutData = genieMutData[genieMutData$inBED != oldInBed,]
# if (nrow(updateMutData) > 0) {
# #write.csv(updateMutData[c("ROW_ID","ROW_VERSION","inBED")],"update_inbed.csv",row.names = F)
# #Need to chunk the upload

if (nrow(updateMutData) > 0) {
synStore(Table(mafSynId, updateMutData[c("ROW_ID","ROW_VERSION","inBED")]))
}
# #it is an amazon problem, where amazon kills the connection while reading the csv from s3.
# chunk = 100000
# rows = 0
# while (rows*chunk < nrow(updateMutData)) {
# if ((rows + 1)* chunk > nrow(updateMutData)) {
# to_update = updateMutData[((rows*chunk)+1):nrow(updateMutData),c("ROW_ID","ROW_VERSION","inBED")]
# } else{
# to_update = updateMutData[((rows*chunk)+1):((rows+1)*chunk),c("ROW_ID","ROW_VERSION","inBED")]
# }
# synStore(Table(mafSynId, to_update))
# rows = rows+1
# }
# #synStore(Table(mafSynId, updateMutData[c("ROW_ID","ROW_VERSION","inBED")]))
# #synStore(Table(mafSynId, "update_inbed.csv"))
# }
updateMutData = genieMutData[genieMutData$inBED == FALSE,c('Chromosome', 'Start_Position', 'End_Position', 'Reference_Allele', 'Tumor_Seq_Allele2', 'Tumor_Sample_Barcode', 'Center')]
write.csv(updateMutData,args[2],row.names = F)
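
The chunked-upload logic kept (commented out) above splits the inBED updates into 100,000-row batches because a single large upload was being dropped mid-transfer on the S3 side. A minimal Python sketch of the same idea with synapseclient, assuming a hypothetical table ID and an update data frame with the three columns used above:

```python
# Hypothetical sketch of the chunked-update workaround described in the
# commented-out R block above, using the Python synapseclient instead of
# synapser. The table ID, data frame, and chunk size are illustrative.
import synapseclient
from synapseclient import Table


def store_inbed_updates_in_chunks(syn, maf_syn_id, update_df, chunk_size=100000):
    """Push ROW_ID/ROW_VERSION/inBED updates in fixed-size batches so one
    huge upload cannot be killed mid-transfer."""
    cols = ["ROW_ID", "ROW_VERSION", "inBED"]
    for start in range(0, len(update_df), chunk_size):
        chunk = update_df.iloc[start:start + chunk_size][cols]
        syn.store(Table(maf_syn_id, chunk))


# usage (assumes cached Synapse credentials and an existing update_df):
# syn = synapseclient.login()
# store_inbed_updates_in_chunks(syn, "syn12345", update_df)
```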
1 change: 1 addition & 0 deletions analyses/mergeFlag/mergeCheck.R
@@ -127,6 +127,7 @@ for (center in centers) {
}

# records with count data that preclude a VAF estimate - set VAF to 100% (1/1, alt/depth)
genieMutData$t_depth <- as.numeric(genieMutData$t_depth)
noVAF.idx = which((genieMutData$t_depth==0)|is.na(genieMutData$t_depth))
genieMutData$t_alt_count_num = as.numeric(levels(genieMutData$t_alt_count))[genieMutData$t_alt_count]
genieMutData$t_alt_count_num[noVAF.idx] = 1
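
Since t_depth is now a STRING column (per the commit message), both R scripts coerce it to numeric before deciding which records preclude a VAF estimate. A hedged pandas sketch of the same coercion, with an illustrative data frame:

```python
# Pandas sketch of the coercion added above: t_depth arrives as text, so it is
# converted to numeric before flagging records that cannot support a VAF
# estimate. The data frame and the 1/1 fallback mirror the R logic; values are
# illustrative.
import pandas as pd

maf = pd.DataFrame({"t_alt_count": ["10", "0", "NA"],
                    "t_depth": ["100", "0", ""]})

maf["t_depth"] = pd.to_numeric(maf["t_depth"], errors="coerce")
maf["t_alt_count"] = pd.to_numeric(maf["t_alt_count"], errors="coerce")

# depth of 0 or missing precludes a VAF estimate -> treat as 1/1 (alt/depth)
no_vaf = (maf["t_depth"] == 0) | maf["t_depth"].isna()
maf.loc[no_vaf, ["t_alt_count", "t_depth"]] = 1

maf["vaf"] = maf["t_alt_count"] / maf["t_depth"]
```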
8 changes: 6 additions & 2 deletions genie/dashboardTemplate.Rmd
@@ -104,7 +104,11 @@ plotPrimarySites <- function(clinical, oncotreeLink, release) {
oncotree_json = data$TISSUE
oncotreeDict = extract(oncotree_json, "", "")
clinical$PRIMARY_CODES <- unlist(sapply(clinical$ONCOTREE_CODE, function(code) {
oncotreeDict[[toupper(code)]]["ONCOTREE_PRIMARY_NODE"]
if (toupper(code) %in% names(oncotreeDict)){
oncotreeDict[[toupper(code)]]["ONCOTREE_PRIMARY_NODE"]
} else {
"DEPRECATED_CODE"
}
}))
})
clinical$CENTER = createCenterColumn(clinical)
@@ -288,7 +292,7 @@ kable(samplesPerReleaseDf,row.names = F)
#primary site distribution
par(mar=c(10,3,3,1))
barplot(sort(log(table(this_mut$FILTER)),decreasing = T), main="Log Distribution of Mutation FILTERs",las=2)
plotPrimarySites(this_samples, "http://oncotree.mskcc.org/api/tumorTypes/tree?version=oncotree_2017_06_21","%s")
plotPrimarySites(this_samples, "http://oncotree.mskcc.org/api/tumorTypes/tree?version=oncotree_2018_06_01","%s")
#Center X Race
plotCenterXRace(this_patient)
#Center X Ethnicity
15 changes: 12 additions & 3 deletions genie/dashboard_table_updater.py
@@ -256,7 +256,7 @@ def update_oncotree_code_tables(syn, database_mappingdf):
oncotree_link_ent = syn.get(oncotree_link_synid)
oncotree_link = oncotree_link_ent.externalURL
oncotree_mapping = genie.process_functions.get_oncotree_code_mappings(oncotree_link)
clinicaldf['PRIMARY_CODES'] = [oncotree_mapping[i.upper()]['ONCOTREE_PRIMARY_NODE'] for i in clinicaldf.ONCOTREE_CODE]
clinicaldf['PRIMARY_CODES'] = [oncotree_mapping[i.upper()]['ONCOTREE_PRIMARY_NODE'] if i.upper() in oncotree_mapping.keys() else 'DEPRECATED_CODE' for i in clinicaldf.ONCOTREE_CODE]

# ### DISTRIBUTION OF PRIMARY ONCOTREE CODE TABLE UPDATE
primary_code_distributiondf = pd.DataFrame(columns=set(clinicaldf['CENTER']), index=set(clinicaldf['PRIMARY_CODES']))
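
Both the dashboard Rmd above and this updater now fall back to a "DEPRECATED_CODE" label when an ONCOTREE_CODE is missing from the current oncotree mapping, instead of raising a lookup error. A compact sketch of the fallback, with a made-up mapping:

```python
# Minimal sketch of the DEPRECATED_CODE fallback: codes absent from the current
# oncotree mapping get a label rather than a KeyError. Mapping and codes are
# made-up examples.
oncotree_mapping = {"LUAD": {"ONCOTREE_PRIMARY_NODE": "LUNG"}}
codes = ["LUAD", "luad", "OLDCODE"]

primary_codes = [
    oncotree_mapping[c.upper()]["ONCOTREE_PRIMARY_NODE"]
    if c.upper() in oncotree_mapping else "DEPRECATED_CODE"
    for c in codes
]
assert primary_codes == ["LUNG", "LUNG", "DEPRECATED_CODE"]
```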
@@ -421,12 +421,19 @@ def check_column_decreases(currentdf, olderdf):
new_counts = currentdf[col].value_counts()
if olderdf.get(col) is not None:
old_counts = olderdf[col].value_counts()
#Make sure any values that exist in the new get added to the old to show the decrease
new_keys = pd.Series(index=new_counts.keys()[~new_counts.keys().isin(old_counts.keys())])
old_counts = old_counts.add(new_keys,fill_value=0)
old_counts.fillna(0,inplace=True)
#Make sure any values that don't exist in the old get added to show the decrease
new_keys = pd.Series(index=old_counts.keys()[~old_counts.keys().isin(new_counts.keys())])
new_counts = new_counts.add(new_keys,fill_value=0)
new_counts.fillna(0,inplace=True)
if any(new_counts - old_counts < 0):
logger.info("\tDECREASE IN COLUMN: %s" % col)
diff = new_counts[new_counts - old_counts < 0]
diffs = new_counts-old_counts
logger.info("\t" + diffs[diffs<0].to_csv().replace("\n","; "))
diff_map[col] = True
else:
diff_map[col] = False
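
The added lines pad each value_counts Series with whatever keys the other one is missing, so the subtraction compares every category, including values that vanished entirely. The same alignment can be expressed with subtract(..., fill_value=0), as in this sketch with illustrative counts:

```python
# Sketch of the decrease check with explicit alignment: subtract(..., fill_value=0)
# aligns both Series on the union of their indexes, which is what the manual
# "add missing keys" lines above accomplish. Counts are illustrative.
import pandas as pd

old_counts = pd.Series({"LUAD": 10, "BRCA": 5})
new_counts = pd.Series({"LUAD": 8, "COAD": 2})  # BRCA disappeared, COAD is new

diffs = new_counts.subtract(old_counts, fill_value=0)
decreases = diffs[diffs < 0]
if not decreases.empty:
    # one "value,delta" pair per decreased category, on a single log line
    print(decreases.to_csv(header=False).replace("\n", "; "))
```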
@@ -463,7 +470,8 @@ def print_clinical_values_difference_table(syn, database_mappingdf):

older_sampledf = pd.read_csv(older_sample_ent.path,sep="\t",comment="#")
older_sampledf['CENTER'] = [patient.split("-")[1] for patient in older_sampledf['PATIENT_ID']]
current_sampledf = current_sampledf[current_sampledf['CENTER'].isin(older_sampledf['CENTER'].unique())]
#Rather than take the CENTER, must take the SAMPLE_ID to compare
current_sampledf = current_sampledf[current_sampledf['SAMPLE_ID'].isin(older_sampledf['SAMPLE_ID'].unique())]

logger.info("SAMPLE CLINICAL VALUE DECREASES")
center_decrease_mapping = dict()
@@ -477,7 +485,8 @@ def print_clinical_values_difference_table(syn, database_mappingdf):
older_patient_ent = syn.get(older_clinical_synids['data_clinical_patient.txt'], followLink=True)
current_patientdf = pd.read_csv(current_patient_ent.path,sep="\t",comment="#")
older_patientdf = pd.read_csv(older_patient_ent.path,sep="\t",comment="#")
current_patientdf = current_patientdf[current_patientdf['CENTER'].isin(older_patientdf['CENTER'].unique())]
#Rather than take the CENTER, must take the PATIENT_ID to compare
current_patientdf = current_patientdf[current_patientdf['PATIENT_ID'].isin(older_patientdf['PATIENT_ID'].unique())]

logger.info("PATIENT CLINICAL VALUE DECREASES")
for center in older_patientdf['CENTER'].unique():
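
These two edits switch the comparison from "same CENTER" to "same SAMPLE_ID / PATIENT_ID", so only records that actually exist in the older release can register as a decrease. A small sketch of the filter, with illustrative IDs:

```python
# Sketch of restricting the current release to records also present in the
# older release before looking for decreases. IDs are illustrative.
import pandas as pd

older = pd.DataFrame({"SAMPLE_ID": ["GENIE-A-1", "GENIE-A-2"]})
current = pd.DataFrame({"SAMPLE_ID": ["GENIE-A-1", "GENIE-A-2", "GENIE-A-3"]})

# GENIE-A-3 is new in this release, so it cannot represent a decrease
comparable = current[current["SAMPLE_ID"].isin(older["SAMPLE_ID"].unique())]
```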
28 changes: 18 additions & 10 deletions genie/database_to_staging.py
@@ -129,7 +129,8 @@ def configureMafRow(rowArray, headers, keepSamples, remove_variants):
seq = str(rowArray[headers.index('Tumor_Seq_Allele2')])
sampleId = str(rowArray[headers.index('Tumor_Sample_Barcode')])
variant = chrom +' '+ start+ ' '+end +' '+ref + ' '+ seq+ ' ' + sampleId
if pd.Series(sampleId).isin(keepSamples).any() and not pd.Series(variant).isin(remove_variants).any():
#if pd.Series(sampleId).isin(keepSamples).any() and not pd.Series(variant).isin(remove_variants).any():
if sampleId in keepSamples.tolist() and not variant in remove_variants.tolist():
fillnas = ['t_depth','t_ref_count','t_alt_count','n_depth','n_ref_count','n_alt_count']
for i in fillnas:
#mutationsDf[i] = mutationsDf[i].fillna("NA")
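
The rewritten condition replaces two per-row pandas isin() calls with plain membership tests. Since configureMafRow runs once per MAF row, converting the lookups to sets up front keeps each check constant-time; a hedged sketch (the variable names stand in for the function's parameters):

```python
# Hedged sketch of the same membership test with set lookups; keep_samples and
# remove_variants stand in for the pandas Series passed to configureMafRow.
keep_samples = {"GENIE-A-1-1", "GENIE-A-2-1"}      # e.g. set(keepSamples.tolist())
remove_variants = {"1 100 100 A T GENIE-A-2-1"}    # e.g. set(remove_variants.tolist())

sample_id = "GENIE-A-1-1"
variant = "1 100 100 A T GENIE-A-1-1"

if sample_id in keep_samples and variant not in remove_variants:
    pass  # keep this MAF row
```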
@@ -151,12 +152,15 @@ def configureMafRow(rowArray, headers, keepSamples, remove_variants):
#Run MAF in BED script, filter data and update MAFinBED database
def runMAFinBED(syn, CENTER_MAPPING_DF, databaseSynIdMappingDf, test=False, genieVersion="test"):
MAFinBED_script = os.path.join(os.path.dirname(os.path.abspath(__file__)),'../analyses/genomicData/MAFinBED.R')
command = ['Rscript', MAFinBED_script, str(test)]
notinbed_variant_file = os.path.join(os.path.dirname(os.path.abspath(__file__)),'../analyses/genomicData/notinbed.csv')

command = ['Rscript', MAFinBED_script, str(test), notinbed_variant_file]
subprocess.check_call(command)

mutationSynId = databaseSynIdMappingDf['Id'][databaseSynIdMappingDf['Database'] == "vcf2maf"][0]
removedVariants = syn.tableQuery("select Chromosome, Start_Position, End_Position, Reference_Allele, Tumor_Seq_Allele2, Tumor_Sample_Barcode, Center from %s where inBED is False and Center in ('%s')" % (mutationSynId,"','".join(CENTER_MAPPING_DF.center)))
removedVariantsDf = removedVariants.asDataFrame()
# mutationSynId = databaseSynIdMappingDf['Id'][databaseSynIdMappingDf['Database'] == "vcf2maf"][0]
# removedVariants = syn.tableQuery("select Chromosome, Start_Position, End_Position, Reference_Allele, Tumor_Seq_Allele2, Tumor_Sample_Barcode, Center from %s where inBED is False and Center in ('%s')" % (mutationSynId,"','".join(CENTER_MAPPING_DF.center)))
# removedVariantsDf = removedVariants.asDataFrame()
removedVariantsDf = pd.read_csv(notinbed_variant_file)
removedVariantsDf['removeVariants'] = removedVariantsDf['Chromosome'].astype(str) + ' ' + removedVariantsDf['Start_Position'].astype(str) + ' ' + removedVariantsDf['End_Position'].astype(str) + ' ' + removedVariantsDf['Reference_Allele'].astype(str) + ' ' + removedVariantsDf['Tumor_Seq_Allele2'].astype(str) + ' ' + removedVariantsDf['Tumor_Sample_Barcode'].astype(str)
#Store filtered variants
for center in removedVariantsDf['Center'].unique():
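
runMAFinBED now hands MAFinBED.R a second argument, the path of a CSV to fill with variants that fall outside the BED regions, and reads that file back instead of re-querying the Synapse table. A minimal sketch of the handoff, with illustrative paths:

```python
# Minimal sketch of the new R -> Python handoff: the R script writes variants
# not in the BED regions to a CSV, and Python reads them back from disk.
# Paths are illustrative.
import os
import subprocess
import pandas as pd

script_dir = os.path.dirname(os.path.abspath(__file__))
mafinbed_script = os.path.join(script_dir, "../analyses/genomicData/MAFinBED.R")
notinbed_csv = os.path.join(script_dir, "../analyses/genomicData/notinbed.csv")

# str(False) is the existing boolean test flag; the CSV path is the new argument
subprocess.check_call(["Rscript", mafinbed_script, str(False), notinbed_csv])
removed_variants_df = pd.read_csv(notinbed_csv)
```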
@@ -242,6 +246,8 @@ def stagingToCbio(syn, processingDate, genieVersion, CENTER_MAPPING_DF, database
totalSample = ['PATIENT_ID']
totalSample.extend(sampleCols)
sampleCols = totalSample
#Make sure to only grab samples that have patient information
sampleDf = sampleDf[sampleDf['PATIENT_ID'].isin(patientDf['PATIENT_ID'])]
clinicalDf = sampleDf.merge(patientDf, on="PATIENT_ID",how="outer")
#Remove patients without any sample or patient ids
clinicalDf = clinicalDf[~clinicalDf['SAMPLE_ID'].isnull()]
@@ -280,10 +286,10 @@ def stagingToCbio(syn, processingDate, genieVersion, CENTER_MAPPING_DF, database
'CANCER_TYPE_DETAILED': 'UNKNOWN',
'ONCOTREE_PRIMARY_NODE': 'UNKNOWN',
'ONCOTREE_SECONDARY_NODE': 'UNKNOWN'}
clinicalDf['CANCER_TYPE'] = [oncotreeDict[code.upper()].get("CANCER_TYPE",float('nan')) for code in clinicalDf['ONCOTREE_CODE']]
clinicalDf['CANCER_TYPE_DETAILED'] = [oncotreeDict[code.upper()].get("CANCER_TYPE_DETAILED",float('nan')) for code in clinicalDf['ONCOTREE_CODE']]
clinicalDf['ONCOTREE_PRIMARY_NODE'] = [oncotreeDict[code.upper()].get("ONCOTREE_PRIMARY_NODE",float('nan')) for code in clinicalDf['ONCOTREE_CODE']]
clinicalDf['ONCOTREE_SECONDARY_NODE'] = [oncotreeDict[code.upper()].get("ONCOTREE_SECONDARY_NODE",float('nan')) for code in clinicalDf['ONCOTREE_CODE']]
clinicalDf['CANCER_TYPE'] = [oncotreeDict[code.upper()]["CANCER_TYPE"] if code.upper() in oncotreeDict.keys() else float('nan') for code in clinicalDf['ONCOTREE_CODE']]
clinicalDf['CANCER_TYPE_DETAILED'] = [oncotreeDict[code.upper()]["CANCER_TYPE_DETAILED"] if code.upper() in oncotreeDict.keys() else float('nan') for code in clinicalDf['ONCOTREE_CODE']]
clinicalDf['ONCOTREE_PRIMARY_NODE'] = [oncotreeDict[code.upper()]["ONCOTREE_PRIMARY_NODE"] if code.upper() in oncotreeDict.keys() else float('nan') for code in clinicalDf['ONCOTREE_CODE']]
clinicalDf['ONCOTREE_SECONDARY_NODE'] = [oncotreeDict[code.upper()]["ONCOTREE_SECONDARY_NODE"] if code.upper() in oncotreeDict.keys() else float('nan') for code in clinicalDf['ONCOTREE_CODE']]

#CANCER TYPES are added which is why the clinical file is written out.
#clinicalDf.to_csv(CLINCICAL_PATH, sep="\t", index=False)
@@ -301,9 +307,11 @@ def stagingToCbio(syn, processingDate, genieVersion, CENTER_MAPPING_DF, database
clinicalDf['AGE_AT_SEQ_REPORT'][clinicalDf['AGE_AT_SEQ_REPORT'] == "<6570"] = "<18"

############################################################
#CENTER SPECIFIC CODE FOR RIGHT NOW (REMOVE UHN-555-V1)
#CENTER SPECIFIC CODE FOR RIGHT NOW (REMOVE UHN-555-V1, PHS-TRISEQ-V1)
############################################################
clinicalDf = clinicalDf[clinicalDf['SEQ_ASSAY_ID'] != "UHN-555-V1"]
clinicalDf = clinicalDf[clinicalDf['SEQ_ASSAY_ID'] != "PHS-TRISEQ-V1"]

#clinicalDf = clinicalDf[clinicalDf['CENTER'] != "WAKE"]
#clinicalDf = clinicalDf[clinicalDf['CENTER'] != "CRUK"]
############################################################
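
The sample/patient guard added at the top of this hunk drops samples whose patient record is missing before the outer merge, and the existing null check then drops patients without samples. A sketch of that interaction with illustrative data frames:

```python
# Sketch of the merge guard added above: samples without a patient record are
# removed before the outer merge; rows with no SAMPLE_ID (patients without
# samples) are removed after it. Data frames are illustrative.
import pandas as pd

patient_df = pd.DataFrame({"PATIENT_ID": ["P1", "P2"]})
sample_df = pd.DataFrame({"PATIENT_ID": ["P1", "P3"], "SAMPLE_ID": ["S1", "S3"]})

sample_df = sample_df[sample_df["PATIENT_ID"].isin(patient_df["PATIENT_ID"])]
clinical_df = sample_df.merge(patient_df, on="PATIENT_ID", how="outer")
clinical_df = clinical_df[~clinical_df["SAMPLE_ID"].isnull()]
# only P1/S1 survives: S3 had no patient record, P2 had no sample
```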
5 changes: 3 additions & 2 deletions genie/maf.py
@@ -36,6 +36,7 @@ def createFinalMaf(self, mafDf, filePath, maf=False):
mafSet = mafDf.to_csv(sep="\t", index=False, header=None)
writeOrAppend = "w" if maf else "a"
with open(filePath, writeOrAppend) as maf:
mafSet = process_functions.removeStringFloat(mafSet)
maf.write(mafSet)

#There is a isNarrow option, but note that the number of rows of the maf file
@@ -58,7 +59,7 @@ def process_helper(self, filePath, path_to_GENIE, mafSynId, centerMafSynId,
narrowMafColumns = [col['name'] for col in self.syn.getTableColumns(mafSynId) if col['name'] != 'inBED']
#Strips out windows indentations \r
command = ['dos2unix',filePath]
subprocess.call(command)
subprocess.check_call(command)
tempdir = os.path.join(path_to_GENIE, self.center)
commandCall = ["perl",os.path.join(vcf2mafPath,"maf2maf.pl"),
"--input-maf",filePath,
@@ -71,7 +72,7 @@ def process_helper(self, filePath, path_to_GENIE, mafSynId, centerMafSynId,
"--custom-enst", os.path.join(vcf2mafPath,"data/isoform_overrides_uniprot")]
if reference is not None:
commandCall.extend(["--ref-fasta",reference])
maf = subprocess.call(commandCall)
maf = subprocess.check_call(commandCall)

process_functions.rmFiles(tempdir, recursive=False)
open(narrowMafPath,"w").close()
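
createFinalMaf now runs the serialized MAF text through process_functions.removeStringFloat before writing. That helper is not shown in this diff; going by the commit message ("add in removal of floats for maf processing"), a hypothetical version would strip the trailing ".0" that integer-like counts pick up when pandas serializes them as floats:

```python
# Hypothetical sketch of a removeStringFloat-style cleanup; the real helper
# lives in genie.process_functions and is not part of this diff. It strips the
# ".0" suffix from integer-like fields in tab-delimited MAF text.
def remove_string_float(text: str) -> str:
    return text.replace(".0\t", "\t").replace(".0\n", "\n")


row = "GENIE-A-1-1\t100.0\t35.0\n"
assert remove_string_float(row) == "GENIE-A-1-1\t100\t35\n"
```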
23 changes: 12 additions & 11 deletions genie/vcf.py
@@ -52,16 +52,16 @@ def process_helper(self, vcffiles, path_to_GENIE, mafSynId, centerMafSynId,
newVCFPath = os.path.join(centerInputFolder, vcfName)
#remove chr from each row
command = ["sed", "'s/^chr//'", path, ">", newVCFPath]
subprocess.call(" ".join(command), shell=True)
subprocess.check_call(" ".join(command), shell=True)
#Empty spaces must be replaced with a period
command = ["sed", '-i', "'s/\t\t/\t.\t/g'", newVCFPath]
subprocess.call(" ".join(command), shell=True)
subprocess.check_call(" ".join(command), shell=True)
#All INFO/HGVS values have a whitespace, which is not allowed in VCF specs. Replace that with a comma
command = ['sed', '-i', "'s/ p\./,p./'", newVCFPath]
subprocess.call(" ".join(command), shell=True)
subprocess.check_call(" ".join(command), shell=True)
#Strips out windows indentations \r
command = ['dos2unix',newVCFPath]
subprocess.call(command)
subprocess.check_call(command)
vcfCols = ["#CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT"]
with open(newVCFPath,"r") as f:
for line in f:
@@ -74,22 +74,23 @@ def process_helper(self, vcffiles, path_to_GENIE, mafSynId, centerMafSynId,

samples = [i for i in cols if i not in vcfCols]

tumorName = vcfName.replace(".vcf","")
#tumorName = vcfName.replace(".vcf","")

if len(samples) == 1:
tumor = samples[0]
normal = "NORMAL"
### If the tumor name isn't TUMOR, set the sample id to be the tumor name
if tumor != "TUMOR":
tumorName = tumor
elif len(samples) == 2:
#Tumor is always first, normal is second
#Assumes that Tumor is always first, normal is second
tumor = samples[0]
normal = samples[1]
tumorName = tumor
else:
tumor = "TUMOR"
normal = "NORMAL"
# ### If the tumor name isn't TUMOR, set the sample id to be the tumor name
if tumor == "TUMOR":
tumorName = vcfName.replace(".vcf","")
else:
tumorName = tumor
newMAFPath = newVCFPath + ".maf"
if os.path.isfile(newMAFPath):
mafFiles.append(newMAFPath)
@@ -107,7 +108,7 @@ def process_helper(self, vcffiles, path_to_GENIE, mafSynId, centerMafSynId,
'--custom-enst', os.path.join(vcf2mafPath, 'data/isoform_overrides_uniprot')]
if reference is not None:
command.extend(["--ref-fasta",reference])
subprocess.call(command)
subprocess.check_call(command)
if (os.path.isfile(newMAFPath)):
mafFiles.append(newMAFPath)
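
The reworked block derives the tumor and normal sample names, and the tumorName used for the output MAF, from however many genotype columns the VCF carries; only when the tumor column is literally "TUMOR" does it fall back to the file name. Distilled into a small helper as a sketch (the function name is illustrative):

```python
# Sketch of the reworked tumor/normal resolution: one genotype column means
# tumor-only, two means tumor then normal (assumed order), none falls back to
# the defaults. The function name is illustrative.
def resolve_sample_names(samples, vcf_name):
    tumor, normal = "TUMOR", "NORMAL"
    if len(samples) == 1:
        tumor = samples[0]
    elif len(samples) == 2:
        tumor, normal = samples
    # only a literal "TUMOR" column falls back to the VCF file name
    tumor_name = vcf_name.replace(".vcf", "") if tumor == "TUMOR" else tumor
    return tumor, normal, tumor_name


print(resolve_sample_names(["GENIE-A-1-1"], "GENIE-A-1-1.vcf"))
```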

