Commit

Merge pull request #65 from Sage-Bionetworks/maf
Change t_depth to STRING, add in removal of floats for maf processing, exclude panel
thomasyu888 authored Feb 4, 2019
2 parents a209ae5 + 52b41a0 commit 9e886be
Showing 7 changed files with 82 additions and 35 deletions.
37 changes: 30 additions & 7 deletions analyses/genomicData/MAFinBED.R
@@ -4,8 +4,8 @@ library(synapser)
library(VariantAnnotation)

args = commandArgs(trailingOnly=TRUE)
if (length(args) != 1) {
stop("Must supply a boolean value")
if (length(args) != 2) {
stop("Must supply a boolean value and a filepath to write variants not in bed")
}
# SAGE login
tryCatch({
@@ -37,6 +37,10 @@ sampleData$AGE_AT_SEQ_REPORT_NUMERICAL <- NULL
patientData$BIRTH_YEAR_NUMERICAL <- NULL
patientData$CENTER <- NULL
genieClinData <- merge.data.frame(patientData, sampleData, by="PATIENT_ID")

#EXCLUDE PHS-TRISEQ-V1 SAMPLES
genieClinData <- genieClinData[genieClinData$SEQ_ASSAY_ID != "PHS-TRISEQ-V1",]

# read aggregated BED file data
genieBed = synTableQuery(sprintf('SELECT * FROM %s', bedSynId),includeRowIdAndRowVersion=F)
genieBedData = synapser::as.data.frame(genieBed)
@@ -52,7 +56,8 @@ print(nrow(genieMutData))


originalCols = colnames(genieMutData)
# records with count data that preclude a VAF estimate - set VAF to 100% (1/1, alt/depth)
genieMutData$t_depth <- as.numeric(genieMutData$t_depth)
noVAF.idx = which((genieMutData$t_depth==0)|is.na(genieMutData$t_depth))
#keeps the order if factors exist
#genieMutData$t_alt_count_num = as.numeric(levels(genieMutData$t_alt_count))[genieMutData$t_alt_count]
@@ -107,10 +112,28 @@ for (panelName in seq_assays) {
}
genieMutData$t_depth_new <- NULL
genieMutData$t_alt_count_num <- NULL
# Commented out below because of PFLM-4975
#Compare old inBED with new inBED column
#If there are differences, update only the diffs
updateMutData = genieMutData[genieMutData$inBED != oldInBed,]
# updateMutData = genieMutData[genieMutData$inBED != oldInBed,]
# if (nrow(updateMutData) > 0) {
# #write.csv(updateMutData[c("ROW_ID","ROW_VERSION","inBED")],"update_inbed.csv",row.names = F)
# #Need to chunk the upload

if (nrow(updateMutData) > 0) {
synStore(Table(mafSynId, updateMutData[c("ROW_ID","ROW_VERSION","inBED")]))
}
# #it is an amazon problem, where amazon kills the connection while reading the csv from s3.
# chunk = 100000
# rows = 0
# while (rows*chunk < nrow(updateMutData)) {
# if ((rows + 1)* chunk > nrow(updateMutData)) {
# to_update = updateMutData[((rows*chunk)+1):nrow(updateMutData),c("ROW_ID","ROW_VERSION","inBED")]
# } else{
# to_update = updateMutData[((rows*chunk)+1):((rows+1)*chunk),c("ROW_ID","ROW_VERSION","inBED")]
# }
# synStore(Table(mafSynId, to_update))
# rows = rows+1
# }
# #synStore(Table(mafSynId, updateMutData[c("ROW_ID","ROW_VERSION","inBED")]))
# #synStore(Table(mafSynId, "update_inbed.csv"))
# }
updateMutData = genieMutData[genieMutData$inBED == FALSE,c('Chromosome', 'Start_Position', 'End_Position', 'Reference_Allele', 'Tumor_Seq_Allele2', 'Tumor_Sample_Barcode', 'Center')]
write.csv(updateMutData,args[2],row.names = F)
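
The chunked-upload logic kept (commented out) above splits the inBED updates into 100,000-row batches because a single large upload was being dropped mid-transfer on the S3 side. A minimal Python sketch of the same idea with synapseclient, assuming a hypothetical table ID and an update data frame with the three columns used above:

```python
# Hypothetical sketch of the chunked-update workaround described in the
# commented-out R block above, using the Python synapseclient instead of
# synapser. The table ID, data frame, and chunk size are illustrative.
import synapseclient
from synapseclient import Table


def store_inbed_updates_in_chunks(syn, maf_syn_id, update_df, chunk_size=100000):
    """Push ROW_ID/ROW_VERSION/inBED updates in fixed-size batches so one
    huge upload cannot be killed mid-transfer."""
    cols = ["ROW_ID", "ROW_VERSION", "inBED"]
    for start in range(0, len(update_df), chunk_size):
        chunk = update_df.iloc[start:start + chunk_size][cols]
        syn.store(Table(maf_syn_id, chunk))


# usage (assumes cached Synapse credentials and an existing update_df):
# syn = synapseclient.login()
# store_inbed_updates_in_chunks(syn, "syn12345", update_df)
```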
1 change: 1 addition & 0 deletions analyses/mergeFlag/mergeCheck.R
@@ -127,6 +127,7 @@ for (center in centers) {
}

# records with count data that preclude a VAF estimate - set VAF to 100% (1/1, alt/depth)
genieMutData$t_depth <- as.numeric(genieMutData$t_depth)
noVAF.idx = which((genieMutData$t_depth==0)|is.na(genieMutData$t_depth))
genieMutData$t_alt_count_num = as.numeric(levels(genieMutData$t_alt_count))[genieMutData$t_alt_count]
genieMutData$t_alt_count_num[noVAF.idx] = 1
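
Since t_depth is now a STRING column (per the commit message), both R scripts coerce it to numeric before deciding which records preclude a VAF estimate. A hedged pandas sketch of the same coercion, with an illustrative data frame:

```python
# Pandas sketch of the coercion added above: t_depth arrives as text, so it is
# converted to numeric before flagging records that cannot support a VAF
# estimate. The data frame and the 1/1 fallback mirror the R logic; values are
# illustrative.
import pandas as pd

maf = pd.DataFrame({"t_alt_count": ["10", "0", "NA"],
                    "t_depth": ["100", "0", ""]})

maf["t_depth"] = pd.to_numeric(maf["t_depth"], errors="coerce")
maf["t_alt_count"] = pd.to_numeric(maf["t_alt_count"], errors="coerce")

# depth of 0 or missing precludes a VAF estimate -> treat as 1/1 (alt/depth)
no_vaf = (maf["t_depth"] == 0) | maf["t_depth"].isna()
maf.loc[no_vaf, ["t_alt_count", "t_depth"]] = 1

maf["vaf"] = maf["t_alt_count"] / maf["t_depth"]
```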
8 changes: 6 additions & 2 deletions genie/dashboardTemplate.Rmd
@@ -104,7 +104,11 @@ plotPrimarySites <- function(clinical, oncotreeLink, release) {
oncotree_json = data$TISSUE
oncotreeDict = extract(oncotree_json, "", "")
clinical$PRIMARY_CODES <- unlist(sapply(clinical$ONCOTREE_CODE, function(code) {
oncotreeDict[[toupper(code)]]["ONCOTREE_PRIMARY_NODE"]
if (toupper(code) %in% names(oncotreeDict)){
oncotreeDict[[toupper(code)]]["ONCOTREE_PRIMARY_NODE"]
} else {
"DEPRECATED_CODE"
}
}))
})
clinical$CENTER = createCenterColumn(clinical)
@@ -288,7 +292,7 @@ kable(samplesPerReleaseDf,row.names = F)
#primary site distribution
par(mar=c(10,3,3,1))
barplot(sort(log(table(this_mut$FILTER)),decreasing = T), main="Log Distribution of Mutation FILTERs",las=2)
plotPrimarySites(this_samples, "http://oncotree.mskcc.org/api/tumorTypes/tree?version=oncotree_2017_06_21","%s")
plotPrimarySites(this_samples, "http://oncotree.mskcc.org/api/tumorTypes/tree?version=oncotree_2018_06_01","%s")
#Center X Race
plotCenterXRace(this_patient)
#Center X Ethnicity
15 changes: 12 additions & 3 deletions genie/dashboard_table_updater.py
@@ -256,7 +256,7 @@ def update_oncotree_code_tables(syn, database_mappingdf):
oncotree_link_ent = syn.get(oncotree_link_synid)
oncotree_link = oncotree_link_ent.externalURL
oncotree_mapping = genie.process_functions.get_oncotree_code_mappings(oncotree_link)
clinicaldf['PRIMARY_CODES'] = [oncotree_mapping[i.upper()]['ONCOTREE_PRIMARY_NODE'] for i in clinicaldf.ONCOTREE_CODE]
clinicaldf['PRIMARY_CODES'] = [oncotree_mapping[i.upper()]['ONCOTREE_PRIMARY_NODE'] if i.upper() in oncotree_mapping.keys() else 'DEPRECATED_CODE' for i in clinicaldf.ONCOTREE_CODE]

# ### DISTRIBUTION OF PRIMARY ONCOTREE CODE TABLE UPDATE
primary_code_distributiondf = pd.DataFrame(columns=set(clinicaldf['CENTER']), index=set(clinicaldf['PRIMARY_CODES']))
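
Both the dashboard Rmd above and this updater now fall back to a "DEPRECATED_CODE" label when an ONCOTREE_CODE is missing from the current oncotree mapping, instead of raising a lookup error. A compact sketch of the fallback, with a made-up mapping:

```python
# Minimal sketch of the DEPRECATED_CODE fallback: codes absent from the current
# oncotree mapping get a label rather than a KeyError. Mapping and codes are
# made-up examples.
oncotree_mapping = {"LUAD": {"ONCOTREE_PRIMARY_NODE": "LUNG"}}
codes = ["LUAD", "luad", "OLDCODE"]

primary_codes = [
    oncotree_mapping[c.upper()]["ONCOTREE_PRIMARY_NODE"]
    if c.upper() in oncotree_mapping else "DEPRECATED_CODE"
    for c in codes
]
assert primary_codes == ["LUNG", "LUNG", "DEPRECATED_CODE"]
```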
@@ -421,12 +421,19 @@ def check_column_decreases(currentdf, olderdf):
new_counts = currentdf[col].value_counts()
if olderdf.get(col) is not None:
old_counts = olderdf[col].value_counts()
#Make sure any values that exist in the new get added to the old to show the decrease
new_keys = pd.Series(index=new_counts.keys()[~new_counts.keys().isin(old_counts.keys())])
old_counts = old_counts.add(new_keys,fill_value=0)
old_counts.fillna(0,inplace=True)
#Make sure any values that don't exist in the old get added to show the decrease
new_keys = pd.Series(index=old_counts.keys()[~old_counts.keys().isin(new_counts.keys())])
new_counts = new_counts.add(new_keys,fill_value=0)
new_counts.fillna(0,inplace=True)
if any(new_counts - old_counts < 0):
logger.info("\tDECREASE IN COLUMN: %s" % col)
diff = new_counts[new_counts - old_counts < 0]
diffs = new_counts-old_counts
logger.info("\t" + diffs[diffs<0].to_csv().replace("\n","; "))
diff_map[col] = True
else:
diff_map[col] = False
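
The added lines pad each value_counts Series with whatever keys the other one is missing, so the subtraction compares every category, including values that vanished entirely. The same alignment can be expressed with subtract(..., fill_value=0), as in this sketch with illustrative counts:

```python
# Sketch of the decrease check with explicit alignment: subtract(..., fill_value=0)
# aligns both Series on the union of their indexes, which is what the manual
# "add missing keys" lines above accomplish. Counts are illustrative.
import pandas as pd

old_counts = pd.Series({"LUAD": 10, "BRCA": 5})
new_counts = pd.Series({"LUAD": 8, "COAD": 2})  # BRCA disappeared, COAD is new

diffs = new_counts.subtract(old_counts, fill_value=0)
decreases = diffs[diffs < 0]
if not decreases.empty:
    # one "value,delta" pair per decreased category, on a single log line
    print(decreases.to_csv(header=False).replace("\n", "; "))
```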
@@ -463,7 +470,8 @@ def print_clinical_values_difference_table(syn, database_mappingdf):

older_sampledf = pd.read_csv(older_sample_ent.path,sep="\t",comment="#")
older_sampledf['CENTER'] = [patient.split("-")[1] for patient in older_sampledf['PATIENT_ID']]
current_sampledf = current_sampledf[current_sampledf['CENTER'].isin(older_sampledf['CENTER'].unique())]
#Rather than take the CENTER, must take the SAMPLE_ID to compare
current_sampledf = current_sampledf[current_sampledf['SAMPLE_ID'].isin(older_sampledf['SAMPLE_ID'].unique())]

logger.info("SAMPLE CLINICAL VALUE DECREASES")
center_decrease_mapping = dict()
@@ -477,7 +485,8 @@ def print_clinical_values_difference_table(syn, database_mappingdf):
older_patient_ent = syn.get(older_clinical_synids['data_clinical_patient.txt'], followLink=True)
current_patientdf = pd.read_csv(current_patient_ent.path,sep="\t",comment="#")
older_patientdf = pd.read_csv(older_patient_ent.path,sep="\t",comment="#")
current_patientdf = current_patientdf[current_patientdf['CENTER'].isin(older_patientdf['CENTER'].unique())]
#Rather than take the CENTER, must take the PATIENT_ID to compare
current_patientdf = current_patientdf[current_patientdf['PATIENT_ID'].isin(older_patientdf['PATIENT_ID'].unique())]

logger.info("PATIENT CLINICAL VALUE DECREASES")
for center in older_patientdf['CENTER'].unique():
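
These two edits switch the comparison from "same CENTER" to "same SAMPLE_ID / PATIENT_ID", so only records that actually exist in the older release can register as a decrease. A small sketch of the filter, with illustrative IDs:

```python
# Sketch of restricting the current release to records also present in the
# older release before looking for decreases. IDs are illustrative.
import pandas as pd

older = pd.DataFrame({"SAMPLE_ID": ["GENIE-A-1", "GENIE-A-2"]})
current = pd.DataFrame({"SAMPLE_ID": ["GENIE-A-1", "GENIE-A-2", "GENIE-A-3"]})

# GENIE-A-3 is new in this release, so it cannot represent a decrease
comparable = current[current["SAMPLE_ID"].isin(older["SAMPLE_ID"].unique())]
```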
28 changes: 18 additions & 10 deletions genie/database_to_staging.py
@@ -129,7 +129,8 @@ def configureMafRow(rowArray, headers, keepSamples, remove_variants):
seq = str(rowArray[headers.index('Tumor_Seq_Allele2')])
sampleId = str(rowArray[headers.index('Tumor_Sample_Barcode')])
variant = chrom +' '+ start+ ' '+end +' '+ref + ' '+ seq+ ' ' + sampleId
if pd.Series(sampleId).isin(keepSamples).any() and not pd.Series(variant).isin(remove_variants).any():
#if pd.Series(sampleId).isin(keepSamples).any() and not pd.Series(variant).isin(remove_variants).any():
if sampleId in keepSamples.tolist() and not variant in remove_variants.tolist():
fillnas = ['t_depth','t_ref_count','t_alt_count','n_depth','n_ref_count','n_alt_count']
for i in fillnas:
#mutationsDf[i] = mutationsDf[i].fillna("NA")
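
The rewritten condition replaces two per-row pandas isin() calls with plain membership tests. Since configureMafRow runs once per MAF row, converting the lookups to sets up front keeps each check constant-time; a hedged sketch (the variable names stand in for the function's parameters):

```python
# Hedged sketch of the same membership test with set lookups; keep_samples and
# remove_variants stand in for the pandas Series passed to configureMafRow.
keep_samples = {"GENIE-A-1-1", "GENIE-A-2-1"}      # e.g. set(keepSamples.tolist())
remove_variants = {"1 100 100 A T GENIE-A-2-1"}    # e.g. set(remove_variants.tolist())

sample_id = "GENIE-A-1-1"
variant = "1 100 100 A T GENIE-A-1-1"

if sample_id in keep_samples and variant not in remove_variants:
    pass  # keep this MAF row
```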
@@ -151,12 +152,15 @@ def configureMafRow(rowArray, headers, keepSamples, remove_variants):
#Run MAF in BED script, filter data and update MAFinBED database
def runMAFinBED(syn, CENTER_MAPPING_DF, databaseSynIdMappingDf, test=False, genieVersion="test"):
MAFinBED_script = os.path.join(os.path.dirname(os.path.abspath(__file__)),'../analyses/genomicData/MAFinBED.R')
command = ['Rscript', MAFinBED_script, str(test)]
notinbed_variant_file = os.path.join(os.path.dirname(os.path.abspath(__file__)),'../analyses/genomicData/notinbed.csv')

command = ['Rscript', MAFinBED_script, str(test), notinbed_variant_file]
subprocess.check_call(command)

mutationSynId = databaseSynIdMappingDf['Id'][databaseSynIdMappingDf['Database'] == "vcf2maf"][0]
removedVariants = syn.tableQuery("select Chromosome, Start_Position, End_Position, Reference_Allele, Tumor_Seq_Allele2, Tumor_Sample_Barcode, Center from %s where inBED is False and Center in ('%s')" % (mutationSynId,"','".join(CENTER_MAPPING_DF.center)))
removedVariantsDf = removedVariants.asDataFrame()
# mutationSynId = databaseSynIdMappingDf['Id'][databaseSynIdMappingDf['Database'] == "vcf2maf"][0]
# removedVariants = syn.tableQuery("select Chromosome, Start_Position, End_Position, Reference_Allele, Tumor_Seq_Allele2, Tumor_Sample_Barcode, Center from %s where inBED is False and Center in ('%s')" % (mutationSynId,"','".join(CENTER_MAPPING_DF.center)))
# removedVariantsDf = removedVariants.asDataFrame()
removedVariantsDf = pd.read_csv(notinbed_variant_file)
removedVariantsDf['removeVariants'] = removedVariantsDf['Chromosome'].astype(str) + ' ' + removedVariantsDf['Start_Position'].astype(str) + ' ' + removedVariantsDf['End_Position'].astype(str) + ' ' + removedVariantsDf['Reference_Allele'].astype(str) + ' ' + removedVariantsDf['Tumor_Seq_Allele2'].astype(str) + ' ' + removedVariantsDf['Tumor_Sample_Barcode'].astype(str)
#Store filtered variants
for center in removedVariantsDf['Center'].unique():
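
runMAFinBED now hands MAFinBED.R a second argument, the path of a CSV to fill with variants that fall outside the BED regions, and reads that file back instead of re-querying the Synapse table. A minimal sketch of the handoff, with illustrative paths:

```python
# Minimal sketch of the new R -> Python handoff: the R script writes variants
# not in the BED regions to a CSV, and Python reads them back from disk.
# Paths are illustrative.
import os
import subprocess
import pandas as pd

script_dir = os.path.dirname(os.path.abspath(__file__))
mafinbed_script = os.path.join(script_dir, "../analyses/genomicData/MAFinBED.R")
notinbed_csv = os.path.join(script_dir, "../analyses/genomicData/notinbed.csv")

# str(False) is the existing boolean test flag; the CSV path is the new argument
subprocess.check_call(["Rscript", mafinbed_script, str(False), notinbed_csv])
removed_variants_df = pd.read_csv(notinbed_csv)
```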
@@ -242,6 +246,8 @@ def stagingToCbio(syn, processingDate, genieVersion, CENTER_MAPPING_DF, database
totalSample = ['PATIENT_ID']
totalSample.extend(sampleCols)
sampleCols = totalSample
#Make sure to only grab samples that have patient information
sampleDf = sampleDf[sampleDf['PATIENT_ID'].isin(patientDf['PATIENT_ID'])]
clinicalDf = sampleDf.merge(patientDf, on="PATIENT_ID",how="outer")
#Remove patients without any sample or patient ids
clinicalDf = clinicalDf[~clinicalDf['SAMPLE_ID'].isnull()]
@@ -280,10 +286,10 @@ def stagingToCbio(syn, processingDate, genieVersion, CENTER_MAPPING_DF, database
'CANCER_TYPE_DETAILED': 'UNKNOWN',
'ONCOTREE_PRIMARY_NODE': 'UNKNOWN',
'ONCOTREE_SECONDARY_NODE': 'UNKNOWN'}
clinicalDf['CANCER_TYPE'] = [oncotreeDict[code.upper()].get("CANCER_TYPE",float('nan')) for code in clinicalDf['ONCOTREE_CODE']]
clinicalDf['CANCER_TYPE_DETAILED'] = [oncotreeDict[code.upper()].get("CANCER_TYPE_DETAILED",float('nan')) for code in clinicalDf['ONCOTREE_CODE']]
clinicalDf['ONCOTREE_PRIMARY_NODE'] = [oncotreeDict[code.upper()].get("ONCOTREE_PRIMARY_NODE",float('nan')) for code in clinicalDf['ONCOTREE_CODE']]
clinicalDf['ONCOTREE_SECONDARY_NODE'] = [oncotreeDict[code.upper()].get("ONCOTREE_SECONDARY_NODE",float('nan')) for code in clinicalDf['ONCOTREE_CODE']]
clinicalDf['CANCER_TYPE'] = [oncotreeDict[code.upper()]["CANCER_TYPE"] if code.upper() in oncotreeDict.keys() else float('nan') for code in clinicalDf['ONCOTREE_CODE']]
clinicalDf['CANCER_TYPE_DETAILED'] = [oncotreeDict[code.upper()]["CANCER_TYPE_DETAILED"] if code.upper() in oncotreeDict.keys() else float('nan') for code in clinicalDf['ONCOTREE_CODE']]
clinicalDf['ONCOTREE_PRIMARY_NODE'] = [oncotreeDict[code.upper()]["ONCOTREE_PRIMARY_NODE"] if code.upper() in oncotreeDict.keys() else float('nan') for code in clinicalDf['ONCOTREE_CODE']]
clinicalDf['ONCOTREE_SECONDARY_NODE'] = [oncotreeDict[code.upper()]["ONCOTREE_SECONDARY_NODE"] if code.upper() in oncotreeDict.keys() else float('nan') for code in clinicalDf['ONCOTREE_CODE']]

#CANCER TYPES are added which is why the clinical file is written out.
#clinicalDf.to_csv(CLINCICAL_PATH, sep="\t", index=False)
@@ -301,9 +307,11 @@ def stagingToCbio(syn, processingDate, genieVersion, CENTER_MAPPING_DF, database
clinicalDf['AGE_AT_SEQ_REPORT'][clinicalDf['AGE_AT_SEQ_REPORT'] == "<6570"] = "<18"

############################################################
#CENTER SPECIFIC CODE FOR RIGHT NOW (REMOVE UHN-555-V1)
#CENTER SPECIFIC CODE FOR RIGHT NOW (REMOVE UHN-555-V1, PHS-TRISEQ-V1)
############################################################
clinicalDf = clinicalDf[clinicalDf['SEQ_ASSAY_ID'] != "UHN-555-V1"]
clinicalDf = clinicalDf[clinicalDf['SEQ_ASSAY_ID'] != "PHS-TRISEQ-V1"]

#clinicalDf = clinicalDf[clinicalDf['CENTER'] != "WAKE"]
#clinicalDf = clinicalDf[clinicalDf['CENTER'] != "CRUK"]
############################################################
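
The sample/patient guard added at the top of this hunk drops samples whose patient record is missing before the outer merge, and the existing null check then drops patients without samples. A sketch of that interaction with illustrative data frames:

```python
# Sketch of the merge guard added above: samples without a patient record are
# removed before the outer merge; rows with no SAMPLE_ID (patients without
# samples) are removed after it. Data frames are illustrative.
import pandas as pd

patient_df = pd.DataFrame({"PATIENT_ID": ["P1", "P2"]})
sample_df = pd.DataFrame({"PATIENT_ID": ["P1", "P3"], "SAMPLE_ID": ["S1", "S3"]})

sample_df = sample_df[sample_df["PATIENT_ID"].isin(patient_df["PATIENT_ID"])]
clinical_df = sample_df.merge(patient_df, on="PATIENT_ID", how="outer")
clinical_df = clinical_df[~clinical_df["SAMPLE_ID"].isnull()]
# only P1/S1 survives: S3 had no patient record, P2 had no sample
```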
5 changes: 3 additions & 2 deletions genie/maf.py
@@ -36,6 +36,7 @@ def createFinalMaf(self, mafDf, filePath, maf=False):
mafSet = mafDf.to_csv(sep="\t", index=False, header=None)
writeOrAppend = "w" if maf else "a"
with open(filePath, writeOrAppend) as maf:
mafSet = process_functions.removeStringFloat(mafSet)
maf.write(mafSet)

#There is a isNarrow option, but note that the number of rows of the maf file
@@ -58,7 +59,7 @@ def process_helper(self, filePath, path_to_GENIE, mafSynId, centerMafSynId,
narrowMafColumns = [col['name'] for col in self.syn.getTableColumns(mafSynId) if col['name'] != 'inBED']
#Strips out windows indentations \r
command = ['dos2unix',filePath]
subprocess.call(command)
subprocess.check_call(command)
tempdir = os.path.join(path_to_GENIE, self.center)
commandCall = ["perl",os.path.join(vcf2mafPath,"maf2maf.pl"),
"--input-maf",filePath,
@@ -71,7 +72,7 @@ def process_helper(self, filePath, path_to_GENIE, mafSynId, centerMafSynId,
"--custom-enst", os.path.join(vcf2mafPath,"data/isoform_overrides_uniprot")]
if reference is not None:
commandCall.extend(["--ref-fasta",reference])
maf = subprocess.call(commandCall)
maf = subprocess.check_call(commandCall)

process_functions.rmFiles(tempdir, recursive=False)
open(narrowMafPath,"w").close()
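
createFinalMaf now runs the serialized MAF text through process_functions.removeStringFloat before writing. That helper is not shown in this diff; going by the commit message ("add in removal of floats for maf processing"), a hypothetical version would strip the trailing ".0" that integer-like counts pick up when pandas serializes them as floats:

```python
# Hypothetical sketch of a removeStringFloat-style cleanup; the real helper
# lives in genie.process_functions and is not part of this diff. It strips the
# ".0" suffix from integer-like fields in tab-delimited MAF text.
def remove_string_float(text: str) -> str:
    return text.replace(".0\t", "\t").replace(".0\n", "\n")


row = "GENIE-A-1-1\t100.0\t35.0\n"
assert remove_string_float(row) == "GENIE-A-1-1\t100\t35\n"
```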
23 changes: 12 additions & 11 deletions genie/vcf.py
@@ -52,16 +52,16 @@ def process_helper(self, vcffiles, path_to_GENIE, mafSynId, centerMafSynId,
newVCFPath = os.path.join(centerInputFolder, vcfName)
#remove chr from each row
command = ["sed", "'s/^chr//'", path, ">", newVCFPath]
subprocess.call(" ".join(command), shell=True)
subprocess.check_call(" ".join(command), shell=True)
#Empty spaces must be replaced with a period
command = ["sed", '-i', "'s/\t\t/\t.\t/g'", newVCFPath]
subprocess.call(" ".join(command), shell=True)
subprocess.check_call(" ".join(command), shell=True)
#All INFO/HGVS values have a whitespace, which is not allowed in VCF specs. Replace that with a comma
command = ['sed', '-i', "'s/ p\./,p./'", newVCFPath]
subprocess.call(" ".join(command), shell=True)
subprocess.check_call(" ".join(command), shell=True)
#Strips out windows indentations \r
command = ['dos2unix',newVCFPath]
subprocess.call(command)
subprocess.check_call(command)
vcfCols = ["#CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT"]
with open(newVCFPath,"r") as f:
for line in f:
@@ -74,22 +74,23 @@ def process_helper(self, vcffiles, path_to_GENIE, mafSynId, centerMafSynId,

samples = [i for i in cols if i not in vcfCols]

tumorName = vcfName.replace(".vcf","")
#tumorName = vcfName.replace(".vcf","")

if len(samples) == 1:
tumor = samples[0]
normal = "NORMAL"
### If the tumor name isn't TUMOR, set the sample id to be the tumor name
if tumor != "TUMOR":
tumorName = tumor
elif len(samples) == 2:
#Tumor is always first, normal is second
#Assumes that Tumor is always first, normal is second
tumor = samples[0]
normal = samples[1]
tumorName = tumor
else:
tumor = "TUMOR"
normal = "NORMAL"
# ### If the tumor name isn't TUMOR, set the sample id to be the tumor name
if tumor == "TUMOR":
tumorName = vcfName.replace(".vcf","")
else:
tumorName = tumor
newMAFPath = newVCFPath + ".maf"
if os.path.isfile(newMAFPath):
mafFiles.append(newMAFPath)
@@ -107,7 +108,7 @@ def process_helper(self, vcffiles, path_to_GENIE, mafSynId, centerMafSynId,
'--custom-enst', os.path.join(vcf2mafPath, 'data/isoform_overrides_uniprot')]
if reference is not None:
command.extend(["--ref-fasta",reference])
subprocess.call(command)
subprocess.check_call(command)
if (os.path.isfile(newMAFPath)):
mafFiles.append(newMAFPath)
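
The reworked block derives the tumor and normal sample names, and the tumorName used for the output MAF, from however many genotype columns the VCF carries; only when the tumor column is literally "TUMOR" does it fall back to the file name. Distilled into a small helper as a sketch (the function name is illustrative):

```python
# Sketch of the reworked tumor/normal resolution: one genotype column means
# tumor-only, two means tumor then normal (assumed order), none falls back to
# the defaults. The function name is illustrative.
def resolve_sample_names(samples, vcf_name):
    tumor, normal = "TUMOR", "NORMAL"
    if len(samples) == 1:
        tumor = samples[0]
    elif len(samples) == 2:
        tumor, normal = samples
    # only a literal "TUMOR" column falls back to the VCF file name
    tumor_name = vcf_name.replace(".vcf", "") if tumor == "TUMOR" else tumor
    return tumor, normal, tumor_name


print(resolve_sample_names(["GENIE-A-1-1"], "GENIE-A-1-1.vcf"))
```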

