From 934294a5a779f2c30fae895829d8b46467507a22 Mon Sep 17 00:00:00 2001 From: thomasyu888 Date: Sat, 26 Jan 2019 19:10:20 -0800 Subject: [PATCH 01/27] Change t_depth to STRING, add in removal of floats for maf processing --- analyses/genomicData/MAFinBED.R | 3 ++- analyses/mergeFlag/mergeCheck.R | 1 + genie/maf.py | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/analyses/genomicData/MAFinBED.R b/analyses/genomicData/MAFinBED.R index 29333068..a72188f2 100644 --- a/analyses/genomicData/MAFinBED.R +++ b/analyses/genomicData/MAFinBED.R @@ -52,7 +52,8 @@ print(nrow(genieMutData)) originalCols = colnames(genieMutData) -# records with count data that preclude a VAF estimate - set VAF to 100% (1/1, alt/depth) +# records with count data that preclude a VAF estimate - set VAF to 100% (1/1, alt/depth) genieMutData$t_depth <- as.numeric(genieMutData$t_depth) +genieMutData$t_depth <- as.numeric(genieMutData$t_depth) noVAF.idx = which((genieMutData$t_depth==0)|is.na(genieMutData$t_depth)) #keeps the order if factors exist #genieMutData$t_alt_count_num = as.numeric(levels(genieMutData$t_alt_count))[genieMutData$t_alt_count] diff --git a/analyses/mergeFlag/mergeCheck.R b/analyses/mergeFlag/mergeCheck.R index 99ecf791..505088ee 100644 --- a/analyses/mergeFlag/mergeCheck.R +++ b/analyses/mergeFlag/mergeCheck.R @@ -127,6 +127,7 @@ for (center in centers) { } # records with count data that preclude a VAF estimate - set VAF to 100% (1/1, alt/depth) + genieMutData$t_depth <- as.numeric(genieMutData$t_depth) noVAF.idx = which((genieMutData$t_depth==0)|is.na(genieMutData$t_depth)) genieMutData$t_alt_count_num = as.numeric(levels(genieMutData$t_alt_count))[genieMutData$t_alt_count] genieMutData$t_alt_count_num[noVAF.idx] = 1 diff --git a/genie/maf.py b/genie/maf.py index de4a549e..e2f714bf 100644 --- a/genie/maf.py +++ b/genie/maf.py @@ -36,6 +36,7 @@ def createFinalMaf(self, mafDf, filePath, maf=False): mafSet = mafDf.to_csv(sep="\t", index=False, header=None) writeOrAppend = "w" if maf else "a" with open(filePath, writeOrAppend) as maf: + mafSet = process_functions.removeStringFloat(mafSet) maf.write(mafSet) #There is a isNarrow option, but note that the number of rows of the maf file From 015d534d6c8429c5a01f43ae5a3df0cbe2da5a1e Mon Sep 17 00:00:00 2001 From: thomasyu888 Date: Sat, 26 Jan 2019 19:19:47 -0800 Subject: [PATCH 02/27] use check_call --- genie/maf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/genie/maf.py b/genie/maf.py index e2f714bf..c9591994 100644 --- a/genie/maf.py +++ b/genie/maf.py @@ -59,7 +59,7 @@ def process_helper(self, filePath, path_to_GENIE, mafSynId, centerMafSynId, narrowMafColumns = [col['name'] for col in self.syn.getTableColumns(mafSynId) if col['name'] != 'inBED'] #Strips out windows indentations \r command = ['dos2unix',filePath] - subprocess.call(command) + subprocess.check_call(command) tempdir = os.path.join(path_to_GENIE, self.center) commandCall = ["perl",os.path.join(vcf2mafPath,"maf2maf.pl"), "--input-maf",filePath, @@ -72,7 +72,7 @@ def process_helper(self, filePath, path_to_GENIE, mafSynId, centerMafSynId, "--custom-enst", os.path.join(vcf2mafPath,"data/isoform_overrides_uniprot")] if reference is not None: commandCall.extend(["--ref-fasta",reference]) - maf = subprocess.call(commandCall) + maf = subprocess.check_call(commandCall) process_functions.rmFiles(tempdir, recursive=False) open(narrowMafPath,"w").close() From f309fbef2cc38257b4336c0fd67c35830c4211df Mon Sep 17 00:00:00 2001 From: thomasyu888 Date: Sat, 26 Jan 2019 
19:25:54 -0800 Subject: [PATCH 03/27] Use subprocess check call --- genie/vcf.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/genie/vcf.py b/genie/vcf.py index f7aca93e..0e1162c7 100644 --- a/genie/vcf.py +++ b/genie/vcf.py @@ -52,16 +52,16 @@ def process_helper(self, vcffiles, path_to_GENIE, mafSynId, centerMafSynId, newVCFPath = os.path.join(centerInputFolder, vcfName) #remove chr from each row command = ["sed", "'s/^chr//'", path, ">", newVCFPath] - subprocess.call(" ".join(command), shell=True) + subprocess.check_call(" ".join(command), shell=True) #Empty spaces must be replaced with a period command = ["sed", '-i', "'s/\t\t/\t.\t/g'", newVCFPath] - subprocess.call(" ".join(command), shell=True) + subprocess.check_call(" ".join(command), shell=True) #All INFO/HGVS values have a whitespace, which is not allowed in VCF specs. Replace that with a comma command = ['sed', '-i', "'s/ p\./,p./'", newVCFPath] - subprocess.call(" ".join(command), shell=True) + subprocess.check_call(" ".join(command), shell=True) #Strips out windows indentations \r command = ['dos2unix',newVCFPath] - subprocess.call(command) + subprocess.check_call(command) vcfCols = ["#CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT"] with open(newVCFPath,"r") as f: for line in f: @@ -107,7 +107,7 @@ def process_helper(self, vcffiles, path_to_GENIE, mafSynId, centerMafSynId, '--custom-enst', os.path.join(vcf2mafPath, 'data/isoform_overrides_uniprot')] if reference is not None: command.extend(["--ref-fasta",reference]) - subprocess.call(command) + subprocess.check_call(command) if (os.path.isfile(newMAFPath)): mafFiles.append(newMAFPath) From 994b4d25ad61961da9b33ebbe898c0282d23b93e Mon Sep 17 00:00:00 2001 From: thomasyu888 Date: Sun, 27 Jan 2019 16:17:50 -0800 Subject: [PATCH 04/27] Fix when site submits both TUMOR and NORMAL column. 
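Patches 02 and 03 swap subprocess.call for subprocess.check_call throughout maf.py and vcf.py. A minimal sketch of the difference (the filename below is hypothetical): call() merely returns the exit status, so a failed dos2unix, sed, or vcf2maf step can go unnoticed, while check_call() raises CalledProcessError on any nonzero exit and halts processing before a half-converted file is used downstream.

import subprocess

# call() reports the exit status, but execution continues on failure
status = subprocess.call(["dos2unix", "example.vcf"])

# check_call() raises on nonzero exit, stopping the pipeline early
try:
    subprocess.check_call(["dos2unix", "example.vcf"])
except subprocess.CalledProcessError as err:
    print("dos2unix failed with exit code", err.returncode)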
Sample id is not TUMOR --- genie/vcf.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/genie/vcf.py b/genie/vcf.py index 0e1162c7..3cb2c1b4 100644 --- a/genie/vcf.py +++ b/genie/vcf.py @@ -74,22 +74,23 @@ def process_helper(self, vcffiles, path_to_GENIE, mafSynId, centerMafSynId, samples = [i for i in cols if i not in vcfCols] - tumorName = vcfName.replace(".vcf","") + #tumorName = vcfName.replace(".vcf","") if len(samples) == 1: tumor = samples[0] normal = "NORMAL" - ### If the tumor name isn't TUMOR, set the sample id to be the tumor name - if tumor != "TUMOR": - tumorName = tumor elif len(samples) == 2: - #Tumor is always first, normal is second + #Assumes that Tumor is always first, normal is second tumor = samples[0] normal = samples[1] - tumorName = tumor else: tumor = "TUMOR" normal = "NORMAL" + # ### If the tumor name isn't TUMOR, set the sample id to be the tumor name + if tumor != "TUMOR": + tumorName = vcfName.replace(".vcf","") + else: + tumorName = tumor newMAFPath = newVCFPath + ".maf" if os.path.isfile(newMAFPath): mafFiles.append(newMAFPath) From e115f3884db224a231c87b346f04d6edbf66d6bb Mon Sep 17 00:00:00 2001 From: thomasyu888 Date: Sun, 27 Jan 2019 16:28:16 -0800 Subject: [PATCH 05/27] Fix logic --- genie/vcf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/genie/vcf.py b/genie/vcf.py index 3cb2c1b4..b98b52ff 100644 --- a/genie/vcf.py +++ b/genie/vcf.py @@ -87,7 +87,7 @@ def process_helper(self, vcffiles, path_to_GENIE, mafSynId, centerMafSynId, tumor = "TUMOR" normal = "NORMAL" # ### If the tumor name isn't TUMOR, set the sample id to be the tumor name - if tumor != "TUMOR": + if tumor == "TUMOR": tumorName = vcfName.replace(".vcf","") else: tumorName = tumor From 187ea2c590adff3f60e8cc122922c20f4e5e76a0 Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Tue, 29 Jan 2019 17:24:57 -0800 Subject: [PATCH 06/27] Write out mafinbed to file then upload --- analyses/genomicData/MAFinBED.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/analyses/genomicData/MAFinBED.R b/analyses/genomicData/MAFinBED.R index a72188f2..6a7eba7c 100644 --- a/analyses/genomicData/MAFinBED.R +++ b/analyses/genomicData/MAFinBED.R @@ -113,5 +113,7 @@ genieMutData$t_alt_count_num <- NULL updateMutData = genieMutData[genieMutData$inBED != oldInBed,] if (nrow(updateMutData) > 0) { - synStore(Table(mafSynId, updateMutData[c("ROW_ID","ROW_VERSION","inBED")])) + write.csv(updateMutData[c("ROW_ID","ROW_VERSION","inBED")],"update_inbed.csv",row.names = F) + #synStore(Table(mafSynId, updateMutData[c("ROW_ID","ROW_VERSION","inBED")])) + synStore(Table(mafSynId, "update_inbed.csv")) } \ No newline at end of file From b2595fa841bfb3c51fbacb94ade3b4e272bb98e1 Mon Sep 17 00:00:00 2001 From: thomasyu888 Date: Thu, 31 Jan 2019 02:41:57 -0800 Subject: [PATCH 07/27] Chunk the upload of maf --- analyses/genomicData/MAFinBED.R | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/analyses/genomicData/MAFinBED.R b/analyses/genomicData/MAFinBED.R index 6a7eba7c..276995c0 100644 --- a/analyses/genomicData/MAFinBED.R +++ b/analyses/genomicData/MAFinBED.R @@ -113,7 +113,18 @@ genieMutData$t_alt_count_num <- NULL updateMutData = genieMutData[genieMutData$inBED != oldInBed,] if (nrow(updateMutData) > 0) { - write.csv(updateMutData[c("ROW_ID","ROW_VERSION","inBED")],"update_inbed.csv",row.names = F) + #write.csv(updateMutData[c("ROW_ID","ROW_VERSION","inBED")],"update_inbed.csv",row.names = F) + #Need to chunk the upload + chunk = 
100000 + rows = 0 + while (rows*chunk < nrow(updateMutData)) { + if ((rows + 1)* chunk > nrow(updateMutData)) { + to_update = updateMutData[((rows*chunk)+1):nrow(updateMutData),c("ROW_ID","ROW_VERSION","inBED")] + } else{ + to_update = updateMutData[((rows*chunk)+1):((rows+1)*chunk),c("ROW_ID","ROW_VERSION","inBED")] + } + rows = rows+1 + } #synStore(Table(mafSynId, updateMutData[c("ROW_ID","ROW_VERSION","inBED")])) - synStore(Table(mafSynId, "update_inbed.csv")) + #synStore(Table(mafSynId, "update_inbed.csv")) } \ No newline at end of file From e6eb8a6dcada1194c19bc295f4f7f9a8e8bd6a0f Mon Sep 17 00:00:00 2001 From: thomasyu888 Date: Thu, 31 Jan 2019 02:44:39 -0800 Subject: [PATCH 08/27] update maf table in chunks --- analyses/genomicData/MAFinBED.R | 1 + 1 file changed, 1 insertion(+) diff --git a/analyses/genomicData/MAFinBED.R b/analyses/genomicData/MAFinBED.R index 276995c0..b5c9530c 100644 --- a/analyses/genomicData/MAFinBED.R +++ b/analyses/genomicData/MAFinBED.R @@ -125,6 +125,7 @@ if (nrow(updateMutData) > 0) { } rows = rows+1 } + synStore(Table(mafSynId, to_update)) #synStore(Table(mafSynId, updateMutData[c("ROW_ID","ROW_VERSION","inBED")])) #synStore(Table(mafSynId, "update_inbed.csv")) } \ No newline at end of file From 96b5a2079c99076a945f38a8e7d42756b5f3861f Mon Sep 17 00:00:00 2001 From: thomasyu888 Date: Thu, 31 Jan 2019 02:45:56 -0800 Subject: [PATCH 09/27] Move synstore inside of while --- analyses/genomicData/MAFinBED.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/analyses/genomicData/MAFinBED.R b/analyses/genomicData/MAFinBED.R index b5c9530c..a970a65b 100644 --- a/analyses/genomicData/MAFinBED.R +++ b/analyses/genomicData/MAFinBED.R @@ -123,9 +123,9 @@ if (nrow(updateMutData) > 0) { } else{ to_update = updateMutData[((rows*chunk)+1):((rows+1)*chunk),c("ROW_ID","ROW_VERSION","inBED")] } + synStore(Table(mafSynId, to_update)) rows = rows+1 } - synStore(Table(mafSynId, to_update)) #synStore(Table(mafSynId, updateMutData[c("ROW_ID","ROW_VERSION","inBED")])) #synStore(Table(mafSynId, "update_inbed.csv")) } \ No newline at end of file From 42c145fe9eb6a7e51b04dbf6b7e8488599ee969b Mon Sep 17 00:00:00 2001 From: thomasyu888 Date: Thu, 31 Jan 2019 12:00:45 -0800 Subject: [PATCH 10/27] Write out mafinbed file instead of using tables --- analyses/genomicData/MAFinBED.R | 44 ++++++++++++++++++--------------- genie/database_to_staging.py | 11 ++++++--- 2 files changed, 31 insertions(+), 24 deletions(-) diff --git a/analyses/genomicData/MAFinBED.R b/analyses/genomicData/MAFinBED.R index a970a65b..c0f6ab48 100644 --- a/analyses/genomicData/MAFinBED.R +++ b/analyses/genomicData/MAFinBED.R @@ -4,8 +4,8 @@ library(synapser) library(VariantAnnotation) args = commandArgs(trailingOnly=TRUE) -if (length(args) != 1) { - stop("Must supply a boolean value") +if (length(args) != 2) { + stop("Must supply a boolean value and a filepath to write variants not in bed") } # SAGE login tryCatch({ @@ -108,24 +108,28 @@ for (panelName in seq_assays) { } genieMutData$t_depth_new <- NULL genieMutData$t_alt_count_num <- NULL +# Commented out below because of PFLM-4975 #Compare old inBED with new inBED column #If there are differences, update only the diffs -updateMutData = genieMutData[genieMutData$inBED != oldInBed,] +# updateMutData = genieMutData[genieMutData$inBED != oldInBed,] +# if (nrow(updateMutData) > 0) { +# #write.csv(updateMutData[c("ROW_ID","ROW_VERSION","inBED")],"update_inbed.csv",row.names = F) +# #Need to chunk the upload -if (nrow(updateMutData) > 0) { - 
#write.csv(updateMutData[c("ROW_ID","ROW_VERSION","inBED")],"update_inbed.csv",row.names = F) - #Need to chunk the upload - chunk = 100000 - rows = 0 - while (rows*chunk < nrow(updateMutData)) { - if ((rows + 1)* chunk > nrow(updateMutData)) { - to_update = updateMutData[((rows*chunk)+1):nrow(updateMutData),c("ROW_ID","ROW_VERSION","inBED")] - } else{ - to_update = updateMutData[((rows*chunk)+1):((rows+1)*chunk),c("ROW_ID","ROW_VERSION","inBED")] - } - synStore(Table(mafSynId, to_update)) - rows = rows+1 - } - #synStore(Table(mafSynId, updateMutData[c("ROW_ID","ROW_VERSION","inBED")])) - #synStore(Table(mafSynId, "update_inbed.csv")) -} \ No newline at end of file +# #it is an amazon problem, where amazon kills the connection while reading the csv from s3. +# chunk = 100000 +# rows = 0 +# while (rows*chunk < nrow(updateMutData)) { +# if ((rows + 1)* chunk > nrow(updateMutData)) { +# to_update = updateMutData[((rows*chunk)+1):nrow(updateMutData),c("ROW_ID","ROW_VERSION","inBED")] +# } else{ +# to_update = updateMutData[((rows*chunk)+1):((rows+1)*chunk),c("ROW_ID","ROW_VERSION","inBED")] +# } +# synStore(Table(mafSynId, to_update)) +# rows = rows+1 +# } +# #synStore(Table(mafSynId, updateMutData[c("ROW_ID","ROW_VERSION","inBED")])) +# #synStore(Table(mafSynId, "update_inbed.csv")) +# } +updateMutData = genieMutData[genieMutData$inBED == FALSE,c('Chromosome', 'Start_Position', 'End_Position', 'Reference_Allele', 'Tumor_Seq_Allele2', 'Tumor_Sample_Barcode', 'Center')] +write.csv(updateMutData,args[2],row.names = F) diff --git a/genie/database_to_staging.py b/genie/database_to_staging.py index cca5294d..08450160 100644 --- a/genie/database_to_staging.py +++ b/genie/database_to_staging.py @@ -151,12 +151,15 @@ def configureMafRow(rowArray, headers, keepSamples, remove_variants): #Run MAF in BED script, filter data and update MAFinBED database def runMAFinBED(syn, CENTER_MAPPING_DF, databaseSynIdMappingDf, test=False, genieVersion="test"): MAFinBED_script = os.path.join(os.path.dirname(os.path.abspath(__file__)),'../analyses/genomicData/MAFinBED.R') - command = ['Rscript', MAFinBED_script, str(test)] + notinbed_variant_file = os.path.join(os.path.dirname(os.path.abspath(__file__)),'../analyses/genomicData/notinbed.csv') + + command = ['Rscript', MAFinBED_script, str(test), notinbed_variant_file] subprocess.check_call(command) - mutationSynId = databaseSynIdMappingDf['Id'][databaseSynIdMappingDf['Database'] == "vcf2maf"][0] - removedVariants = syn.tableQuery("select Chromosome, Start_Position, End_Position, Reference_Allele, Tumor_Seq_Allele2, Tumor_Sample_Barcode, Center from %s where inBED is False and Center in ('%s')" % (mutationSynId,"','".join(CENTER_MAPPING_DF.center))) - removedVariantsDf = removedVariants.asDataFrame() + # mutationSynId = databaseSynIdMappingDf['Id'][databaseSynIdMappingDf['Database'] == "vcf2maf"][0] + # removedVariants = syn.tableQuery("select Chromosome, Start_Position, End_Position, Reference_Allele, Tumor_Seq_Allele2, Tumor_Sample_Barcode, Center from %s where inBED is False and Center in ('%s')" % (mutationSynId,"','".join(CENTER_MAPPING_DF.center))) + # removedVariantsDf = removedVariants.asDataFrame() + removedVariantsDf = pd.read_csv(notinbed_variant_file) removedVariantsDf['removeVariants'] = removedVariantsDf['Chromosome'].astype(str) + ' ' + removedVariantsDf['Start_Position'].astype(str) + ' ' + removedVariantsDf['End_Position'].astype(str) + ' ' + removedVariantsDf['Reference_Allele'].astype(str) + ' ' + removedVariantsDf['Tumor_Seq_Allele2'].astype(str) + ' 
' + removedVariantsDf['Tumor_Sample_Barcode'].astype(str)
     #Store filtered variants
     for center in removedVariantsDf['Center'].unique():

From aa8a968fbff5b799fde5aa1fd94cc5eabc96eea7 Mon Sep 17 00:00:00 2001
From: thomasyu888
Date: Thu, 31 Jan 2019 12:43:37 -0800
Subject: [PATCH 11/27] Make sure code is in oncotree, if not, return null

---
 genie/database_to_staging.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/genie/database_to_staging.py b/genie/database_to_staging.py
index 08450160..c408ac74 100644
--- a/genie/database_to_staging.py
+++ b/genie/database_to_staging.py
@@ -283,10 +283,10 @@ def stagingToCbio(syn, processingDate, genieVersion, CENTER_MAPPING_DF, database
                         'CANCER_TYPE_DETAILED': 'UNKNOWN',
                         'ONCOTREE_PRIMARY_NODE': 'UNKNOWN',
                         'ONCOTREE_SECONDARY_NODE': 'UNKNOWN'}
-    clinicalDf['CANCER_TYPE'] = [oncotreeDict[code.upper()].get("CANCER_TYPE",float('nan')) for code in clinicalDf['ONCOTREE_CODE']]
-    clinicalDf['CANCER_TYPE_DETAILED'] = [oncotreeDict[code.upper()].get("CANCER_TYPE_DETAILED",float('nan')) for code in clinicalDf['ONCOTREE_CODE']]
-    clinicalDf['ONCOTREE_PRIMARY_NODE'] = [oncotreeDict[code.upper()].get("ONCOTREE_PRIMARY_NODE",float('nan')) for code in clinicalDf['ONCOTREE_CODE']]
-    clinicalDf['ONCOTREE_SECONDARY_NODE'] = [oncotreeDict[code.upper()].get("ONCOTREE_SECONDARY_NODE",float('nan')) for code in clinicalDf['ONCOTREE_CODE']]
+    clinicalDf['CANCER_TYPE'] = [oncotreeDict[code.upper()]["CANCER_TYPE"] if code.upper() in oncotreeDict.keys() else float('nan') for code in clinicalDf['ONCOTREE_CODE']]
+    clinicalDf['CANCER_TYPE_DETAILED'] = [oncotreeDict[code.upper()]["CANCER_TYPE_DETAILED"] if code.upper() in oncotreeDict.keys() else float('nan') for code in clinicalDf['ONCOTREE_CODE']]
+    clinicalDf['ONCOTREE_PRIMARY_NODE'] = [oncotreeDict[code.upper()]["ONCOTREE_PRIMARY_NODE"] if code.upper() in oncotreeDict.keys() else float('nan') for code in clinicalDf['ONCOTREE_CODE']]
+    clinicalDf['ONCOTREE_SECONDARY_NODE'] = [oncotreeDict[code.upper()]["ONCOTREE_SECONDARY_NODE"] if code.upper() in oncotreeDict.keys() else float('nan') for code in clinicalDf['ONCOTREE_CODE']]

     #CANCER TYPES are added which is why the clinical file is written out.
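# Aside: a minimal sketch of the lookup bug patch 11 fixes (the toy mapping
# below is hypothetical). The old .get() could not guard the lookup, because
# the KeyError comes from oncotreeDict[code.upper()] before .get() ever runs;
# checking membership first yields nan for retired or unmapped ONCOTREE codes.
oncotreeDict = {'LUAD': {'CANCER_TYPE': 'Non-Small Cell Lung Cancer'}}
codes = ['LUAD', 'RETIRED_CODE']
cancer_types = [oncotreeDict[c.upper()]['CANCER_TYPE']
                if c.upper() in oncotreeDict else float('nan')
                for c in codes]
# cancer_types == ['Non-Small Cell Lung Cancer', nan]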
#clinicalDf.to_csv(CLINCICAL_PATH, sep="\t", index=False) From 5590684e08bd921ce9d3e5dbaae3558364572fc0 Mon Sep 17 00:00:00 2001 From: thomasyu888 Date: Thu, 31 Jan 2019 13:48:42 -0800 Subject: [PATCH 12/27] Use lists --- genie/database_to_staging.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/genie/database_to_staging.py b/genie/database_to_staging.py index c408ac74..d709f88a 100644 --- a/genie/database_to_staging.py +++ b/genie/database_to_staging.py @@ -129,7 +129,8 @@ def configureMafRow(rowArray, headers, keepSamples, remove_variants): seq = str(rowArray[headers.index('Tumor_Seq_Allele2')]) sampleId = str(rowArray[headers.index('Tumor_Sample_Barcode')]) variant = chrom +' '+ start+ ' '+end +' '+ref + ' '+ seq+ ' ' + sampleId - if pd.Series(sampleId).isin(keepSamples).any() and not pd.Series(variant).isin(remove_variants).any(): + #if pd.Series(sampleId).isin(keepSamples).any() and not pd.Series(variant).isin(remove_variants).any(): + if sampleId in keepSamples.tolist() and not variant in remove_variants.tolist(): fillnas = ['t_depth','t_ref_count','t_alt_count','n_depth','n_ref_count','n_alt_count'] for i in fillnas: #mutationsDf[i] = mutationsDf[i].fillna("NA") From a9637f5f979445b0af2e55387c695ba247da00a5 Mon Sep 17 00:00:00 2001 From: thomasyu888 Date: Thu, 31 Jan 2019 16:44:45 -0800 Subject: [PATCH 13/27] Exclude another panel --- analyses/genomicData/MAFinBED.R | 4 ++++ genie/database_to_staging.py | 4 +++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/analyses/genomicData/MAFinBED.R b/analyses/genomicData/MAFinBED.R index c0f6ab48..4a5863a0 100644 --- a/analyses/genomicData/MAFinBED.R +++ b/analyses/genomicData/MAFinBED.R @@ -37,6 +37,10 @@ sampleData$AGE_AT_SEQ_REPORT_NUMERICAL <- NULL patientData$BIRTH_YEAR_NUMERICAL <- NULL patientData$CENTER <- NULL genieClinData <- merge.data.frame(patientData, sampleData, by="PATIENT_ID") + +#EXCLUDE PHS-TRISEQ-V1 SAMPLES +genieClinData <- genieClinData[genieClinData$SEQ_ASSAY_ID != "PHS-TRISEQ-V1",] + # read aggregated BED file data genieBed = synTableQuery(sprintf('SELECT * FROM %s', bedSynId),includeRowIdAndRowVersion=F) genieBedData = synapser::as.data.frame(genieBed) diff --git a/genie/database_to_staging.py b/genie/database_to_staging.py index d709f88a..0ca1feea 100644 --- a/genie/database_to_staging.py +++ b/genie/database_to_staging.py @@ -305,9 +305,11 @@ def stagingToCbio(syn, processingDate, genieVersion, CENTER_MAPPING_DF, database clinicalDf['AGE_AT_SEQ_REPORT'][clinicalDf['AGE_AT_SEQ_REPORT'] == "<6570"] = "<18" ############################################################ - #CENTER SPECIFIC CODE FOR RIGHT NOW (REMOVE UHN-555-V1) + #CENTER SPECIFIC CODE FOR RIGHT NOW (REMOVE UHN-555-V1, PHS-TRISEQ-V1) ############################################################ clinicalDf = clinicalDf[clinicalDf['SEQ_ASSAY_ID'] != "UHN-555-V1"] + clinicalDf = clinicalDf[clinicalDf['SEQ_ASSAY_ID'] != "PHS-TRISEQ-V1"] + #clinicalDf = clinicalDf[clinicalDf['CENTER'] != "WAKE"] #clinicalDf = clinicalDf[clinicalDf['CENTER'] != "CRUK"] ############################################################ From 2b7cd1c3b81e94de15e0ce127cb575c426564df3 Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Fri, 1 Feb 2019 12:53:05 -0800 Subject: [PATCH 14/27] Comment out maf for now, and fix clinical validation error --- genie/database_to_staging.py | 72 ++++++++++++++++++------------------ 1 file changed, 37 insertions(+), 35 deletions(-) diff --git a/genie/database_to_staging.py b/genie/database_to_staging.py index 
0ca1feea..0dc708b0 100644 --- a/genie/database_to_staging.py +++ b/genie/database_to_staging.py @@ -246,6 +246,8 @@ def stagingToCbio(syn, processingDate, genieVersion, CENTER_MAPPING_DF, database totalSample = ['PATIENT_ID'] totalSample.extend(sampleCols) sampleCols = totalSample + #Make sure to only grab samples that have patient information + sampleDf = sampleDf[sampleDf['PATIENT_ID'].isin(patientDf['PATIENT_ID'])] clinicalDf = sampleDf.merge(patientDf, on="PATIENT_ID",how="outer") #Remove patients without any sample or patient ids clinicalDf = clinicalDf[~clinicalDf['SAMPLE_ID'].isnull()] @@ -363,41 +365,41 @@ def stagingToCbio(syn, processingDate, genieVersion, CENTER_MAPPING_DF, database sequenced_samples = "#sequenced_samples: " + " ".join(samples) logger.info("FILTERING, STORING MUTATION FILES") - centerMafFileViewSynId = databaseSynIdMappingDf['Id'][databaseSynIdMappingDf['Database'] == "centerMafView"][0] - centerMafSynIds = syn.tableQuery("select id from %s " % centerMafFileViewSynId + "where name like '%mutation%'") - centerMafSynIdsDf = centerMafSynIds.asDataFrame() - with open(MUTATIONS_PATH, 'w') as f: - f.write(sequenced_samples + "\n") - for index, mafSynId in enumerate(centerMafSynIdsDf.id): - mafEnt = syn.get(mafSynId) - logger.info(mafEnt.path) - with open(mafEnt.path,"r") as mafFile: - header = mafFile.readline() - headers = header.replace("\n","").split("\t") - if index == 0: - with open(MUTATIONS_PATH, 'a') as f: - f.write(header) - #Create maf file per center for their staging directory - for center in clinicalDf['CENTER'].unique(): - with open(MUTATIONS_CENTER_PATH % center, 'w') as f: - f.write(header) - # with open(mafEnt.path,"r") as newMafFile: - # newMafFile.readline() - center = mafEnt.path.split("_")[3] - #Make sure to only write the centers that release = True - if center in CENTER_MAPPING_DF.center.tolist(): - for row in mafFile: - rowArray = row.replace("\n","").split("\t") - center = rowArray[headers.index('Center')] - newMergedRow = configureMafRow(rowArray, headers, keepForMergedConsortiumSamples, remove_mafInBed_variants) - if newMergedRow is not None: - with open(MUTATIONS_PATH, 'a') as f: - f.write(newMergedRow) - newCenterRow = configureMafRow(rowArray, headers, keepForCenterConsortiumSamples, remove_mafInBed_variants) - if newCenterRow is not None: - with open(MUTATIONS_CENTER_PATH % center, 'a') as f: - f.write(newCenterRow) - storeFile(syn, MUTATIONS_PATH, parent= consortiumReleaseSynId, genieVersion=genieVersion, name="data_mutations_extended.txt", staging=current_release_staging) + # centerMafFileViewSynId = databaseSynIdMappingDf['Id'][databaseSynIdMappingDf['Database'] == "centerMafView"][0] + # centerMafSynIds = syn.tableQuery("select id from %s " % centerMafFileViewSynId + "where name like '%mutation%'") + # centerMafSynIdsDf = centerMafSynIds.asDataFrame() + # with open(MUTATIONS_PATH, 'w') as f: + # f.write(sequenced_samples + "\n") + # for index, mafSynId in enumerate(centerMafSynIdsDf.id): + # mafEnt = syn.get(mafSynId) + # logger.info(mafEnt.path) + # with open(mafEnt.path,"r") as mafFile: + # header = mafFile.readline() + # headers = header.replace("\n","").split("\t") + # if index == 0: + # with open(MUTATIONS_PATH, 'a') as f: + # f.write(header) + # #Create maf file per center for their staging directory + # for center in clinicalDf['CENTER'].unique(): + # with open(MUTATIONS_CENTER_PATH % center, 'w') as f: + # f.write(header) + # # with open(mafEnt.path,"r") as newMafFile: + # # newMafFile.readline() + # center = 
mafEnt.path.split("_")[3] + # #Make sure to only write the centers that release = True + # if center in CENTER_MAPPING_DF.center.tolist(): + # for row in mafFile: + # rowArray = row.replace("\n","").split("\t") + # center = rowArray[headers.index('Center')] + # newMergedRow = configureMafRow(rowArray, headers, keepForMergedConsortiumSamples, remove_mafInBed_variants) + # if newMergedRow is not None: + # with open(MUTATIONS_PATH, 'a') as f: + # f.write(newMergedRow) + # newCenterRow = configureMafRow(rowArray, headers, keepForCenterConsortiumSamples, remove_mafInBed_variants) + # if newCenterRow is not None: + # with open(MUTATIONS_CENTER_PATH % center, 'a') as f: + # f.write(newCenterRow) + # storeFile(syn, MUTATIONS_PATH, parent= consortiumReleaseSynId, genieVersion=genieVersion, name="data_mutations_extended.txt", staging=current_release_staging) if not current_release_staging: for center in clinicalDf['CENTER'].unique(): storeFile(syn, MUTATIONS_CENTER_PATH % center, genieVersion=genieVersion, parent = CENTER_MAPPING_DF['stagingSynId'][CENTER_MAPPING_DF['center'] == center][0], centerStaging=True) From 525e8d988489528d528e42d2a839810c3ee444d2 Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Fri, 1 Feb 2019 12:53:58 -0800 Subject: [PATCH 15/27] comment out filter --- genie/database_to_staging.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/genie/database_to_staging.py b/genie/database_to_staging.py index 0dc708b0..57e9cce9 100644 --- a/genie/database_to_staging.py +++ b/genie/database_to_staging.py @@ -266,8 +266,8 @@ def stagingToCbio(syn, processingDate, genieVersion, CENTER_MAPPING_DF, database #########FILTERING######### logger.info("REMOVING PHI") clinicalDf = reAnnotatePHI(clinicalDf) - logger.info("MAF IN BED FILTER") - remove_mafInBed_variants = runMAFinBED(syn, CENTER_MAPPING_DF, databaseSynIdMappingDf, test, genieVersion=genieVersion) + #logger.info("MAF IN BED FILTER") + #remove_mafInBed_variants = runMAFinBED(syn, CENTER_MAPPING_DF, databaseSynIdMappingDf, test, genieVersion=genieVersion) logger.info("MUTATION IN CIS FILTER") remove_mutationInCis_samples = mutation_in_cis_filter(syn, skipMutationsInCis, test, variant_filtering_synId, CENTER_MAPPING_DF, genieVersion=genieVersion) logger.info("SEQ DATE FILTER") From abf0752cf722769fb71ec419b1c2c5a7e58a8367 Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Fri, 1 Feb 2019 12:56:38 -0800 Subject: [PATCH 16/27] Uncomment --- genie/database_to_staging.py | 74 ++++++++++++++++++------------------ 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/genie/database_to_staging.py b/genie/database_to_staging.py index 57e9cce9..0dc6ac43 100644 --- a/genie/database_to_staging.py +++ b/genie/database_to_staging.py @@ -266,8 +266,8 @@ def stagingToCbio(syn, processingDate, genieVersion, CENTER_MAPPING_DF, database #########FILTERING######### logger.info("REMOVING PHI") clinicalDf = reAnnotatePHI(clinicalDf) - #logger.info("MAF IN BED FILTER") - #remove_mafInBed_variants = runMAFinBED(syn, CENTER_MAPPING_DF, databaseSynIdMappingDf, test, genieVersion=genieVersion) + logger.info("MAF IN BED FILTER") + remove_mafInBed_variants = runMAFinBED(syn, CENTER_MAPPING_DF, databaseSynIdMappingDf, test, genieVersion=genieVersion) logger.info("MUTATION IN CIS FILTER") remove_mutationInCis_samples = mutation_in_cis_filter(syn, skipMutationsInCis, test, variant_filtering_synId, CENTER_MAPPING_DF, genieVersion=genieVersion) logger.info("SEQ DATE FILTER") @@ -365,41 +365,41 @@ def stagingToCbio(syn, processingDate, genieVersion, 
CENTER_MAPPING_DF, database sequenced_samples = "#sequenced_samples: " + " ".join(samples) logger.info("FILTERING, STORING MUTATION FILES") - # centerMafFileViewSynId = databaseSynIdMappingDf['Id'][databaseSynIdMappingDf['Database'] == "centerMafView"][0] - # centerMafSynIds = syn.tableQuery("select id from %s " % centerMafFileViewSynId + "where name like '%mutation%'") - # centerMafSynIdsDf = centerMafSynIds.asDataFrame() - # with open(MUTATIONS_PATH, 'w') as f: - # f.write(sequenced_samples + "\n") - # for index, mafSynId in enumerate(centerMafSynIdsDf.id): - # mafEnt = syn.get(mafSynId) - # logger.info(mafEnt.path) - # with open(mafEnt.path,"r") as mafFile: - # header = mafFile.readline() - # headers = header.replace("\n","").split("\t") - # if index == 0: - # with open(MUTATIONS_PATH, 'a') as f: - # f.write(header) - # #Create maf file per center for their staging directory - # for center in clinicalDf['CENTER'].unique(): - # with open(MUTATIONS_CENTER_PATH % center, 'w') as f: - # f.write(header) - # # with open(mafEnt.path,"r") as newMafFile: - # # newMafFile.readline() - # center = mafEnt.path.split("_")[3] - # #Make sure to only write the centers that release = True - # if center in CENTER_MAPPING_DF.center.tolist(): - # for row in mafFile: - # rowArray = row.replace("\n","").split("\t") - # center = rowArray[headers.index('Center')] - # newMergedRow = configureMafRow(rowArray, headers, keepForMergedConsortiumSamples, remove_mafInBed_variants) - # if newMergedRow is not None: - # with open(MUTATIONS_PATH, 'a') as f: - # f.write(newMergedRow) - # newCenterRow = configureMafRow(rowArray, headers, keepForCenterConsortiumSamples, remove_mafInBed_variants) - # if newCenterRow is not None: - # with open(MUTATIONS_CENTER_PATH % center, 'a') as f: - # f.write(newCenterRow) - # storeFile(syn, MUTATIONS_PATH, parent= consortiumReleaseSynId, genieVersion=genieVersion, name="data_mutations_extended.txt", staging=current_release_staging) + centerMafFileViewSynId = databaseSynIdMappingDf['Id'][databaseSynIdMappingDf['Database'] == "centerMafView"][0] + centerMafSynIds = syn.tableQuery("select id from %s " % centerMafFileViewSynId + "where name like '%mutation%'") + centerMafSynIdsDf = centerMafSynIds.asDataFrame() + with open(MUTATIONS_PATH, 'w') as f: + f.write(sequenced_samples + "\n") + for index, mafSynId in enumerate(centerMafSynIdsDf.id): + mafEnt = syn.get(mafSynId) + logger.info(mafEnt.path) + with open(mafEnt.path,"r") as mafFile: + header = mafFile.readline() + headers = header.replace("\n","").split("\t") + if index == 0: + with open(MUTATIONS_PATH, 'a') as f: + f.write(header) + #Create maf file per center for their staging directory + for center in clinicalDf['CENTER'].unique(): + with open(MUTATIONS_CENTER_PATH % center, 'w') as f: + f.write(header) + # with open(mafEnt.path,"r") as newMafFile: + # newMafFile.readline() + center = mafEnt.path.split("_")[3] + #Make sure to only write the centers that release = True + if center in CENTER_MAPPING_DF.center.tolist(): + for row in mafFile: + rowArray = row.replace("\n","").split("\t") + center = rowArray[headers.index('Center')] + newMergedRow = configureMafRow(rowArray, headers, keepForMergedConsortiumSamples, remove_mafInBed_variants) + if newMergedRow is not None: + with open(MUTATIONS_PATH, 'a') as f: + f.write(newMergedRow) + newCenterRow = configureMafRow(rowArray, headers, keepForCenterConsortiumSamples, remove_mafInBed_variants) + if newCenterRow is not None: + with open(MUTATIONS_CENTER_PATH % center, 'a') as f: + 
f.write(newCenterRow) + storeFile(syn, MUTATIONS_PATH, parent= consortiumReleaseSynId, genieVersion=genieVersion, name="data_mutations_extended.txt", staging=current_release_staging) if not current_release_staging: for center in clinicalDf['CENTER'].unique(): storeFile(syn, MUTATIONS_CENTER_PATH % center, genieVersion=genieVersion, parent = CENTER_MAPPING_DF['stagingSynId'][CENTER_MAPPING_DF['center'] == center][0], centerStaging=True) From 6dca94d1887e183338d44dbe39e527c4c40033c5 Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Fri, 1 Feb 2019 13:13:52 -0800 Subject: [PATCH 17/27] Comment out --- genie/database_to_staging.py | 74 ++++++++++++++++++------------------ 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/genie/database_to_staging.py b/genie/database_to_staging.py index 0dc6ac43..167af655 100644 --- a/genie/database_to_staging.py +++ b/genie/database_to_staging.py @@ -266,8 +266,8 @@ def stagingToCbio(syn, processingDate, genieVersion, CENTER_MAPPING_DF, database #########FILTERING######### logger.info("REMOVING PHI") clinicalDf = reAnnotatePHI(clinicalDf) - logger.info("MAF IN BED FILTER") - remove_mafInBed_variants = runMAFinBED(syn, CENTER_MAPPING_DF, databaseSynIdMappingDf, test, genieVersion=genieVersion) + #logger.info("MAF IN BED FILTER") + #remove_mafInBed_variants = runMAFinBED(syn, CENTER_MAPPING_DF, databaseSynIdMappingDf, test, genieVersion=genieVersion) logger.info("MUTATION IN CIS FILTER") remove_mutationInCis_samples = mutation_in_cis_filter(syn, skipMutationsInCis, test, variant_filtering_synId, CENTER_MAPPING_DF, genieVersion=genieVersion) logger.info("SEQ DATE FILTER") @@ -368,41 +368,41 @@ def stagingToCbio(syn, processingDate, genieVersion, CENTER_MAPPING_DF, database centerMafFileViewSynId = databaseSynIdMappingDf['Id'][databaseSynIdMappingDf['Database'] == "centerMafView"][0] centerMafSynIds = syn.tableQuery("select id from %s " % centerMafFileViewSynId + "where name like '%mutation%'") centerMafSynIdsDf = centerMafSynIds.asDataFrame() - with open(MUTATIONS_PATH, 'w') as f: - f.write(sequenced_samples + "\n") - for index, mafSynId in enumerate(centerMafSynIdsDf.id): - mafEnt = syn.get(mafSynId) - logger.info(mafEnt.path) - with open(mafEnt.path,"r") as mafFile: - header = mafFile.readline() - headers = header.replace("\n","").split("\t") - if index == 0: - with open(MUTATIONS_PATH, 'a') as f: - f.write(header) - #Create maf file per center for their staging directory - for center in clinicalDf['CENTER'].unique(): - with open(MUTATIONS_CENTER_PATH % center, 'w') as f: - f.write(header) - # with open(mafEnt.path,"r") as newMafFile: - # newMafFile.readline() - center = mafEnt.path.split("_")[3] - #Make sure to only write the centers that release = True - if center in CENTER_MAPPING_DF.center.tolist(): - for row in mafFile: - rowArray = row.replace("\n","").split("\t") - center = rowArray[headers.index('Center')] - newMergedRow = configureMafRow(rowArray, headers, keepForMergedConsortiumSamples, remove_mafInBed_variants) - if newMergedRow is not None: - with open(MUTATIONS_PATH, 'a') as f: - f.write(newMergedRow) - newCenterRow = configureMafRow(rowArray, headers, keepForCenterConsortiumSamples, remove_mafInBed_variants) - if newCenterRow is not None: - with open(MUTATIONS_CENTER_PATH % center, 'a') as f: - f.write(newCenterRow) - storeFile(syn, MUTATIONS_PATH, parent= consortiumReleaseSynId, genieVersion=genieVersion, name="data_mutations_extended.txt", staging=current_release_staging) - if not current_release_staging: - for center in 
clinicalDf['CENTER'].unique(): - storeFile(syn, MUTATIONS_CENTER_PATH % center, genieVersion=genieVersion, parent = CENTER_MAPPING_DF['stagingSynId'][CENTER_MAPPING_DF['center'] == center][0], centerStaging=True) + # with open(MUTATIONS_PATH, 'w') as f: + # f.write(sequenced_samples + "\n") + # for index, mafSynId in enumerate(centerMafSynIdsDf.id): + # mafEnt = syn.get(mafSynId) + # logger.info(mafEnt.path) + # with open(mafEnt.path,"r") as mafFile: + # header = mafFile.readline() + # headers = header.replace("\n","").split("\t") + # if index == 0: + # with open(MUTATIONS_PATH, 'a') as f: + # f.write(header) + # #Create maf file per center for their staging directory + # for center in clinicalDf['CENTER'].unique(): + # with open(MUTATIONS_CENTER_PATH % center, 'w') as f: + # f.write(header) + # # with open(mafEnt.path,"r") as newMafFile: + # # newMafFile.readline() + # center = mafEnt.path.split("_")[3] + # #Make sure to only write the centers that release = True + # if center in CENTER_MAPPING_DF.center.tolist(): + # for row in mafFile: + # rowArray = row.replace("\n","").split("\t") + # center = rowArray[headers.index('Center')] + # newMergedRow = configureMafRow(rowArray, headers, keepForMergedConsortiumSamples, remove_mafInBed_variants) + # if newMergedRow is not None: + # with open(MUTATIONS_PATH, 'a') as f: + # f.write(newMergedRow) + # newCenterRow = configureMafRow(rowArray, headers, keepForCenterConsortiumSamples, remove_mafInBed_variants) + # if newCenterRow is not None: + # with open(MUTATIONS_CENTER_PATH % center, 'a') as f: + # f.write(newCenterRow) + # storeFile(syn, MUTATIONS_PATH, parent= consortiumReleaseSynId, genieVersion=genieVersion, name="data_mutations_extended.txt", staging=current_release_staging) + # if not current_release_staging: + # for center in clinicalDf['CENTER'].unique(): + # storeFile(syn, MUTATIONS_CENTER_PATH % center, genieVersion=genieVersion, parent = CENTER_MAPPING_DF['stagingSynId'][CENTER_MAPPING_DF['center'] == center][0], centerStaging=True) #Only need to upload these files once #if filtering: From 04ddf9e411bd8af17d4de6c3b87ede0455b67430 Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Fri, 1 Feb 2019 13:16:10 -0800 Subject: [PATCH 18/27] uncomment --- genie/database_to_staging.py | 74 ++++++++++++++++++------------------ 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/genie/database_to_staging.py b/genie/database_to_staging.py index 167af655..0dc6ac43 100644 --- a/genie/database_to_staging.py +++ b/genie/database_to_staging.py @@ -266,8 +266,8 @@ def stagingToCbio(syn, processingDate, genieVersion, CENTER_MAPPING_DF, database #########FILTERING######### logger.info("REMOVING PHI") clinicalDf = reAnnotatePHI(clinicalDf) - #logger.info("MAF IN BED FILTER") - #remove_mafInBed_variants = runMAFinBED(syn, CENTER_MAPPING_DF, databaseSynIdMappingDf, test, genieVersion=genieVersion) + logger.info("MAF IN BED FILTER") + remove_mafInBed_variants = runMAFinBED(syn, CENTER_MAPPING_DF, databaseSynIdMappingDf, test, genieVersion=genieVersion) logger.info("MUTATION IN CIS FILTER") remove_mutationInCis_samples = mutation_in_cis_filter(syn, skipMutationsInCis, test, variant_filtering_synId, CENTER_MAPPING_DF, genieVersion=genieVersion) logger.info("SEQ DATE FILTER") @@ -368,41 +368,41 @@ def stagingToCbio(syn, processingDate, genieVersion, CENTER_MAPPING_DF, database centerMafFileViewSynId = databaseSynIdMappingDf['Id'][databaseSynIdMappingDf['Database'] == "centerMafView"][0] centerMafSynIds = syn.tableQuery("select id from %s " % 
centerMafFileViewSynId + "where name like '%mutation%'") centerMafSynIdsDf = centerMafSynIds.asDataFrame() - # with open(MUTATIONS_PATH, 'w') as f: - # f.write(sequenced_samples + "\n") - # for index, mafSynId in enumerate(centerMafSynIdsDf.id): - # mafEnt = syn.get(mafSynId) - # logger.info(mafEnt.path) - # with open(mafEnt.path,"r") as mafFile: - # header = mafFile.readline() - # headers = header.replace("\n","").split("\t") - # if index == 0: - # with open(MUTATIONS_PATH, 'a') as f: - # f.write(header) - # #Create maf file per center for their staging directory - # for center in clinicalDf['CENTER'].unique(): - # with open(MUTATIONS_CENTER_PATH % center, 'w') as f: - # f.write(header) - # # with open(mafEnt.path,"r") as newMafFile: - # # newMafFile.readline() - # center = mafEnt.path.split("_")[3] - # #Make sure to only write the centers that release = True - # if center in CENTER_MAPPING_DF.center.tolist(): - # for row in mafFile: - # rowArray = row.replace("\n","").split("\t") - # center = rowArray[headers.index('Center')] - # newMergedRow = configureMafRow(rowArray, headers, keepForMergedConsortiumSamples, remove_mafInBed_variants) - # if newMergedRow is not None: - # with open(MUTATIONS_PATH, 'a') as f: - # f.write(newMergedRow) - # newCenterRow = configureMafRow(rowArray, headers, keepForCenterConsortiumSamples, remove_mafInBed_variants) - # if newCenterRow is not None: - # with open(MUTATIONS_CENTER_PATH % center, 'a') as f: - # f.write(newCenterRow) - # storeFile(syn, MUTATIONS_PATH, parent= consortiumReleaseSynId, genieVersion=genieVersion, name="data_mutations_extended.txt", staging=current_release_staging) - # if not current_release_staging: - # for center in clinicalDf['CENTER'].unique(): - # storeFile(syn, MUTATIONS_CENTER_PATH % center, genieVersion=genieVersion, parent = CENTER_MAPPING_DF['stagingSynId'][CENTER_MAPPING_DF['center'] == center][0], centerStaging=True) + with open(MUTATIONS_PATH, 'w') as f: + f.write(sequenced_samples + "\n") + for index, mafSynId in enumerate(centerMafSynIdsDf.id): + mafEnt = syn.get(mafSynId) + logger.info(mafEnt.path) + with open(mafEnt.path,"r") as mafFile: + header = mafFile.readline() + headers = header.replace("\n","").split("\t") + if index == 0: + with open(MUTATIONS_PATH, 'a') as f: + f.write(header) + #Create maf file per center for their staging directory + for center in clinicalDf['CENTER'].unique(): + with open(MUTATIONS_CENTER_PATH % center, 'w') as f: + f.write(header) + # with open(mafEnt.path,"r") as newMafFile: + # newMafFile.readline() + center = mafEnt.path.split("_")[3] + #Make sure to only write the centers that release = True + if center in CENTER_MAPPING_DF.center.tolist(): + for row in mafFile: + rowArray = row.replace("\n","").split("\t") + center = rowArray[headers.index('Center')] + newMergedRow = configureMafRow(rowArray, headers, keepForMergedConsortiumSamples, remove_mafInBed_variants) + if newMergedRow is not None: + with open(MUTATIONS_PATH, 'a') as f: + f.write(newMergedRow) + newCenterRow = configureMafRow(rowArray, headers, keepForCenterConsortiumSamples, remove_mafInBed_variants) + if newCenterRow is not None: + with open(MUTATIONS_CENTER_PATH % center, 'a') as f: + f.write(newCenterRow) + storeFile(syn, MUTATIONS_PATH, parent= consortiumReleaseSynId, genieVersion=genieVersion, name="data_mutations_extended.txt", staging=current_release_staging) + if not current_release_staging: + for center in clinicalDf['CENTER'].unique(): + storeFile(syn, MUTATIONS_CENTER_PATH % center, genieVersion=genieVersion, 
parent = CENTER_MAPPING_DF['stagingSynId'][CENTER_MAPPING_DF['center'] == center][0], centerStaging=True) #Only need to upload these files once #if filtering: From f67ea50bb2cc3c31d4159997f65e60d309d2aa8a Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Fri, 1 Feb 2019 13:57:48 -0800 Subject: [PATCH 19/27] Comment --- genie/dashboard_table_updater.py | 2 +- genie/database_to_staging.py | 74 ++++++++++++++++---------------- 2 files changed, 38 insertions(+), 38 deletions(-) diff --git a/genie/dashboard_table_updater.py b/genie/dashboard_table_updater.py index 7d552fc4..b6ea22f4 100644 --- a/genie/dashboard_table_updater.py +++ b/genie/dashboard_table_updater.py @@ -256,7 +256,7 @@ def update_oncotree_code_tables(syn, database_mappingdf): oncotree_link_ent = syn.get(oncotree_link_synid) oncotree_link = oncotree_link_ent.externalURL oncotree_mapping = genie.process_functions.get_oncotree_code_mappings(oncotree_link) - clinicaldf['PRIMARY_CODES'] = [oncotree_mapping[i.upper()]['ONCOTREE_PRIMARY_NODE'] for i in clinicaldf.ONCOTREE_CODE] + clinicaldf['PRIMARY_CODES'] = [oncotree_mapping[i.upper()]['ONCOTREE_PRIMARY_NODE'] if i.upper() in oncotree_mapping.keys() else 'NOT_MAPPED' for i in clinicaldf.ONCOTREE_CODE] # ### DISTRIBUTION OF PRIMARY ONCOTREE CODE TABLE UPDATE primary_code_distributiondf = pd.DataFrame(columns=set(clinicaldf['CENTER']), index=set(clinicaldf['PRIMARY_CODES'])) diff --git a/genie/database_to_staging.py b/genie/database_to_staging.py index 0dc6ac43..167af655 100644 --- a/genie/database_to_staging.py +++ b/genie/database_to_staging.py @@ -266,8 +266,8 @@ def stagingToCbio(syn, processingDate, genieVersion, CENTER_MAPPING_DF, database #########FILTERING######### logger.info("REMOVING PHI") clinicalDf = reAnnotatePHI(clinicalDf) - logger.info("MAF IN BED FILTER") - remove_mafInBed_variants = runMAFinBED(syn, CENTER_MAPPING_DF, databaseSynIdMappingDf, test, genieVersion=genieVersion) + #logger.info("MAF IN BED FILTER") + #remove_mafInBed_variants = runMAFinBED(syn, CENTER_MAPPING_DF, databaseSynIdMappingDf, test, genieVersion=genieVersion) logger.info("MUTATION IN CIS FILTER") remove_mutationInCis_samples = mutation_in_cis_filter(syn, skipMutationsInCis, test, variant_filtering_synId, CENTER_MAPPING_DF, genieVersion=genieVersion) logger.info("SEQ DATE FILTER") @@ -368,41 +368,41 @@ def stagingToCbio(syn, processingDate, genieVersion, CENTER_MAPPING_DF, database centerMafFileViewSynId = databaseSynIdMappingDf['Id'][databaseSynIdMappingDf['Database'] == "centerMafView"][0] centerMafSynIds = syn.tableQuery("select id from %s " % centerMafFileViewSynId + "where name like '%mutation%'") centerMafSynIdsDf = centerMafSynIds.asDataFrame() - with open(MUTATIONS_PATH, 'w') as f: - f.write(sequenced_samples + "\n") - for index, mafSynId in enumerate(centerMafSynIdsDf.id): - mafEnt = syn.get(mafSynId) - logger.info(mafEnt.path) - with open(mafEnt.path,"r") as mafFile: - header = mafFile.readline() - headers = header.replace("\n","").split("\t") - if index == 0: - with open(MUTATIONS_PATH, 'a') as f: - f.write(header) - #Create maf file per center for their staging directory - for center in clinicalDf['CENTER'].unique(): - with open(MUTATIONS_CENTER_PATH % center, 'w') as f: - f.write(header) - # with open(mafEnt.path,"r") as newMafFile: - # newMafFile.readline() - center = mafEnt.path.split("_")[3] - #Make sure to only write the centers that release = True - if center in CENTER_MAPPING_DF.center.tolist(): - for row in mafFile: - rowArray = row.replace("\n","").split("\t") - center = 
rowArray[headers.index('Center')] - newMergedRow = configureMafRow(rowArray, headers, keepForMergedConsortiumSamples, remove_mafInBed_variants) - if newMergedRow is not None: - with open(MUTATIONS_PATH, 'a') as f: - f.write(newMergedRow) - newCenterRow = configureMafRow(rowArray, headers, keepForCenterConsortiumSamples, remove_mafInBed_variants) - if newCenterRow is not None: - with open(MUTATIONS_CENTER_PATH % center, 'a') as f: - f.write(newCenterRow) - storeFile(syn, MUTATIONS_PATH, parent= consortiumReleaseSynId, genieVersion=genieVersion, name="data_mutations_extended.txt", staging=current_release_staging) - if not current_release_staging: - for center in clinicalDf['CENTER'].unique(): - storeFile(syn, MUTATIONS_CENTER_PATH % center, genieVersion=genieVersion, parent = CENTER_MAPPING_DF['stagingSynId'][CENTER_MAPPING_DF['center'] == center][0], centerStaging=True) + # with open(MUTATIONS_PATH, 'w') as f: + # f.write(sequenced_samples + "\n") + # for index, mafSynId in enumerate(centerMafSynIdsDf.id): + # mafEnt = syn.get(mafSynId) + # logger.info(mafEnt.path) + # with open(mafEnt.path,"r") as mafFile: + # header = mafFile.readline() + # headers = header.replace("\n","").split("\t") + # if index == 0: + # with open(MUTATIONS_PATH, 'a') as f: + # f.write(header) + # #Create maf file per center for their staging directory + # for center in clinicalDf['CENTER'].unique(): + # with open(MUTATIONS_CENTER_PATH % center, 'w') as f: + # f.write(header) + # # with open(mafEnt.path,"r") as newMafFile: + # # newMafFile.readline() + # center = mafEnt.path.split("_")[3] + # #Make sure to only write the centers that release = True + # if center in CENTER_MAPPING_DF.center.tolist(): + # for row in mafFile: + # rowArray = row.replace("\n","").split("\t") + # center = rowArray[headers.index('Center')] + # newMergedRow = configureMafRow(rowArray, headers, keepForMergedConsortiumSamples, remove_mafInBed_variants) + # if newMergedRow is not None: + # with open(MUTATIONS_PATH, 'a') as f: + # f.write(newMergedRow) + # newCenterRow = configureMafRow(rowArray, headers, keepForCenterConsortiumSamples, remove_mafInBed_variants) + # if newCenterRow is not None: + # with open(MUTATIONS_CENTER_PATH % center, 'a') as f: + # f.write(newCenterRow) + # storeFile(syn, MUTATIONS_PATH, parent= consortiumReleaseSynId, genieVersion=genieVersion, name="data_mutations_extended.txt", staging=current_release_staging) + # if not current_release_staging: + # for center in clinicalDf['CENTER'].unique(): + # storeFile(syn, MUTATIONS_CENTER_PATH % center, genieVersion=genieVersion, parent = CENTER_MAPPING_DF['stagingSynId'][CENTER_MAPPING_DF['center'] == center][0], centerStaging=True) #Only need to upload these files once #if filtering: From bf0504761538a708098b703cda84f92f525e9c08 Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Fri, 1 Feb 2019 14:14:03 -0800 Subject: [PATCH 20/27] Fix oncotree mapping --- genie/dashboardTemplate.Rmd | 8 ++++++-- genie/dashboard_table_updater.py | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/genie/dashboardTemplate.Rmd b/genie/dashboardTemplate.Rmd index a13dfff8..87bd606f 100644 --- a/genie/dashboardTemplate.Rmd +++ b/genie/dashboardTemplate.Rmd @@ -104,7 +104,11 @@ plotPrimarySites <- function(clinical, oncotreeLink, release) { oncotree_json = data$TISSUE oncotreeDict = extract(oncotree_json, "", "") clinical$PRIMARY_CODES <- unlist(sapply(clinical$ONCOTREE_CODE, function(code) { - oncotreeDict[[toupper(code)]]["ONCOTREE_PRIMARY_NODE"] + if (toupper(code) %in% 
names(oncotreeDict)){ + oncotreeDict[[toupper(code)]]["ONCOTREE_PRIMARY_NODE"] + } else { + "DEPRECATED_CODE" + } })) }) clinical$CENTER = createCenterColumn(clinical) @@ -288,7 +292,7 @@ kable(samplesPerReleaseDf,row.names = F) #primary site distribution par(mar=c(10,3,3,1)) barplot(sort(log(table(this_mut$FILTER)),decreasing = T), main="Log Distribution of Mutation FILTERs",las=2) -plotPrimarySites(this_samples, "http://oncotree.mskcc.org/api/tumorTypes/tree?version=oncotree_2017_06_21","%s") +plotPrimarySites(this_samples, "http://oncotree.mskcc.org/api/tumorTypes/tree?version=oncotree_2018_06_01","%s") #Center X Race plotCenterXRace(this_patient) #Center X Ethnicity diff --git a/genie/dashboard_table_updater.py b/genie/dashboard_table_updater.py index b6ea22f4..285b0e6d 100644 --- a/genie/dashboard_table_updater.py +++ b/genie/dashboard_table_updater.py @@ -256,7 +256,7 @@ def update_oncotree_code_tables(syn, database_mappingdf): oncotree_link_ent = syn.get(oncotree_link_synid) oncotree_link = oncotree_link_ent.externalURL oncotree_mapping = genie.process_functions.get_oncotree_code_mappings(oncotree_link) - clinicaldf['PRIMARY_CODES'] = [oncotree_mapping[i.upper()]['ONCOTREE_PRIMARY_NODE'] if i.upper() in oncotree_mapping.keys() else 'NOT_MAPPED' for i in clinicaldf.ONCOTREE_CODE] + clinicaldf['PRIMARY_CODES'] = [oncotree_mapping[i.upper()]['ONCOTREE_PRIMARY_NODE'] if i.upper() in oncotree_mapping.keys() else 'DEPRECATED_CODE' for i in clinicaldf.ONCOTREE_CODE] # ### DISTRIBUTION OF PRIMARY ONCOTREE CODE TABLE UPDATE primary_code_distributiondf = pd.DataFrame(columns=set(clinicaldf['CENTER']), index=set(clinicaldf['PRIMARY_CODES'])) From afab94f40a65c3a030f3f351a5f87ce78a1e6749 Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Fri, 1 Feb 2019 14:17:06 -0800 Subject: [PATCH 21/27] uncomment --- genie/database_to_staging.py | 218 +++++++++++++++++------------------ 1 file changed, 109 insertions(+), 109 deletions(-) diff --git a/genie/database_to_staging.py b/genie/database_to_staging.py index 167af655..6ee6e5b7 100644 --- a/genie/database_to_staging.py +++ b/genie/database_to_staging.py @@ -266,8 +266,8 @@ def stagingToCbio(syn, processingDate, genieVersion, CENTER_MAPPING_DF, database #########FILTERING######### logger.info("REMOVING PHI") clinicalDf = reAnnotatePHI(clinicalDf) - #logger.info("MAF IN BED FILTER") - #remove_mafInBed_variants = runMAFinBED(syn, CENTER_MAPPING_DF, databaseSynIdMappingDf, test, genieVersion=genieVersion) + logger.info("MAF IN BED FILTER") + remove_mafInBed_variants = runMAFinBED(syn, CENTER_MAPPING_DF, databaseSynIdMappingDf, test, genieVersion=genieVersion) logger.info("MUTATION IN CIS FILTER") remove_mutationInCis_samples = mutation_in_cis_filter(syn, skipMutationsInCis, test, variant_filtering_synId, CENTER_MAPPING_DF, genieVersion=genieVersion) logger.info("SEQ DATE FILTER") @@ -368,41 +368,41 @@ def stagingToCbio(syn, processingDate, genieVersion, CENTER_MAPPING_DF, database centerMafFileViewSynId = databaseSynIdMappingDf['Id'][databaseSynIdMappingDf['Database'] == "centerMafView"][0] centerMafSynIds = syn.tableQuery("select id from %s " % centerMafFileViewSynId + "where name like '%mutation%'") centerMafSynIdsDf = centerMafSynIds.asDataFrame() - # with open(MUTATIONS_PATH, 'w') as f: - # f.write(sequenced_samples + "\n") - # for index, mafSynId in enumerate(centerMafSynIdsDf.id): - # mafEnt = syn.get(mafSynId) - # logger.info(mafEnt.path) - # with open(mafEnt.path,"r") as mafFile: - # header = mafFile.readline() - # headers = 
header.replace("\n","").split("\t") - # if index == 0: - # with open(MUTATIONS_PATH, 'a') as f: - # f.write(header) - # #Create maf file per center for their staging directory - # for center in clinicalDf['CENTER'].unique(): - # with open(MUTATIONS_CENTER_PATH % center, 'w') as f: - # f.write(header) - # # with open(mafEnt.path,"r") as newMafFile: - # # newMafFile.readline() - # center = mafEnt.path.split("_")[3] - # #Make sure to only write the centers that release = True - # if center in CENTER_MAPPING_DF.center.tolist(): - # for row in mafFile: - # rowArray = row.replace("\n","").split("\t") - # center = rowArray[headers.index('Center')] - # newMergedRow = configureMafRow(rowArray, headers, keepForMergedConsortiumSamples, remove_mafInBed_variants) - # if newMergedRow is not None: - # with open(MUTATIONS_PATH, 'a') as f: - # f.write(newMergedRow) - # newCenterRow = configureMafRow(rowArray, headers, keepForCenterConsortiumSamples, remove_mafInBed_variants) - # if newCenterRow is not None: - # with open(MUTATIONS_CENTER_PATH % center, 'a') as f: - # f.write(newCenterRow) - # storeFile(syn, MUTATIONS_PATH, parent= consortiumReleaseSynId, genieVersion=genieVersion, name="data_mutations_extended.txt", staging=current_release_staging) - # if not current_release_staging: - # for center in clinicalDf['CENTER'].unique(): - # storeFile(syn, MUTATIONS_CENTER_PATH % center, genieVersion=genieVersion, parent = CENTER_MAPPING_DF['stagingSynId'][CENTER_MAPPING_DF['center'] == center][0], centerStaging=True) + with open(MUTATIONS_PATH, 'w') as f: + f.write(sequenced_samples + "\n") + for index, mafSynId in enumerate(centerMafSynIdsDf.id): + mafEnt = syn.get(mafSynId) + logger.info(mafEnt.path) + with open(mafEnt.path,"r") as mafFile: + header = mafFile.readline() + headers = header.replace("\n","").split("\t") + if index == 0: + with open(MUTATIONS_PATH, 'a') as f: + f.write(header) + #Create maf file per center for their staging directory + for center in clinicalDf['CENTER'].unique(): + with open(MUTATIONS_CENTER_PATH % center, 'w') as f: + f.write(header) + # with open(mafEnt.path,"r") as newMafFile: + # newMafFile.readline() + center = mafEnt.path.split("_")[3] + #Make sure to only write the centers that release = True + if center in CENTER_MAPPING_DF.center.tolist(): + for row in mafFile: + rowArray = row.replace("\n","").split("\t") + center = rowArray[headers.index('Center')] + newMergedRow = configureMafRow(rowArray, headers, keepForMergedConsortiumSamples, remove_mafInBed_variants) + if newMergedRow is not None: + with open(MUTATIONS_PATH, 'a') as f: + f.write(newMergedRow) + newCenterRow = configureMafRow(rowArray, headers, keepForCenterConsortiumSamples, remove_mafInBed_variants) + if newCenterRow is not None: + with open(MUTATIONS_CENTER_PATH % center, 'a') as f: + f.write(newCenterRow) + storeFile(syn, MUTATIONS_PATH, parent= consortiumReleaseSynId, genieVersion=genieVersion, name="data_mutations_extended.txt", staging=current_release_staging) + if not current_release_staging: + for center in clinicalDf['CENTER'].unique(): + storeFile(syn, MUTATIONS_CENTER_PATH % center, genieVersion=genieVersion, parent = CENTER_MAPPING_DF['stagingSynId'][CENTER_MAPPING_DF['center'] == center][0], centerStaging=True) #Only need to upload these files once #if filtering: @@ -740,79 +740,79 @@ def main(): process.checkUrl(args.oncotreeLink) #get syn id of case list folder in consortium release - caseListSynId = findCaseListId(syn, consortiumSynId) - - if not args.test and not args.staging: - 
processTrackerSynId = databaseSynIdMappingDf['Id'][databaseSynIdMappingDf['Database'] == 'processTracker'].values[0] - processTracker = syn.tableQuery("SELECT timeStartProcessing FROM %s where center = 'SAGE' and processingType = 'dbToStage'" % processTrackerSynId) - processTrackerDf = processTracker.asDataFrame() - processTrackerDf['timeStartProcessing'][0] = str(int(time.time()*1000)) - syn.store(synapseclient.Table(processTrackerSynId,processTrackerDf)) - - syn.table_query_timeout = 50000 - centerMappingSynId = databaseSynIdMappingDf['Id'][databaseSynIdMappingDf['Database'] == 'centerMapping'].values[0] - #Only release files where release is true - CENTER_MAPPING = syn.tableQuery('SELECT * FROM %s where release is true' % centerMappingSynId) - CENTER_MAPPING_DF = CENTER_MAPPING.asDataFrame() - processingDate = datetime.datetime.strptime(args.processingDate, '%b-%Y') -####### - cbioValidatorPath = os.path.join(args.cbioportalPath,"core/src/main/scripts/importer/validateData.py") - assert os.path.exists(cbioValidatorPath), "Please specify correct cbioportalPath" - - logger.info("STAGING TO CONSORTIUM") - genePanelEntities = stagingToCbio(syn, processingDate, args.genieVersion, CENTER_MAPPING_DF, databaseSynIdMappingDf, oncotree_url=args.oncotreeLink, consortiumReleaseCutOff= args.consortiumReleaseCutOff, current_release_staging=args.staging, skipMutationsInCis=args.skipMutationsInCis, test=args.test) +# caseListSynId = findCaseListId(syn, consortiumSynId) + +# if not args.test and not args.staging: +# processTrackerSynId = databaseSynIdMappingDf['Id'][databaseSynIdMappingDf['Database'] == 'processTracker'].values[0] +# processTracker = syn.tableQuery("SELECT timeStartProcessing FROM %s where center = 'SAGE' and processingType = 'dbToStage'" % processTrackerSynId) +# processTrackerDf = processTracker.asDataFrame() +# processTrackerDf['timeStartProcessing'][0] = str(int(time.time()*1000)) +# syn.store(synapseclient.Table(processTrackerSynId,processTrackerDf)) + +# syn.table_query_timeout = 50000 +# centerMappingSynId = databaseSynIdMappingDf['Id'][databaseSynIdMappingDf['Database'] == 'centerMapping'].values[0] +# #Only release files where release is true +# CENTER_MAPPING = syn.tableQuery('SELECT * FROM %s where release is true' % centerMappingSynId) +# CENTER_MAPPING_DF = CENTER_MAPPING.asDataFrame() +# processingDate = datetime.datetime.strptime(args.processingDate, '%b-%Y') +# ####### +# cbioValidatorPath = os.path.join(args.cbioportalPath,"core/src/main/scripts/importer/validateData.py") +# assert os.path.exists(cbioValidatorPath), "Please specify correct cbioportalPath" + +# logger.info("STAGING TO CONSORTIUM") +# genePanelEntities = stagingToCbio(syn, processingDate, args.genieVersion, CENTER_MAPPING_DF, databaseSynIdMappingDf, oncotree_url=args.oncotreeLink, consortiumReleaseCutOff= args.consortiumReleaseCutOff, current_release_staging=args.staging, skipMutationsInCis=args.skipMutationsInCis, test=args.test) - #No need to run twice anymore - #stagingToCbio(syn, processingDate, args.genieVersion, CENTER_MAPPING_DF, databaseSynIdMappingDf, oncotree_url=args.oncotreeLink, consortiumReleaseCutOff= args.consortiumReleaseCutOff, filtering=True, current_release_staging=args.staging, test=args.test) - #Create case lists files - logger.info("CREATE CASE LIST FILES") - #Remove old caselists first - if not os.path.exists(CASE_LIST_PATH): - os.mkdir(CASE_LIST_PATH) - caselists = os.listdir(CASE_LIST_PATH) - [os.remove(os.path.join(CASE_LIST_PATH,caselist)) for caselist in caselists] - 
CLINICAL_PATH = os.path.join(GENIE_RELEASE_DIR,'data_clinical_%s.txt' % args.genieVersion) - GENE_MATRIX_PATH = os.path.join(GENIE_RELEASE_DIR,"data_gene_matrix_%s.txt" % args.genieVersion) - create_case_lists.create_case_lists(CLINICAL_PATH, GENE_MATRIX_PATH, CASE_LIST_PATH, "genie_private") - caseListFiles = os.listdir(CASE_LIST_PATH) - caseListEntities = [] - for casePath in caseListFiles: - casePath = os.path.join(CASE_LIST_PATH, casePath) - caseListEntities.append(storeFile(syn, casePath, parent=caseListSynId, staging=args.staging, caseLists=True, genieVersion=args.genieVersion)) - - logger.info("REMOVING UNNECESSARY FILES") - genie_files = os.listdir(GENIE_RELEASE_DIR) - #deletePatterns = ('data_clinical_supp_patient_','data_clinical_supp_sample_','data_CNA_','data_mutations_extended_','data_fusions_','genie_private_data_cna_hg19_') - #[os.remove(os.path.join(GENIE_RELEASE_DIR,genieFile)) for genieFile in genie_files if genieFile.startswith(deletePatterns)] - [os.remove(os.path.join(GENIE_RELEASE_DIR,genieFile)) for genieFile in genie_files if args.genieVersion not in genieFile and "meta" not in genieFile and "case_lists" not in genieFile] - os.remove(CLINICAL_PATH) -####### - logger.info("REVISE METADATA FILES") - command_reviseMetadataFiles(syn, args, databaseSynIdMappingDf) - logger.info("CBIO VALIDATION") - #Must be exit 0 because the validator sometimes fails, but we still want to capture the output - command = [cbioValidatorPath,'-s',GENIE_RELEASE_DIR,'-n','; exit 0'] - cbioOutput = subprocess.check_output(" ".join(command), shell=True) - logger.info(cbioOutput.decode("utf-8")) - if not args.test and not args.staging: - with open("cbioValidatorLogsConsortium_%s.txt" % args.genieVersion, "w") as cbioLog: - cbioLog.write(cbioOutput.decode("utf-8")) - syn.store(File("cbioValidatorLogsConsortium_%s.txt" % args.genieVersion, parentId = "syn10155804")) - os.remove("cbioValidatorLogsConsortium_%s.txt" % args.genieVersion) - logger.info("REMOVING OLD FILES") - - process.rmFiles(CASE_LIST_PATH) - if os.path.exists('%s/genie_private_meta_cna_hg19_seg.txt' % GENIE_RELEASE_DIR): - os.unlink('%s/genie_private_meta_cna_hg19_seg.txt' % GENIE_RELEASE_DIR) - logger.info("CREATING LINK VERSION") - createLinkVersion(syn, args.genieVersion, caseListEntities, genePanelEntities, databaseSynIdMappingDf) - - if not args.test and not args.staging: - processTracker = syn.tableQuery("SELECT timeEndProcessing FROM %s where center = 'SAGE' and processingType = 'dbToStage'" % processTrackerSynId) - processTrackerDf = processTracker.asDataFrame() - processTrackerDf['timeEndProcessing'][0] = str(int(time.time()*1000)) - syn.store(synapseclient.Table(processTrackerSynId,processTrackerDf)) - logger.info("COMPLETED DATABASE TO STAGING") +# #No need to run twice anymore +# #stagingToCbio(syn, processingDate, args.genieVersion, CENTER_MAPPING_DF, databaseSynIdMappingDf, oncotree_url=args.oncotreeLink, consortiumReleaseCutOff= args.consortiumReleaseCutOff, filtering=True, current_release_staging=args.staging, test=args.test) +# #Create case lists files +# logger.info("CREATE CASE LIST FILES") +# #Remove old caselists first +# if not os.path.exists(CASE_LIST_PATH): +# os.mkdir(CASE_LIST_PATH) +# caselists = os.listdir(CASE_LIST_PATH) +# [os.remove(os.path.join(CASE_LIST_PATH,caselist)) for caselist in caselists] +# CLINICAL_PATH = os.path.join(GENIE_RELEASE_DIR,'data_clinical_%s.txt' % args.genieVersion) +# GENE_MATRIX_PATH = os.path.join(GENIE_RELEASE_DIR,"data_gene_matrix_%s.txt" % args.genieVersion) +# 
create_case_lists.create_case_lists(CLINICAL_PATH, GENE_MATRIX_PATH, CASE_LIST_PATH, "genie_private") +# caseListFiles = os.listdir(CASE_LIST_PATH) +# caseListEntities = [] +# for casePath in caseListFiles: +# casePath = os.path.join(CASE_LIST_PATH, casePath) +# caseListEntities.append(storeFile(syn, casePath, parent=caseListSynId, staging=args.staging, caseLists=True, genieVersion=args.genieVersion)) + +# logger.info("REMOVING UNNECESSARY FILES") +# genie_files = os.listdir(GENIE_RELEASE_DIR) +# #deletePatterns = ('data_clinical_supp_patient_','data_clinical_supp_sample_','data_CNA_','data_mutations_extended_','data_fusions_','genie_private_data_cna_hg19_') +# #[os.remove(os.path.join(GENIE_RELEASE_DIR,genieFile)) for genieFile in genie_files if genieFile.startswith(deletePatterns)] +# [os.remove(os.path.join(GENIE_RELEASE_DIR,genieFile)) for genieFile in genie_files if args.genieVersion not in genieFile and "meta" not in genieFile and "case_lists" not in genieFile] +# os.remove(CLINICAL_PATH) +# ####### +# logger.info("REVISE METADATA FILES") +# command_reviseMetadataFiles(syn, args, databaseSynIdMappingDf) +# logger.info("CBIO VALIDATION") +# #Must be exit 0 because the validator sometimes fails, but we still want to capture the output +# command = [cbioValidatorPath,'-s',GENIE_RELEASE_DIR,'-n','; exit 0'] +# cbioOutput = subprocess.check_output(" ".join(command), shell=True) +# logger.info(cbioOutput.decode("utf-8")) +# if not args.test and not args.staging: +# with open("cbioValidatorLogsConsortium_%s.txt" % args.genieVersion, "w") as cbioLog: +# cbioLog.write(cbioOutput.decode("utf-8")) +# syn.store(File("cbioValidatorLogsConsortium_%s.txt" % args.genieVersion, parentId = "syn10155804")) +# os.remove("cbioValidatorLogsConsortium_%s.txt" % args.genieVersion) +# logger.info("REMOVING OLD FILES") + +# process.rmFiles(CASE_LIST_PATH) +# if os.path.exists('%s/genie_private_meta_cna_hg19_seg.txt' % GENIE_RELEASE_DIR): +# os.unlink('%s/genie_private_meta_cna_hg19_seg.txt' % GENIE_RELEASE_DIR) +# logger.info("CREATING LINK VERSION") +# createLinkVersion(syn, args.genieVersion, caseListEntities, genePanelEntities, databaseSynIdMappingDf) + +# if not args.test and not args.staging: +# processTracker = syn.tableQuery("SELECT timeEndProcessing FROM %s where center = 'SAGE' and processingType = 'dbToStage'" % processTrackerSynId) +# processTrackerDf = processTracker.asDataFrame() +# processTrackerDf['timeEndProcessing'][0] = str(int(time.time()*1000)) +# syn.store(synapseclient.Table(processTrackerSynId,processTrackerDf)) +# logger.info("COMPLETED DATABASE TO STAGING") if not args.test: logger.info("DASHBOARD UPDATE") From 51f801f66acd9b676fa7848cc0eb977eda687b2c Mon Sep 17 00:00:00 2001 From: thomasyu888 Date: Sun, 3 Feb 2019 21:23:53 -0800 Subject: [PATCH 22/27] Remove comments --- genie/database_to_staging.py | 144 +++++++++++++++++------------------ 1 file changed, 72 insertions(+), 72 deletions(-) diff --git a/genie/database_to_staging.py b/genie/database_to_staging.py index 6ee6e5b7..0dc6ac43 100644 --- a/genie/database_to_staging.py +++ b/genie/database_to_staging.py @@ -740,79 +740,79 @@ def main(): process.checkUrl(args.oncotreeLink) #get syn id of case list folder in consortium release -# caseListSynId = findCaseListId(syn, consortiumSynId) - -# if not args.test and not args.staging: -# processTrackerSynId = databaseSynIdMappingDf['Id'][databaseSynIdMappingDf['Database'] == 'processTracker'].values[0] -# processTracker = syn.tableQuery("SELECT timeStartProcessing FROM %s where center = 
'SAGE' and processingType = 'dbToStage'" % processTrackerSynId) -# processTrackerDf = processTracker.asDataFrame() -# processTrackerDf['timeStartProcessing'][0] = str(int(time.time()*1000)) -# syn.store(synapseclient.Table(processTrackerSynId,processTrackerDf)) - -# syn.table_query_timeout = 50000 -# centerMappingSynId = databaseSynIdMappingDf['Id'][databaseSynIdMappingDf['Database'] == 'centerMapping'].values[0] -# #Only release files where release is true -# CENTER_MAPPING = syn.tableQuery('SELECT * FROM %s where release is true' % centerMappingSynId) -# CENTER_MAPPING_DF = CENTER_MAPPING.asDataFrame() -# processingDate = datetime.datetime.strptime(args.processingDate, '%b-%Y') -# ####### -# cbioValidatorPath = os.path.join(args.cbioportalPath,"core/src/main/scripts/importer/validateData.py") -# assert os.path.exists(cbioValidatorPath), "Please specify correct cbioportalPath" - -# logger.info("STAGING TO CONSORTIUM") -# genePanelEntities = stagingToCbio(syn, processingDate, args.genieVersion, CENTER_MAPPING_DF, databaseSynIdMappingDf, oncotree_url=args.oncotreeLink, consortiumReleaseCutOff= args.consortiumReleaseCutOff, current_release_staging=args.staging, skipMutationsInCis=args.skipMutationsInCis, test=args.test) + caseListSynId = findCaseListId(syn, consortiumSynId) + + if not args.test and not args.staging: + processTrackerSynId = databaseSynIdMappingDf['Id'][databaseSynIdMappingDf['Database'] == 'processTracker'].values[0] + processTracker = syn.tableQuery("SELECT timeStartProcessing FROM %s where center = 'SAGE' and processingType = 'dbToStage'" % processTrackerSynId) + processTrackerDf = processTracker.asDataFrame() + processTrackerDf['timeStartProcessing'][0] = str(int(time.time()*1000)) + syn.store(synapseclient.Table(processTrackerSynId,processTrackerDf)) + + syn.table_query_timeout = 50000 + centerMappingSynId = databaseSynIdMappingDf['Id'][databaseSynIdMappingDf['Database'] == 'centerMapping'].values[0] + #Only release files where release is true + CENTER_MAPPING = syn.tableQuery('SELECT * FROM %s where release is true' % centerMappingSynId) + CENTER_MAPPING_DF = CENTER_MAPPING.asDataFrame() + processingDate = datetime.datetime.strptime(args.processingDate, '%b-%Y') +####### + cbioValidatorPath = os.path.join(args.cbioportalPath,"core/src/main/scripts/importer/validateData.py") + assert os.path.exists(cbioValidatorPath), "Please specify correct cbioportalPath" + + logger.info("STAGING TO CONSORTIUM") + genePanelEntities = stagingToCbio(syn, processingDate, args.genieVersion, CENTER_MAPPING_DF, databaseSynIdMappingDf, oncotree_url=args.oncotreeLink, consortiumReleaseCutOff= args.consortiumReleaseCutOff, current_release_staging=args.staging, skipMutationsInCis=args.skipMutationsInCis, test=args.test) -# #No need to run twice anymore -# #stagingToCbio(syn, processingDate, args.genieVersion, CENTER_MAPPING_DF, databaseSynIdMappingDf, oncotree_url=args.oncotreeLink, consortiumReleaseCutOff= args.consortiumReleaseCutOff, filtering=True, current_release_staging=args.staging, test=args.test) -# #Create case lists files -# logger.info("CREATE CASE LIST FILES") -# #Remove old caselists first -# if not os.path.exists(CASE_LIST_PATH): -# os.mkdir(CASE_LIST_PATH) -# caselists = os.listdir(CASE_LIST_PATH) -# [os.remove(os.path.join(CASE_LIST_PATH,caselist)) for caselist in caselists] -# CLINICAL_PATH = os.path.join(GENIE_RELEASE_DIR,'data_clinical_%s.txt' % args.genieVersion) -# GENE_MATRIX_PATH = os.path.join(GENIE_RELEASE_DIR,"data_gene_matrix_%s.txt" % args.genieVersion) -# 
create_case_lists.create_case_lists(CLINICAL_PATH, GENE_MATRIX_PATH, CASE_LIST_PATH, "genie_private") -# caseListFiles = os.listdir(CASE_LIST_PATH) -# caseListEntities = [] -# for casePath in caseListFiles: -# casePath = os.path.join(CASE_LIST_PATH, casePath) -# caseListEntities.append(storeFile(syn, casePath, parent=caseListSynId, staging=args.staging, caseLists=True, genieVersion=args.genieVersion)) - -# logger.info("REMOVING UNNECESSARY FILES") -# genie_files = os.listdir(GENIE_RELEASE_DIR) -# #deletePatterns = ('data_clinical_supp_patient_','data_clinical_supp_sample_','data_CNA_','data_mutations_extended_','data_fusions_','genie_private_data_cna_hg19_') -# #[os.remove(os.path.join(GENIE_RELEASE_DIR,genieFile)) for genieFile in genie_files if genieFile.startswith(deletePatterns)] -# [os.remove(os.path.join(GENIE_RELEASE_DIR,genieFile)) for genieFile in genie_files if args.genieVersion not in genieFile and "meta" not in genieFile and "case_lists" not in genieFile] -# os.remove(CLINICAL_PATH) -# ####### -# logger.info("REVISE METADATA FILES") -# command_reviseMetadataFiles(syn, args, databaseSynIdMappingDf) -# logger.info("CBIO VALIDATION") -# #Must be exit 0 because the validator sometimes fails, but we still want to capture the output -# command = [cbioValidatorPath,'-s',GENIE_RELEASE_DIR,'-n','; exit 0'] -# cbioOutput = subprocess.check_output(" ".join(command), shell=True) -# logger.info(cbioOutput.decode("utf-8")) -# if not args.test and not args.staging: -# with open("cbioValidatorLogsConsortium_%s.txt" % args.genieVersion, "w") as cbioLog: -# cbioLog.write(cbioOutput.decode("utf-8")) -# syn.store(File("cbioValidatorLogsConsortium_%s.txt" % args.genieVersion, parentId = "syn10155804")) -# os.remove("cbioValidatorLogsConsortium_%s.txt" % args.genieVersion) -# logger.info("REMOVING OLD FILES") - -# process.rmFiles(CASE_LIST_PATH) -# if os.path.exists('%s/genie_private_meta_cna_hg19_seg.txt' % GENIE_RELEASE_DIR): -# os.unlink('%s/genie_private_meta_cna_hg19_seg.txt' % GENIE_RELEASE_DIR) -# logger.info("CREATING LINK VERSION") -# createLinkVersion(syn, args.genieVersion, caseListEntities, genePanelEntities, databaseSynIdMappingDf) - -# if not args.test and not args.staging: -# processTracker = syn.tableQuery("SELECT timeEndProcessing FROM %s where center = 'SAGE' and processingType = 'dbToStage'" % processTrackerSynId) -# processTrackerDf = processTracker.asDataFrame() -# processTrackerDf['timeEndProcessing'][0] = str(int(time.time()*1000)) -# syn.store(synapseclient.Table(processTrackerSynId,processTrackerDf)) -# logger.info("COMPLETED DATABASE TO STAGING") + #No need to run twice anymore + #stagingToCbio(syn, processingDate, args.genieVersion, CENTER_MAPPING_DF, databaseSynIdMappingDf, oncotree_url=args.oncotreeLink, consortiumReleaseCutOff= args.consortiumReleaseCutOff, filtering=True, current_release_staging=args.staging, test=args.test) + #Create case lists files + logger.info("CREATE CASE LIST FILES") + #Remove old caselists first + if not os.path.exists(CASE_LIST_PATH): + os.mkdir(CASE_LIST_PATH) + caselists = os.listdir(CASE_LIST_PATH) + [os.remove(os.path.join(CASE_LIST_PATH,caselist)) for caselist in caselists] + CLINICAL_PATH = os.path.join(GENIE_RELEASE_DIR,'data_clinical_%s.txt' % args.genieVersion) + GENE_MATRIX_PATH = os.path.join(GENIE_RELEASE_DIR,"data_gene_matrix_%s.txt" % args.genieVersion) + create_case_lists.create_case_lists(CLINICAL_PATH, GENE_MATRIX_PATH, CASE_LIST_PATH, "genie_private") + caseListFiles = os.listdir(CASE_LIST_PATH) + caseListEntities = [] + for 
casePath in caseListFiles:
+        casePath = os.path.join(CASE_LIST_PATH, casePath)
+        caseListEntities.append(storeFile(syn, casePath, parent=caseListSynId, staging=args.staging, caseLists=True, genieVersion=args.genieVersion))
+
+    logger.info("REMOVING UNNECESSARY FILES")
+    genie_files = os.listdir(GENIE_RELEASE_DIR)
+    #deletePatterns = ('data_clinical_supp_patient_','data_clinical_supp_sample_','data_CNA_','data_mutations_extended_','data_fusions_','genie_private_data_cna_hg19_')
+    #[os.remove(os.path.join(GENIE_RELEASE_DIR,genieFile)) for genieFile in genie_files if genieFile.startswith(deletePatterns)]
+    [os.remove(os.path.join(GENIE_RELEASE_DIR,genieFile)) for genieFile in genie_files if args.genieVersion not in genieFile and "meta" not in genieFile and "case_lists" not in genieFile]
+    os.remove(CLINICAL_PATH)
+#######
+    logger.info("REVISE METADATA FILES")
+    command_reviseMetadataFiles(syn, args, databaseSynIdMappingDf)
+    logger.info("CBIO VALIDATION")
+    #Must be exit 0 because the validator sometimes fails, but we still want to capture the output
+    command = [cbioValidatorPath,'-s',GENIE_RELEASE_DIR,'-n','; exit 0']
+    cbioOutput = subprocess.check_output(" ".join(command), shell=True)
+    logger.info(cbioOutput.decode("utf-8"))
+    if not args.test and not args.staging:
+        with open("cbioValidatorLogsConsortium_%s.txt" % args.genieVersion, "w") as cbioLog:
+            cbioLog.write(cbioOutput.decode("utf-8"))
+        syn.store(File("cbioValidatorLogsConsortium_%s.txt" % args.genieVersion, parentId = "syn10155804"))
+        os.remove("cbioValidatorLogsConsortium_%s.txt" % args.genieVersion)
+    logger.info("REMOVING OLD FILES")
+
+    process.rmFiles(CASE_LIST_PATH)
+    if os.path.exists('%s/genie_private_meta_cna_hg19_seg.txt' % GENIE_RELEASE_DIR):
+        os.unlink('%s/genie_private_meta_cna_hg19_seg.txt' % GENIE_RELEASE_DIR)
+    logger.info("CREATING LINK VERSION")
+    createLinkVersion(syn, args.genieVersion, caseListEntities, genePanelEntities, databaseSynIdMappingDf)
+
+    if not args.test and not args.staging:
+        processTracker = syn.tableQuery("SELECT timeEndProcessing FROM %s where center = 'SAGE' and processingType = 'dbToStage'" % processTrackerSynId)
+        processTrackerDf = processTracker.asDataFrame()
+        processTrackerDf['timeEndProcessing'][0] = str(int(time.time()*1000))
+        syn.store(synapseclient.Table(processTrackerSynId,processTrackerDf))
+    logger.info("COMPLETED DATABASE TO STAGING")
 
     if not args.test:
         logger.info("DASHBOARD UPDATE")

From fc24af5894511f02ce406fdac0901582ae6a7501 Mon Sep 17 00:00:00 2001
From: thomasyu888
Date: Sun, 3 Feb 2019 22:35:54 -0800
Subject: [PATCH 23/27] Make additions

---
 genie/dashboard_table_updater.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/genie/dashboard_table_updater.py b/genie/dashboard_table_updater.py
index 285b0e6d..bb2ecb22 100644
--- a/genie/dashboard_table_updater.py
+++ b/genie/dashboard_table_updater.py
@@ -421,9 +421,14 @@ def check_column_decreases(currentdf, olderdf):
         new_counts = currentdf[col].value_counts()
         if olderdf.get(col) is not None:
             old_counts = olderdf[col].value_counts()
+            #Make sure any values that exist in the new counts but not the old get added to the old (as zeros) to show the decrease
             new_keys = pd.Series(index=new_counts.keys()[~new_counts.keys().isin(old_counts.keys())])
             old_counts = old_counts.add(new_keys,fill_value=0)
             old_counts.fillna(0,inplace=True)
+            #Make sure any values that exist in the old counts but not the new get added to the new (as zeros) to show the decrease
+            new_keys = pd.Series(index=old_counts.keys()[~old_counts.keys().isin(new_counts.keys())])
+            new_counts = new_counts.add(new_keys,fill_value=0)
+            new_counts.fillna(0,inplace=True)
             if any(new_counts - old_counts < 0):
                 logger.info("\tDECREASE IN COLUMN: %s" % col)
                 diff = new_counts[new_counts - old_counts < 0]
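The bookkeeping in PATCH 23 is effectively an outer join of the two value_counts() indexes: any value seen in only one release gets a zero count in the other, so disappearances show up as negative differences. A minimal sketch of the same idea using pandas' built-in align(); the current/older series here are invented stand-ins for the real clinical columns:

    import pandas as pd

    # Invented values for one clinical column in the current and previous release
    current = pd.Series(["LUAD", "LUAD", "BRCA"])
    older = pd.Series(["LUAD", "BRCA", "BRCA", "GBM"])

    # align() outer-joins the two count indexes; fill_value=0 plays the same role
    # as the manual new_keys/fillna bookkeeping in check_column_decreases
    new_counts, old_counts = current.value_counts().align(older.value_counts(), fill_value=0)

    decreased = (new_counts - old_counts) < 0
    print(new_counts[decreased])  # BRCA dropped to 1 and GBM dropped to 0

Series.align with fill_value=0 collapses the two explicit reindexing steps into one call; the patch's hand-rolled version does the same union-of-values alignment.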
From 4aec88bef16d8bf2d3757fa60aca75a6a702226b Mon Sep 17 00:00:00 2001
From: thomasyu888
Date: Sun, 3 Feb 2019 22:47:08 -0800
Subject: [PATCH 24/27] Add more print

---
 genie/dashboard_table_updater.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/genie/dashboard_table_updater.py b/genie/dashboard_table_updater.py
index bb2ecb22..37aeda2a 100644
--- a/genie/dashboard_table_updater.py
+++ b/genie/dashboard_table_updater.py
@@ -432,6 +432,8 @@ def check_column_decreases(currentdf, olderdf):
             if any(new_counts - old_counts < 0):
                 logger.info("\tDECREASE IN COLUMN: %s" % col)
                 diff = new_counts[new_counts - old_counts < 0]
+                diffs = new_counts-old_counts
+                logger.info("\t" + ",".join(diffs[diffs<0].index))
                 diff_map[col] = True
             else:
                 diff_map[col] = False

From 1fcdc029e6c3b6d36a4aeb9b0521b99dff3c8583 Mon Sep 17 00:00:00 2001
From: thomasyu888
Date: Sun, 3 Feb 2019 22:49:24 -0800
Subject: [PATCH 25/27] Change index to string so it can be joined

---
 genie/dashboard_table_updater.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/genie/dashboard_table_updater.py b/genie/dashboard_table_updater.py
index 37aeda2a..15fba6a0 100644
--- a/genie/dashboard_table_updater.py
+++ b/genie/dashboard_table_updater.py
@@ -433,7 +433,7 @@ def check_column_decreases(currentdf, olderdf):
                 logger.info("\tDECREASE IN COLUMN: %s" % col)
                 diff = new_counts[new_counts - old_counts < 0]
                 diffs = new_counts-old_counts
-                logger.info("\t" + ",".join(diffs[diffs<0].index))
+                logger.info("\t" + ",".join(diffs[diffs<0].index.astype(str)))
                 diff_map[col] = True
             else:
                 diff_map[col] = False

From f0c272a22882174e143efd76dfdf4d52b3c2a902 Mon Sep 17 00:00:00 2001
From: thomasyu888
Date: Mon, 4 Feb 2019 11:51:03 -0800
Subject: [PATCH 26/27] Should compare samples that existed in the previous release

---
 genie/dashboard_table_updater.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/genie/dashboard_table_updater.py b/genie/dashboard_table_updater.py
index 15fba6a0..610ec75a 100644
--- a/genie/dashboard_table_updater.py
+++ b/genie/dashboard_table_updater.py
@@ -470,7 +470,8 @@ def print_clinical_values_difference_table(syn, database_mappingdf):
     older_sampledf = pd.read_csv(older_sample_ent.path,sep="\t",comment="#")
     older_sampledf['CENTER'] = [patient.split("-")[1] for patient in older_sampledf['PATIENT_ID']]
 
-    current_sampledf = current_sampledf[current_sampledf['CENTER'].isin(older_sampledf['CENTER'].unique())]
+    #Rather than take the CENTER, must take the SAMPLE_ID to compare
+    current_sampledf = current_sampledf[current_sampledf['SAMPLE_ID'].isin(older_sampledf['SAMPLE_ID'].unique())]
 
     logger.info("SAMPLE CLINICAL VALUE DECREASES")
     center_decrease_mapping = dict()
@@ -484,7 +484,8 @@
     older_patient_ent = syn.get(older_clinical_synids['data_clinical_patient.txt'], followLink=True)
     current_patientdf = pd.read_csv(current_patient_ent.path,sep="\t",comment="#")
     older_patientdf = pd.read_csv(older_patient_ent.path,sep="\t",comment="#")
-    current_patientdf = current_patientdf[current_patientdf['CENTER'].isin(older_patientdf['CENTER'].unique())]
+    #Rather than take the CENTER, must take the PATIENT_ID to compare
+    current_patientdf = current_patientdf[current_patientdf['PATIENT_ID'].isin(older_patientdf['PATIENT_ID'].unique())]
     logger.info("PATIENT CLINICAL VALUE DECREASES")
 
     for center in older_patientdf['CENTER'].unique():
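PATCH 26 changes the cohort, not the counting: the current release is first restricted with isin() to the SAMPLE_IDs (and PATIENT_IDs) that already existed in the previous release, so records added since then cannot mask a real decrease. A small sketch under invented toy frames; only the SAMPLE_ID and ONCOTREE_CODE columns are assumed here:

    import pandas as pd

    # Invented release snapshots
    current_sampledf = pd.DataFrame({
        "SAMPLE_ID": ["S1", "S2", "S3"],          # S3 is new this release
        "ONCOTREE_CODE": ["LUAD", "BRCA", "GBM"]})
    older_sampledf = pd.DataFrame({
        "SAMPLE_ID": ["S1", "S2"],
        "ONCOTREE_CODE": ["LUAD", "BRCA"]})

    # Same isin() restriction as the patch: keep only samples that existed in
    # the previous release, so value counts are taken over a comparable cohort
    in_both = current_sampledf["SAMPLE_ID"].isin(older_sampledf["SAMPLE_ID"].unique())
    comparable = current_sampledf[in_both]
    print(comparable)  # S1 and S2 only; S3 is excluded from the comparison

Filtering on CENTER, as before the patch, only dropped whole centers absent from the prior release; filtering on the record identifier aligns the two releases row for row.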
logger.info("PATIENT CLINICAL VALUE DECREASES") for center in older_patientdf['CENTER'].unique(): From 52b41a063f96f5356e25b1510de099e3f7b2fb50 Mon Sep 17 00:00:00 2001 From: thomasyu888 Date: Mon, 4 Feb 2019 12:54:21 -0800 Subject: [PATCH 27/27] Add values of diff --- genie/dashboard_table_updater.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/genie/dashboard_table_updater.py b/genie/dashboard_table_updater.py index 610ec75a..9dba3fb9 100644 --- a/genie/dashboard_table_updater.py +++ b/genie/dashboard_table_updater.py @@ -433,7 +433,7 @@ def check_column_decreases(currentdf, olderdf): logger.info("\tDECREASE IN COLUMN: %s" % col) diff = new_counts[new_counts - old_counts < 0] diffs = new_counts-old_counts - logger.info("\t" + ",".join(diffs[diffs<0].index.astype(str))) + logger.info("\t" + diffs[diffs<0].to_csv().replace("\n","; ")) diff_map[col] = True else: diff_map[col] = False