diff --git a/CHANGELOG.md b/CHANGELOG.md index 23e1095..204f02c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#117](https://github.com/mskcc/forte/pull/117) - add supporting-reads_gene-fusions\*.zip files to fusioncatcher outputs - [#118](https://github.com/mskcc/forte/pull/118) - change the way the plug-n-play starfusion reference is downloaded. +- [#126](https://github.com/mskcc/forte/pull/126) - enable clinical genes prioritization in Metafusion ### `Fixed` diff --git a/assets/clinical_allowlist_hg19.txt b/assets/clinical_allowlist_hg19.txt new file mode 100644 index 0000000..0b72c5a --- /dev/null +++ b/assets/clinical_allowlist_hg19.txt @@ -0,0 +1,319 @@ +gene_name ensembl_transcript refseq_transcript +AR ENST00000374690 NM_000044 +BTK ENST00000308731 NM_000061 +CDKN2A ENST00000304494 NM_000077 +EPOR ENST00000222139 NM_000121 +ESR1 ENST00000206249 NM_000125 +FGFR2 ENST00000358487 NM_000141 +FGFR3 ENST00000440486 NM_000142 +JAK3 ENST00000458235 NM_000215 +KIT ENST00000288135 NM_000222 +MET ENST00000397752 NM_000245 +PTCH1 ENST00000331920 NM_000264 +NME1 ENST00000393196 NM_000269 +RAG1 ENST00000299440 NM_000448 +GNAS ENST00000371085 NM_000516 +RAG2 ENST00000311485 NM_000536 +CD44 ENST00000428726 NM_000610 +CD79B ENST00000006750 NM_000626 +BCL2 ENST00000333681 NM_000633 +CSF1 ENST00000329608 NM_000757 +DCK ENST00000286648 NM_000788 +PGR ENST00000325455 NM_000926 +RARA ENST00000254066 NM_000964 +SS18 ENST00000415083 NM_001007559 +MYBL1 ENST00000522677 NM_001080416 +NUTM2A ENST00000381707 NM_001099338 +BCOR ENST00000378444 NM_001123385 +YAP1 ENST00000282441 NM_001130145 +MYB ENST00000341911 NM_001130173 +BIRC3 ENST00000263464 NM_001165 +MEAF6 ENST00000296214 NM_001270875 +NUTM2B ENST00000429828 NM_001278495 +EIF4A1 ENST00000293831 NM_001416 +ID4 ENST00000378700 NM_001546 +ACVR2A ENST00000241416 NM_001616 +ARAF ENST00000377045 NM_001654 +RHOA ENST00000418115 
NM_001664 +BCL6 ENST00000406870 NM_001706 +BMP7 ENST00000395863 NM_001719 +CCND2 ENST00000261254 NM_001759 +CCND3 ENST00000372991 NM_001760 +ENTPD1 ENST00000371205 NM_001776 +CDC25A ENST00000302506 NM_001789 +CEBPE ENST00000206513 NM_001805 +CEBPG ENST00000284000 NM_001806 +ERBB3 ENST00000267101 NM_001982 +ETV6 ENST00000396373 NM_001987 +FOXO1 ENST00000379561 NM_002015 +FLT1 ENST00000282397 NM_002019 +GATA1 ENST00000376670 NM_002049 +IRF8 ENST00000268638 NM_002163 +IDH2 ENST00000330062 NM_002168 +IL7R ENST00000303115 NM_002185 +ITPKB ENST00000429204 NM_002221 +JAK1 ENST00000342505 NM_002227 +LMO1 ENST00000335790 NM_002315 +LTK ENST00000263800 NM_002344 +MAL ENST00000309988 NM_002371 +MN1 ENST00000302326 NM_002430 +IRF4 ENST00000380956 NM_002460 +MYH11 ENST00000300036 NM_002474 +NPM1 ENST00000296930 NM_002520 +NRAS ENST00000369535 NM_002524 +NTRK1 ENST00000524377 NM_002529 +PBX1 ENST00000420696 NM_002585 +PDGFB ENST00000331163 NM_002608 +PDGFRB ENST00000261799 NM_002609 +PIM1 ENST00000373509 NM_002648 +PLAG1 ENST00000316981 NM_002655 +PPAT ENST00000264220 NM_002703 +PRKACA ENST00000308677 NM_002730 +PRKAR2B ENST00000265717 NM_002736 +PRKCA ENST00000413366 NM_002737 +PRKD1 ENST00000331968 NM_002742 +MAP2K1 ENST00000307102 NM_002755 +PTPN1 ENST00000371621 NM_002827 +PTPN11 ENST00000351677 NM_002834 +RAF1 ENST00000251849 NM_002880 +SOX11 ENST00000322002 NM_003108 +SRF ENST00000265354 NM_003131 +STAT6 ENST00000300134 NM_003153 +TCF3 ENST00000262965 NM_003200 +TNFSF4 ENST00000281834 NM_003326 +TYK2 ENST00000525621 NM_003331 +XPO1 ENST00000401558 NM_003400 +HMGA2 ENST00000403681 NM_003483 +TP63 ENST00000264731 NM_003722 +RAB7L1 ENST00000367139 NM_003929 +NFKB1 ENST00000226574 NM_003998 +BCL2A1 ENST00000267953 NM_004049 +DNTT ENST00000371174 NM_004088 +E2F2 ENST00000361729 NM_004091 +FLT3 ENST00000241453 NM_004119 +ALK ENST00000389048 NM_004304 +BCR ENST00000305877 NM_004327 +CALR ENST00000316448 NM_004343 +CEBPA ENST00000498907 NM_004364 +CREBBP ENST00000262367 NM_004380 
+ERBB2 ENST00000269571 NM_004448 +ETV5 ENST00000306376 NM_004454 +EZH2 ENST00000320356 NM_004456 +SH3BP5 ENST00000383791 NM_004844 +NRG2 ENST00000361474 NM_004883 +CDKN2B ENST00000276925 NM_004936 +ETV1 ENST00000430479 NM_004956 +FUS ENST00000254108 NM_004960 +JAK2 ENST00000381652 NM_004972 +KRAS ENST00000311936 NM_004985 +PDCD1 ENST00000334409 NM_005018 +NUP214 ENST00000359428 NM_005085 +ABL1 ENST00000318560 NM_005157 +BCL3 ENST00000164227 NM_005178 +CBL ENST00000264033 NM_005188 +CEBPD ENST00000408965 NM_005195 +EGFR ENST00000275493 NM_005228 +ERBB4 ENST00000342788 NM_005235 +EWSR1 ENST00000397938 NM_005243 +FGR ENST00000374005 NM_005248 +FOS ENST00000303562 NM_005252 +GLI1 ENST00000228682 NM_005269 +GRB7 ENST00000309156 NM_005310 +MPL ENST00000372470 NM_005373 +SH2B3 ENST00000341259 NM_005475 +TLX1 ENST00000370196 NM_005521 +LMO2 ENST00000257818 NM_005574 +LYL1 ENST00000264824 NM_005583 +MUSK ENST00000374448 NM_005592 +TMPRSS2 ENST00000332149 NM_005656 +PRKD3 ENST00000234179 NM_005813 +IDH1 ENST00000345146 NM_005896 +IKZF1 ENST00000331340 NM_006060 +TFG ENST00000240851 NM_006070 +DNAJB1 ENST00000254322 NM_006145 +NFE2L2 ENST00000397062 NM_006164 +NTRK2 ENST00000277120 NM_006180 +PDGFRA ENST00000257290 NM_006206 +PIK3CA ENST00000263967 NM_006218 +TFE3 ENST00000315869 NM_006521 +NCOA2 ENST00000452400 NM_006540 +FOSB ENST00000353609 NM_006732 +U2AF1 ENST00000291552 NM_006758 +YWHAE ENST00000264335 NM_006761 +KAT6A ENST00000265713 NM_006766 +PIM2 ENST00000376509 NM_006875 +DNMT3B ENST00000328111 NM_006892 +PYCR1 ENST00000329875 NM_006907 +PICALM ENST00000393346 NM_007166 +BRD3 ENST00000303407 NM_007371 +CHIC2 ENST00000263921 NM_012110 +MGEA5 ENST00000361464 NM_012215 +NFATC2 ENST00000371564 NM_012340 +RLF ENST00000372771 NM_012421 +SF3B1 ENST00000335508 NM_012433 +STAT5B ENST00000293328 NM_012448 +TNFRSF13B ENST00000261652 NM_012452 +IKZF3 ENST00000346872 NM_012481 +FAM216A ENST00000377673 NM_013300 +BLNK ENST00000224337 NM_013314 +CD274 ENST00000381577 NM_014143 
+SETD2 ENST00000409792 NM_014159 +LIMD1 ENST00000273317 NM_014240 +NEK6 ENST00000320246 NM_014397 +KIAA0101 ENST00000300035 NM_014736 +MAST1 ENST00000251472 NM_014975 +CAMTA1 ENST00000303635 NM_015215 +WWTR1 ENST00000360632 NM_015472 +VGLL3 ENST00000398399 NM_016206 +CYB5R2 ENST00000299498 NM_016229 +KLF2 ENST00000248071 NM_016270 +NUP98 ENST00000324932 NM_016320 +PAX5 ENST00000358127 NM_016734 +MBTD1 ENST00000586178 NM_017643 +EXOC2 ENST00000230449 NM_018303 +STRBP ENST00000348403 NM_018387 +PAG1 ENST00000220597 NM_018440 +BATF3 ENST00000243440 NM_018664 +MAML3 ENST00000509479 NM_018717 +AICDA ENST00000229335 NM_020661 +RET ENST00000355710 NM_020975 +LZTS1 ENST00000381569 NM_021020 +TLX3 ENST00000296921 NM_021025 +RELA ENST00000406246 NM_021975 +THADA ENST00000405975 NM_022065 +PRDM16 ENST00000270722 NM_022114 +RBM15 ENST00000369784 NM_022768 +CBFB ENST00000412916 NM_022845 +WHSC1L1 ENST00000317025 NM_023034 +FGFR1 ENST00000447712 NM_023110 +EBF1 ENST00000313708 NM_024007 +PHF1 ENST00000374516 NM_024165 +NOTCH2 ENST00000256646 NM_024408 +PDGFD ENST00000393158 NM_025208 +PDCD1LG2 ENST00000397747 NM_025239 +ZCCHC7 ENST00000336755 NM_032226 +CARD11 ENST00000396946 NM_032415 +MAML2 ENST00000524717 NM_032427 +GLIS2 ENST00000433375 NM_032575 +GATA2 ENST00000341105 NM_032638 +RSPO3 ENST00000356698 NM_032784 +CCNB3 ENST00000376042 NM_033031 +CCND1 ENST00000227507 NM_053056 +BCL11B ENST00000357195 NM_138576 +BAX ENST00000345358 NM_138761 +TAF15 ENST00000605844 NM_139215 +HOXA9 ENST00000343483 NM_152739 +PTK2B ENST00000346049 NM_173176 +JAZF1 ENST00000283928 NM_175061 +P2RY8 ENST00000381297 NM_178129 +RSPO2 ENST00000276659 NM_178565 +VGLL2 ENST00000326274 NM_182645 +PRKACB ENST00000370685 NM_182948 +CREB3L2 ENST00000330387 NM_194071 +TERT ENST00000310581 NM_198253 +SS18L1 ENST00000331758 NM_198935 +AKT1 ENST00000555528 NM_005163 +AR ENST00000396043 NM_001011645 +ARHGAP26 ENST00000274498 NM_015071 +BCORL1 ENST00000540052 NM_021946 +BRAF ENST00000288602 NM_004333 +BRD4 
ENST00000263377 NM_058243 +CIC ENST00000575354 NM_015125 +CSF1 ENST00000369802 NM_172212 +DEK ENST00000397239 NM_003472 +EPC1 ENST00000263062 NM_025209 +ERG ENST00000417133 NM_004449 +ESR1 ENST00000440973 NM_001122742 +ETV4 ENST00000319349 NM_001986 +FGF1 ENST00000378046 NM_000800 +FGFR1 ENST00000397091 NM_015850 +FGFR4 ENST00000292408 NM_002011 +GRM1 ENST00000492807 NM_001278065 +IGF1R ENST00000268035 NM_000875 +LTK ENST00000453182 NM_001135685 +MKL2 ENST00000574045 NM_014048 +NCOA1 ENST00000288599 NM_147223 +NCOA3 ENST00000372004 NM_006534 +NOTCH1 ENST00000277541 NM_017617 +NR4A3 ENST00000330847 NM_173200 +NRG1 ENST00000522402 NM_001159996 +NRG1 ENST00000341377 NM_004495 +NRG1 ENST00000356819 NM_013956 +NRG1 ENST00000287842 NM_013957 +NRG1 ENST00000520502 NM_013959 +NRG1 ENST00000520407 NM_013962 +NTRK1 ENST00000392302 NM_001007792 +NTRK3 ENST00000317501 NM_001007156 +NTRK3 ENST00000317501 NM_002530 +NUTM1 ENST00000333756 NM_175741 +PAX3 ENST00000392069 NM_181459 +PAX7 ENST00000375375 NM_002584 +PPARG ENST00000287820 NM_015869 +PRB3 ENST00000381842 NM_006249 +PRKCB ENST00000303531 NM_002738 +PRKD2 ENST00000433867 NM_016457 +RAD51B ENST00000487270 NM_133509 +RET ENST00000340058 NM_020630 +ROS1 ENST00000368508 NM_002944 +SS18 ENST00000269137 NM_005637 +STAT6 ENST00000300134 NM_001178078 +TCF12 ENST00000438423 NM_207036 +TFEB ENST00000230323 NM_007162 +TMPRSS2 ENST00000398585 NM_001135099 +USP6 ENST00000574788 NM_004505 +AKT3 ENST00000366539 NM_005465 +ASXL1 ENST00000375687 NM_015338 +CCDC50 ENST00000392455 NM_174908 +CHD1 ENST00000284049 NM_001270 +CIITA ENST00000324288 NM_000246 +CRLF2 ENST00000400841 NM_022148 +CSF1R ENST00000286301 NM_005211 +CTLA4 ENST00000302823 NM_005214 +DLEU1 ENST00000378180 NR_002605 +FOXP1 ENST00000318789 NM_032682 +FUT8 ENST00000557164 NM_004480 +IL16 ENST00000394652 NM_004513 +KDM6A ENST00000377967 NM_021140 +LRMP ENST00000354454 NM_006152 +MALT1 ENST00000348428 NM_006785 +MECOM ENST00000494292 NM_004991 +MKL1 ENST00000355630 NM_020831 
+MLF1 ENST00000355893 NM_022443 +MLLT4 ENST00000392108 NM_001040000 +MUC1 ENST00000337604 NM_002456 +MYC ENST00000377970 NM_002467 +MYD88 ENST00000396334 NM_002468 +NFKB2 ENST00000189444 NM_002502 +PLCG1 ENST00000244007 NM_002660 +PLCG2 ENST00000359376 NM_002661 +RANBP1 ENST00000331821 NM_002882 +RUNX1 ENST00000300305 NM_001754 +S1PR2 ENST00000590320 NM_004230 +SEMA6A ENST00000343348 NM_020796 +SERPINA9 ENST00000337425 NM_175739 +SETBP1 ENST00000282030 NM_015559 +TAL1 ENST00000371884 NM_003189 +WT1 ENST00000332351 NM_000378 +ABL2 ENST00000512653 NM_005158 +ASB13 ENST00000479033 NR_024581 +BMF ENST00000354670 NM_033503 +CDK6 ENST00000265734 NM_001259 +CSF3R ENST00000373103 NM_156039 +DENND3 ENST00000519811 NM_014957 +DNM2 ENST00000389253 NM_004945 +DNMT3A ENST00000321117 NM_175629 +DUSP22 ENST00000344450 NM_020185 +FBXW7 ENST00000281708 NM_033632 +IKZF2 ENST00000457361 NM_016260 +KMT2A ENST00000534358 NM_005933 +MLLT10 ENST00000377072 NM_004641 +MME ENST00000460393 NM_000902 +NF1 ENST00000358273 NM_000267 +NT5C2 ENST00000343289 NM_012229 +PAICS ENST00000264221 NM_006452 +PHF6 ENST00000370800 NM_032335 +PML ENST00000395135 NM_002675 +SLC29A1 ENST00000427851 NM_001078175 +SRSF2 ENST00000359995 NM_003016 +STAT3 ENST00000404395 NM_003150 +STIL ENST00000360380 NM_003035 diff --git a/bin/Metafusion_forte.sh b/bin/Metafusion_forte.sh index d61511f..470df60 100755 --- a/bin/Metafusion_forte.sh +++ b/bin/Metafusion_forte.sh @@ -7,7 +7,7 @@ set -eo pipefail # __author__ = "Alexandria Dymun" # __email__ = "pintoa1@mskcc.org" # __contributor__ = "Anne Marie Noronha (noronhaa@mskcc.org)" -# __version__ = "0.0.1" +# __version__ = "0.0.2" # __status__ = "Dev" output_ANC_RT_SG=1 @@ -15,7 +15,7 @@ RT_call_filter=1 blck_filter=1 ANC_filter=1 usage() { - echo "Usage: Metafusion_forte.sh --num_tools= --genome_fasta --recurrent_bedpe --outdir --cff --gene_bed --gene_info " 1>&2; + echo "Usage: Metafusion_forte.sh --num_tools= --genome_fasta --recurrent_bedpe --outdir --cff --gene_bed 
--gene_info --clinical_genes " 1>&2; exit 1; } @@ -50,6 +50,10 @@ while test $# -gt 0;do recurrent_bedpe="$2" shift 2 ;; + --clinical_genes) + clinical_genes="$2" + shift 2 + ;; *) #OTHER_ARGUMENTS+=("$1") shift # Remove generic argument from processing @@ -83,10 +87,10 @@ cff=$outdir/$(basename $cff).renamed #Annotate cff if [ $genome_fasta ]; then echo Annotate cff, extract sequence surrounding breakpoint - reann_cff_fusion.py --cff $cff --gene_bed $gene_bed --ref_fa $genome_fasta > $outdir/$(basename $cff).reann.WITH_SEQ + reann_cff_fusion.py --cff $cff --gene_bed $gene_bed --ref_fa $genome_fasta --clinical_genes $clinical_genes > $outdir/$(basename $cff).reann.WITH_SEQ else echo Annotate cff, no extraction of sequence surrounding breakpoint - reann_cff_fusion.py --cff $cff --gene_bed $gene_bed > $outdir/$(basename $cff).reann.NO_SEQ + reann_cff_fusion.py --cff $cff --gene_bed $gene_bed --clinical_genes $clinical_genes > $outdir/$(basename $cff).reann.NO_SEQ fi # Assign .cff based on SEQ or NOSEQ diff --git a/bin/add_annotations_cff.R b/bin/add_annotations_cff.R index 25c030d..bf22a75 100755 --- a/bin/add_annotations_cff.R +++ b/bin/add_annotations_cff.R @@ -1,7 +1,7 @@ #!/usr/local/bin/Rscript # __author__ = "Anne Marie Noronha" # __email__ = "noronhaa@mskcc.org" -# __version__ = "0.0.1" +# __version__ = "0.0.2" suppressPackageStartupMessages({ @@ -11,7 +11,7 @@ suppressPackageStartupMessages({ usage <- function() { message("Usage:") - message("add_annotations_cff.R --cff-file --agfusion-file --oncokb-file --out-prefix ") + message("add_annotations_cff.R --cff-file --agfusion-file --oncokb-file --out-prefix --transcripts ") } args = commandArgs(TRUE) @@ -39,14 +39,14 @@ parse_args <- function(x){ args_opt <- parse_args(paste(args,collapse=" ")) -possible_args = c("cff", "oncokb", "agfusion", "out_prefix") +possible_args = c("cff", "oncokb", "agfusion", "out_prefix","transcripts") if (length(setdiff(names(args_opt),possible_args)) > 0){ message("Invalid 
options") usage() quit() } -required_args <- c("cff","agfusion","out_prefix") +required_args <- c("cff","agfusion","out_prefix","transcripts") if (length(setdiff(required_args,names(args_opt))) > 0) { message("Missing required arguments") usage() @@ -57,11 +57,17 @@ oncokb_file = args_opt$oncokb agfusion_file = args_opt$agfusion cff_file = args_opt$cff out_prefix = args_opt$out_prefix +transcripts = args_opt$transcripts cff = fread(cff_file) final_cff_cols <- c(names(cff)) agfusion_tab = fread(agfusion_file) %>% select(c(`5'_transcript`,`3'_transcript`,`5'_breakpoint`,`3'_breakpoint`,Fusion_effect)) +#Add transcript version corresponding to gtf ensembl version +transcripts <- read.delim(transcripts,header = F) +transcripts <- transcripts[,c("V15","V16")] + final_cff_cols <- c(final_cff_cols,"Fusion_effect") + if (!is.null(oncokb_file)){ oncokb_tab = fread(oncokb_file) %>% select(-Fusion) final_cff_cols = c(final_cff_cols,names(oncokb_tab %>% select(-Tumor_Sample_Barcode))) @@ -83,6 +89,26 @@ cff <- merge( all.x = T, all.y = T ) +### merge +cff <- merge( + cff, + transcripts, + by.x = "gene3_transcript_id", + by.y = "V15", + all.x = T , + all.y = F) +cff$gene3_transcript_id <- ifelse(is.na(cff$gene3_transcript_id),NA,paste0(cff$gene3_transcript_id,".",cff$V16)) +cff$V16 <- NULL +cff <- merge( + cff, + transcripts, + by.x = "gene5_transcript_id", + by.y = "V15", + all.x = T, + all.y = F) + +cff$gene5_transcript_id <- ifelse(is.na(cff$gene5_transcript_id),NA,paste0(cff$gene5_transcript_id,".",cff$V16)) +cff$V16 <- NULL cff <- as.data.frame(cff)[,c(final_cff_cols)] #cff <- cff %>% mutate(!!final_cff_cols[34] := Fusion_effect) %>% select(-c(Fusion_effect)) diff --git a/bin/add_flags_and_cluster_information.R b/bin/add_flags_and_cluster_information.R index b74ff92..6d4d354 100755 --- a/bin/add_flags_and_cluster_information.R +++ b/bin/add_flags_and_cluster_information.R @@ -57,7 +57,9 @@ library(data.table) "closest_exon3", "captured_reads", "gene5_transcript_id", - 
"gene3_transcript_id" + "gene3_transcript_id", + "is_clinical5", + "is_clinical3" ) colnames(unfiltered_cff) <- header_cff cluster <- fread(args[2],data.table = F) @@ -184,7 +186,7 @@ library(data.table) ) if (!is.null(filters)){ unfiltered_cff <- merge(unfiltered_cff,filters, by="FID", all.x = T, all.y = F) %>% - mutate(Metafusion_flag=ifelse(is.null(Metafusion_flag) | is.na(Metafusion_flag) | Metafusion_flag=="", tmpflag, paste(Metafusion_flag,tmpflag,sep=","))) %>% + mutate(Metafusion_flag=ifelse(is.null(Metafusion_flag) | is.na(Metafusion_flag) | Metafusion_flag=="", tmpflag, ifelse(is.na(tmpflag), Metafusion_flag, paste(Metafusion_flag,tmpflag,sep=",")))) %>% select(-c(tmpflag)) } diff --git a/bin/final_generate_v75_gene_bed.R b/bin/final_generate_v75_gene_bed.R index ea98ac6..406f013 100755 --- a/bin/final_generate_v75_gene_bed.R +++ b/bin/final_generate_v75_gene_bed.R @@ -3,15 +3,16 @@ # __author__ = "Alexandria Dymun" # __email__ = "pintoa1@mskcc.org" # __contributor__ = "Anne Marie Noronha (noronhaa@mskcc.org)" -# __version__ = "0.0.1" +# __version__ = "0.0.2" # __status__ = "Dev" suppressPackageStartupMessages({ -# library(plyr) + library(plyr) library(dplyr) library(data.table) library(stringr) + options(scipen = 999) }) usage <- function() { @@ -35,10 +36,12 @@ if (length(args)!=2) { gtf <- rtracklayer::import(args[1]) gtf_df <- as.data.frame(gtf) +#remove incomplete transcripts mRNA_end_NF and mRNA_start_NF (not finished) +gtf_df <- gtf_df[!grepl("NF",gtf_df$tag),] file.to_write <- args[2] -### ensure start is 0 based +### convert start to 0-based to match metafusion expectations of gff format gtf_df <- gtf_df %>% rename( chr = seqnames diff --git a/conf/igenomes.config b/conf/igenomes.config index c618acc..41d857c 100644 --- a/conf/igenomes.config +++ b/conf/igenomes.config @@ -31,7 +31,9 @@ params { } } metafusion_blocklist = "https://raw.githubusercontent.com/anoronh4/forte-references/main/GRCh37/blocklist_breakpoints.bedpe.gz" + clinicalgenes = 
"${projectDir}/assets/clinical_allowlist_hg19.txt" ensembl_version = 75 + transcripts = "https://ftp.ensembl.org/pub/release-75/mysql/homo_sapiens_core_75_37/transcript.txt.gz" } 'GRCh38' { fasta = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/WholeGenomeFasta/genome.fa" @@ -48,7 +50,8 @@ params { cdna = "http://ftp.ensemblgenomes.org/pub/viruses/fasta/sars_cov_2/cdna/Sars_cov_2.ASM985889v3.cdna.all.fa.gz" metafusion_blocklist = "https://raw.githubusercontent.com/anoronh4/forte-references/main/GRCh37_test/blocklist_breakpoints.bedpe" ensembl_version = 75 - + clinicalgenes = "${projectDir}/assets/clinical_allowlist_hg19.txt" + transcripts = "https://ftp.ensembl.org/pub/release-75/mysql/homo_sapiens_core_75_37/transcript.txt.gz" } /* 'hg38' { diff --git a/conf/modules.config b/conf/modules.config index 9277358..aa63a22 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -210,6 +210,7 @@ process { ] } withName: 'AGAT_SPADDINTRONS' { + cpus = 4 storeDir = { "${params.reference_base}/${params.genome}/metafusion/introns" } publishDir = [ enabled: false, diff --git a/docs/output.md b/docs/output.md index 210cd96..3a5c61e 100644 --- a/docs/output.md +++ b/docs/output.md @@ -141,7 +141,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d -FORTE uses a custom fork of [Metafusion](https://github.com/mskcc/MetaFusion) to filter, cluster and annotate the fusion calls. Several `intermediate` files are included in the output. +FORTE uses a custom fork of [Metafusion](https://github.com/mskcc/MetaFusion) to filter, cluster and annotate the fusion calls. Several `intermediate` files are included in the output, [see wiki for detailed information](https://github.com/mskcc/forte/wiki/Metafusion-Output). `Fusion_effect` information is added using a custom fork of [AGFusion](https://github.com/anoronh4/AGFusion). 
diff --git a/main.nf b/main.nf index c0a5e68..de37dd9 100644 --- a/main.nf +++ b/main.nf @@ -28,7 +28,8 @@ params.metafusion_blocklist = WorkflowMain.getGenomeAttribute(params, 'metafus params.metafusion_gene_bed = WorkflowMain.getGenomeAttribute(params, 'metafusion_gene_bed') params.metafusion_gene_info = WorkflowMain.getGenomeAttribute(params, 'metafusion_gene_info') params.ensembl_version = WorkflowMain.getGenomeAttribute(params, 'ensembl_version') - +params.clinicalgenes = WorkflowMain.getGenomeAttribute(params, 'clinicalgenes') +params.transcripts = WorkflowMain.getGenomeAttribute(params, 'transcripts') /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/modules/local/cff_annotate/main.nf b/modules/local/cff_annotate/main.nf index d96216c..c2d2e31 100644 --- a/modules/local/cff_annotate/main.nf +++ b/modules/local/cff_annotate/main.nf @@ -8,6 +8,7 @@ process CFF_ANNOTATE { input: tuple val(meta), path(cff), path(oncokb), path(agfusion) + path(transcripts) output: tuple val(meta), path("${prefix}.unfiltered.cff"), emit: unfiltered_cff @@ -25,12 +26,13 @@ process CFF_ANNOTATE { --cff ${cff} \\ ${oncokb_param} \\ --agfusion ${agfusion} \\ - --out-prefix ${prefix} + --out-prefix ${prefix} \\ + --transcripts ${transcripts} cat <<-END_VERSIONS > versions.yml "${task.process}": R: \$(R --version | head -n1) - add_annotations_cff.R: 0.0.1 + add_annotations_cff.R: 0.0.2 END_VERSIONS """ } diff --git a/modules/local/metafusion/container/Dockerfile b/modules/local/metafusion/container/Dockerfile index 1283139..9b55238 100755 --- a/modules/local/metafusion/container/Dockerfile +++ b/modules/local/metafusion/container/Dockerfile @@ -1,10 +1,10 @@ -FROM mapostolides/metafusion +FROM mapostolides/metafusion:latest LABEL author="Alexandria Dymun (pintoa1@mskcc.org)" \ maintainer="Anne Marie Noronha (noronhaa@mskcc.org)" \ - version.image="0.0.6" + version.image="0.0.8" -ENV METAFUSION_TAG="v1.0.1" +ENV 
METAFUSION_TAG="v1.0.3" ENV PATH="${PATH}:/MetaFusion/scripts" RUN R -e "chooseCRANmirror(ind=52); install.packages(c('plyr','data.table'))" diff --git a/modules/local/metafusion/genebed/main.nf b/modules/local/metafusion/genebed/main.nf index 34cba9c..97b4d12 100644 --- a/modules/local/metafusion/genebed/main.nf +++ b/modules/local/metafusion/genebed/main.nf @@ -9,10 +9,9 @@ process METAFUSION_GENEBED { input: tuple val(meta), path(gff) - val ensembl_version output: - tuple val(meta), path("*.metafusion.gene.bed"), emit: metafusion_gene_bed + tuple val(meta), path("${meta.id}.metafusion.gene.bed"), emit: metafusion_gene_bed path "versions.yml" , emit: versions when: @@ -24,12 +23,12 @@ process METAFUSION_GENEBED { """ final_generate_v75_gene_bed.R \\ $gff \\ - ${ensembl_version}.metafusion.gene.bed + ${prefix}.metafusion.gene.bed cat <<-END_VERSIONS > versions.yml "${task.process}": R: \$(R --version | head -n1) - final_generate_v75_gene_bed.R: 0.0.1 + final_generate_v75_gene_bed.R: 0.0.2 END_VERSIONS """ @@ -42,7 +41,7 @@ process METAFUSION_GENEBED { cat <<-END_VERSIONS > versions.yml "${task.process}": R: \$(R --version | head -n1) - final_generate_v75_gene_bed.R: 0.0.1 + final_generate_v75_gene_bed.R: 0.0.2 END_VERSIONS """ } diff --git a/modules/local/metafusion/genebed/resources/usr/bin/final_generate_v75_gene_bed.R b/modules/local/metafusion/genebed/resources/usr/bin/final_generate_v75_gene_bed.R index 6a5b59c..4e5191c 100755 --- a/modules/local/metafusion/genebed/resources/usr/bin/final_generate_v75_gene_bed.R +++ b/modules/local/metafusion/genebed/resources/usr/bin/final_generate_v75_gene_bed.R @@ -1,9 +1,9 @@ - #!/usr/local/bin/Rscript + # __author__ = "Alexandria Dymun" # __email__ = "pintoa1@mskcc.org" # __contributor__ = "Anne Marie Noronha (noronhaa@mskcc.org)" -# __version__ = "0.0.1" +# __version__ = "0.0.2" # __status__ = "Dev" @@ -12,6 +12,7 @@ suppressPackageStartupMessages({ library(dplyr) library(data.table) library(stringr) + options(scipen = 
999) }) usage <- function() { @@ -35,6 +36,8 @@ if (length(args)!=2) { gtf <- rtracklayer::import(args[1]) gtf_df <- as.data.frame(gtf) +#remove incomplete transcripts mRNA_end_NF and mRNA_start_NF (not finished) +gtf_df <- gtf_df[!grepl("NF",gtf_df$tag),] file.to_write <- args[2] diff --git a/modules/local/metafusion/run/main.nf b/modules/local/metafusion/run/main.nf index bd1d8fa..42e251c 100644 --- a/modules/local/metafusion/run/main.nf +++ b/modules/local/metafusion/run/main.nf @@ -3,8 +3,8 @@ process METAFUSION_RUN { label "process_low" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'docker://cmopipeline/metafusion:0.0.6' : - 'docker.io/cmopipeline/metafusion:0.0.6' }" + 'docker://cmopipeline/metafusion:0.0.8' : + 'docker.io/cmopipeline/metafusion:0.0.8' }" input: tuple val(meta), path(cff) @@ -12,6 +12,7 @@ process METAFUSION_RUN { path info path fasta path blocklist + path clinicalgenes output: tuple val(meta), path("*final*cluster") , emit: cluster @@ -37,12 +38,13 @@ process METAFUSION_RUN { --gene_info $info \\ --genome_fasta $fasta \\ --recurrent_bedpe $blocklist \\ + --clinical_genes $clinicalgenes \\ ${args} cat <<-END_VERSIONS > versions.yml "${task.process}": Metafusion docker: \$METAFUSION_TAG - Metafusion_forte.sh: 0.0.1 + Metafusion_forte.sh: 0.0.2 END_VERSIONS """ } diff --git a/subworkflows/local/fusion.nf b/subworkflows/local/fusion.nf index 0539428..5d4dcf9 100644 --- a/subworkflows/local/fusion.nf +++ b/subworkflows/local/fusion.nf @@ -35,8 +35,9 @@ workflow FUSION { ch_versions = Channel.empty() fasta = params.fasta //gene_bed = params.metafusion_gene_bed - //gene_info = params.metafusion_gene_info + clinicalgenes = params.clinicalgenes //blocklist = params.metafusion_blocklist + transcripts = params.transcripts STAR_FOR_ARRIBA( reads, @@ -114,7 +115,8 @@ workflow FUSION { gene_bed.map{ it[1] }.first(), gene_info.map{ it[1] }.first(), fasta, - blocklist + blocklist, + clinicalgenes 
) ADD_FLAG( @@ -139,7 +141,8 @@ workflow FUSION { CFF_FINALIZE( ADD_FLAG.out.unfiltered_cff .join(ONCOKB_FUSIONANNOTATOR.out.oncokb_fusions, by:0) - .join(AGFUSION_BATCH.out.fusion_transcripts_tsv, by:0) + .join(AGFUSION_BATCH.out.fusion_transcripts_tsv, by:0), + transcripts ) } else { CFF_FINALIZE( @@ -147,7 +150,8 @@ workflow FUSION { .join(AGFUSION_BATCH.out.fusion_transcripts_tsv, by:0) .map{ meta, cff, agfusion_file -> [ meta, cff, [], agfusion_file ] - } + }, + transcripts ) } ch_versions = ch_versions.mix(ADD_FLAG.out.versions.first()) diff --git a/subworkflows/local/prepare_references.nf b/subworkflows/local/prepare_references.nf index 97e4124..caaed87 100644 --- a/subworkflows/local/prepare_references.nf +++ b/subworkflows/local/prepare_references.nf @@ -84,14 +84,16 @@ workflow PREPARE_REFERENCES { ARRIBA_DOWNLOAD() + //cosmic_usr = params.cosmic_usr ?: "" + //cosmic_passwd = params.cosmic_passwd ?: "" + AGAT_SPADDINTRONS( - [[:],gtf], + [[id:params.ensembl_version],gtf], [] ) METAFUSION_GENEBED( - AGAT_SPADDINTRONS.out.gff, - params.ensembl_version + AGAT_SPADDINTRONS.out.gff ) METAFUSION_GENEINFO(