diff --git a/rnaindel/analysis/alignment_feature_calculator.py b/rnaindel/analysis/alignment_feature_calculator.py index 2dece41..e17bc51 100755 --- a/rnaindel/analysis/alignment_feature_calculator.py +++ b/rnaindel/analysis/alignment_feature_calculator.py @@ -206,7 +206,10 @@ def make_indel_alignment(variant, bam, downsample_threshold=1500): try: valn = VariantAlignment( - variant, bam, downsample_threshold=downsample_threshold, exclude_duplicates=True + variant, + bam, + downsample_threshold=downsample_threshold, + exclude_duplicates=True, ) except: return None diff --git a/rnaindel/occurrence/counter.py b/rnaindel/occurrence/counter.py index 9fbe939..a31523a 100755 --- a/rnaindel/occurrence/counter.py +++ b/rnaindel/occurrence/counter.py @@ -2,6 +2,7 @@ import os import sys +import gzip import pysam import argparse from functools import partial @@ -61,7 +62,15 @@ def validate_file_lst(filename): def is_rnaindel_output_vcf(vcf): - headers = [line for line in open(vcf) if line.startswith("##source=RNAIndel")] + if vcf.endswith(".gz"): + headers = [ + line + for line in gzip.open(vcf, "rt") + if line.startswith("##source=RNAIndel") + ] + else: + headers = [line for line in open(vcf) if line.startswith("##source=RNAIndel")] + if headers: return True else: @@ -93,7 +102,15 @@ def make_indel_from_vcf_line(line, genome): def collect_somatic_predictions_from_vcf(vcf, genome): - somatic_lines = [line for line in open(vcf) if "predicted_class=somatic" in line] + if vcf.endswith(".gz"): + somatic_lines = [ + line for line in gzip.open(vcf, "rt") if "predicted_class=somatic" in line + ] + else: + somatic_lines = [ + line for line in open(vcf) if "predicted_class=somatic" in line + ] + parsed_somatic_lines = [ make_indel_from_vcf_line(line, genome) for line in somatic_lines @@ -102,10 +119,19 @@ def collect_somatic_predictions_from_vcf(vcf, genome): return to_flat_lst(parsed_somatic_lines) +def rreplace(s, old, new): + li = s.rsplit(old, 1) + return new.join(li) + + def annotate_vcf_with_recurrence(vcf, genome, occurrence_dict, outdir): new_vcf = edit_header(vcf) - fi = open(vcf) + if vcf.endswith(".gz"): + fi = gzip.open(vcf, "rt") + else: + fi = open(vcf) + for line in fi: if line.startswith("#"): pass @@ -120,18 +146,33 @@ def annotate_vcf_with_recurrence(vcf, genome, occurrence_dict, outdir): fi.close() if outdir: - filename = os.path.basename(vcf).replace(".vcf", ".occurrence.vcf") - fo = open(os.path.join(outdir, filename), "w") + filename = os.path.join(outdir, os.path.basename(vcf)) else: - fo = open(vcf, "w") + filename = vcf + + filename = rreplace(filename, ".vcf", ".occurrence.vcf") + filename = rreplace(filename, ".gz", "") + + fo = open(filename, "w") + fo.write("".join(new_vcf)) fo.close() + pysam.tabix_index(filename, preset="vcf", force=True, keep_original=False) + def edit_header(vcf): - header_lines = [line for line in open(vcf) if line.startswith("##")] + if vcf.endswith(".gz"): + header_lines = [line for line in gzip.open(vcf, "rt") if line.startswith("##")] + else: + header_lines = [line for line in open(vcf) if line.startswith("##")] + new_header_line = '##INFO=\n' - bottom = [line for line in open(vcf) if line.startswith("#CHROM")] + + if vcf.endswith(".gz"): + bottom = [line for line in gzip.open(vcf, "rt") if line.startswith("#CHROM")] + else: + bottom = [line for line in open(vcf) if line.startswith("#CHROM")] if new_header_line not in header_lines: return header_lines + [new_header_line] + bottom diff --git a/rnaindel/version.py b/rnaindel/version.py index 1e3bed4..05103bc 100755 --- a/rnaindel/version.py +++ b/rnaindel/version.py @@ -1 +1,2 @@ -__version__ = "3.2.2" +__version__ = "3.2.3" +