diff --git a/CHANGELOG.md b/CHANGELOG.md index 0b4c2ce..5a50a32 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +## 2.3.5 (2018-12-30) + +Improvements: +Using the temporary file to allow to run CIRCexplorer2 in the same directory simultaneously (issue #26). Thanks @egaffo. Happy New Year! + ## 2.3.4 (2018-10-26) Bugfixes: diff --git a/circ2/annotate.py b/circ2/annotate.py index d498739..2df581d 100644 --- a/circ2/annotate.py +++ b/circ2/annotate.py @@ -19,6 +19,7 @@ from .parser import parse_ref, parse_bed, check_fasta from .helper import logger, map_fusion_to_iso, fix_bed, generate_bed from collections import defaultdict +import tempfile __author__ = [ 'Xiao-Ou Zhang (zhangxiaoou@picb.ac.cn)', @@ -30,15 +31,17 @@ @logger def annotate(options): + # create temporary annotated fusion file + fusion_tmp = tempfile.TemporaryFile(mode='w+') # annotate fusion junctions - annotate_fusion(options['--ref'], options['--bed'], + annotate_fusion(options['--ref'], options['--bed'], fusion_tmp, secondary_flag=options['--low-confidence']) # fix fusion juncrions - fix_fusion(options['--ref'], options['--genome'], options['--output'], + fix_fusion(options['--ref'], options['--genome'], fusion_tmp, options['--output'], options['--no-fix'], secondary_flag=options['--low-confidence']) -def annotate_fusion(ref_f, junc_bed, secondary_flag=0, denovo_flag=0): +def annotate_fusion(ref_f, junc_bed, fusion_tmp, secondary_flag=0, denovo_flag=0): """ Align fusion juncrions to gene annotations """ @@ -48,88 +51,87 @@ def annotate_fusion(ref_f, junc_bed, secondary_flag=0, denovo_flag=0): fusion_bed = junc_bed fusions, fusion_index = parse_bed(fusion_bed) # fusion junctions total = set() - annotated_fusion_f = 'annotated_fusion.txt.tmp' - with open(annotated_fusion_f, 'w') as outf: - for chrom in chrom_info: - # overlap gene annotations with fusion juncrions - result = [] - # overlap genes - if chrom in genes: - result += Interval.overlapwith(genes[chrom].interval, - fusions[chrom]) - # overlap novel genes in denovo mode - if denovo_flag and chrom in novel_genes: - result += Interval.overlapwith(novel_genes[chrom].interval, - fusions[chrom]) - for itl in result: - # extract gene annotations - iso = list([x for x in itl[2:] if x.startswith('iso')]) - # for each overlapped fusion junction - for fus in itl[(2 + len(iso)):]: - reads = fus.split()[1] - fus_start, fus_end = fusion_index[fus] - fus_loc = '%s\t%d\t%d\tFUSIONJUNC/%s' % (chrom, fus_start, - fus_end, reads) - edge_annotations = [] # first or last exon flag - secondary_exon = defaultdict(dict) # secondary exons - annotate_flag = 0 - for iso_id in iso: - g, i, c, s = iso_id.split()[1:] - start = gene_info[iso_id][0][0] - end = gene_info[iso_id][-1][-1] - # fusion junction excesses boundaries of gene - # annotation - if fus_start < start - 10 or fus_end > end + 10: - if not secondary_flag: - continue - (fusion_info, - index, - edge, - secondary) = map_fusion_to_iso(fus_start, - fus_end, s, - gene_info[iso_id]) - if fusion_info: - annotate_flag += 1 - bed_info = '\t'.join([fus_loc, '0', s, - str(fus_start), - str(fus_start), '0,0,0']) - bed = '\t'.join([bed_info, fusion_info, g, i, - index]) - if not edge: # not first or last exon - outf.write(bed + '\n') - total.add(fus) - else: # first or last exon - edge_annotations.append(bed) - elif secondary_flag and secondary is not None: - li, ri = secondary - gene = ':'.join([g, s]) - if li is not None: - li = str(li) - secondary_exon['left'][gene] = ':'.join([i, - li]) - if ri is not None: - ri = str(ri) - secondary_exon['right'][gene] = ':'.join([i, - ri]) - if edge_annotations: - for bed in edge_annotations: + outf = fusion_tmp + for chrom in chrom_info: + # overlap gene annotations with fusion juncrions + result = [] + # overlap genes + if chrom in genes: + result += Interval.overlapwith(genes[chrom].interval, + fusions[chrom]) + # overlap novel genes in denovo mode + if denovo_flag and chrom in novel_genes: + result += Interval.overlapwith(novel_genes[chrom].interval, + fusions[chrom]) + for itl in result: + # extract gene annotations + iso = list([x for x in itl[2:] if x.startswith('iso')]) + # for each overlapped fusion junction + for fus in itl[(2 + len(iso)):]: + reads = fus.split()[1] + fus_start, fus_end = fusion_index[fus] + fus_loc = '%s\t%d\t%d\tFUSIONJUNC/%s' % (chrom, fus_start, + fus_end, reads) + edge_annotations = [] # first or last exon flag + secondary_exon = defaultdict(dict) # secondary exons + annotate_flag = 0 + for iso_id in iso: + g, i, c, s = iso_id.split()[1:] + start = gene_info[iso_id][0][0] + end = gene_info[iso_id][-1][-1] + # fusion junction excesses boundaries of gene + # annotation + if fus_start < start - 10 or fus_end > end + 10: + if not secondary_flag: + continue + (fusion_info, + index, + edge, + secondary) = map_fusion_to_iso(fus_start, + fus_end, s, + gene_info[iso_id]) + if fusion_info: + annotate_flag += 1 + bed_info = '\t'.join([fus_loc, '0', s, + str(fus_start), + str(fus_start), '0,0,0']) + bed = '\t'.join([bed_info, fusion_info, g, i, + index]) + if not edge: # not first or last exon outf.write(bed + '\n') - total.add(fus) - if secondary_flag and not annotate_flag: - for gene in secondary_exon['left']: - if gene in secondary_exon['right']: - left = secondary_exon['left'][gene] - right = secondary_exon['right'][gene] - g, s = gene.split(':') - # for avoid dup, use fus_loc_new - fus_loc_new = fus_loc + '\t0\t%s' % s - outf.write('%s\t%s:%s\t%s:%s\n' % (fus_loc_new, - g, left, g, - right)) + total.add(fus) + else: # first or last exon + edge_annotations.append(bed) + elif secondary_flag and secondary is not None: + li, ri = secondary + gene = ':'.join([g, s]) + if li is not None: + li = str(li) + secondary_exon['left'][gene] = ':'.join([i, + li]) + if ri is not None: + ri = str(ri) + secondary_exon['right'][gene] = ':'.join([i, + ri]) + if edge_annotations: + for bed in edge_annotations: + outf.write(bed + '\n') + total.add(fus) + if secondary_flag and not annotate_flag: + for gene in secondary_exon['left']: + if gene in secondary_exon['right']: + left = secondary_exon['left'][gene] + right = secondary_exon['right'][gene] + g, s = gene.split(':') + # for avoid dup, use fus_loc_new + fus_loc_new = fus_loc + '\t0\t%s' % s + outf.write('%s\t%s:%s\t%s:%s\n' % (fus_loc_new, + g, left, g, + right)) print('Annotated %d fusion junctions!' % len(total)) -def fix_fusion(ref_f, genome_fa, out_file, no_fix, secondary_flag=0, +def fix_fusion(ref_f, genome_fa, fusion_tmp, out_file, no_fix, secondary_flag=0, denovo_flag=0): """ Realign fusion juncrions @@ -137,7 +139,8 @@ def fix_fusion(ref_f, genome_fa, out_file, no_fix, secondary_flag=0, print('Start to fix fusion junctions...') fa = check_fasta(genome_fa) ref = parse_ref(ref_f, 2) - annotated_fusion_f = 'annotated_fusion.txt.tmp' + annotated_fusion_f = fusion_tmp + annotated_fusion_f.seek(0) ## from the start fusions, fusion_names, fixed_flag = fix_bed(annotated_fusion_f, ref, fa, no_fix, denovo_flag) total = 0 @@ -216,6 +219,6 @@ def fix_fusion(ref_f, genome_fa, out_file, no_fix, secondary_flag=0, if secondary_flag: secondary_f.close() - os.remove('annotated_fusion.txt.tmp') + fusion_tmp.close() print('Fixed %d fusion junctions!' % total) diff --git a/circ2/denovo.py b/circ2/denovo.py index b907bcf..1b37751 100644 --- a/circ2/denovo.py +++ b/circ2/denovo.py @@ -32,6 +32,7 @@ import pysam from scipy.stats import fisher_exact, binom from .genomic_interval import Interval +import tempfile __author__ = [ 'Xiao-Ou Zhang (zhangxiaoou@picb.ac.cn)', @@ -66,11 +67,13 @@ def denovo(options): print('Warning: no cufflinks directory %s!' % options['--cuff']) print('Please run CIRCexplorer2 assembly before this step!') ref_path = options['--ref'] + # create temporary annotated fusion file + fusion_tmp = tempfile.TemporaryFile(mode='w+') # annotate fusion junctions - annotate_fusion(ref_path, options['--bed'], denovo_flag=1) + annotate_fusion(ref_path, options['--bed'], fusion_tmp, denovo_flag=1) # fix fusion juncrions out_f = '%s/circularRNA_full.txt' % denovo_dir - fix_fusion(ref_path, options['--genome'], out_f, + fix_fusion(ref_path, options['--genome'], fusion_tmp, out_f, options['--no-fix'], denovo_flag=1) # extract novel circRNAs extract_novel_circ(denovo_dir, options['--ref']) diff --git a/circ2/helper.py b/circ2/helper.py index 199e0af..d7f919d 100644 --- a/circ2/helper.py +++ b/circ2/helper.py @@ -137,110 +137,109 @@ def fix_bed(fusion_file, ref, fa, no_fix, denovo_flag): fusion_set = set() fixed_flag = defaultdict(int) # flag to indicate realignment junctions = set() - with open(fusion_file, 'r') as f: - for line in f: - secondary_flag = False - chrom = line.split()[0] - strand = line.split()[5] - start, end = [int(x) for x in line.split()[1:3]] - junction_info = '%s\t%d\t%d' % (chrom, start, end) - if not denovo_flag and junction_info in junctions: - continue - reads = int(line.split()[3].split('/')[1]) - if len(line.split()) == 8: - secondary_flag = True - flag = False - left_info, right_info = line.split()[6:8] - left_gene, left_iso, left_index = left_info.split(':') - right_gene, right_iso, right_index = right_info.split(':') - s = int(left_index) - e = int(right_index) - iso_starts = ref['\t'.join([left_gene, left_iso, chrom, - strand])][0] - iso_ends = ref['\t'.join([right_gene, right_iso, chrom, - strand])][1] - loc = '%s\t%d\t%d' % (chrom, iso_starts[s], iso_ends[e]) - name = '|'.join(['secondary', loc, strand, left_info, - right_info]) - else: - flag, gene, iso, index = line.split()[-4:] - flag = True if flag == 'ciRNA' else False - name = '\t'.join([gene, iso, chrom, strand, index]) - iso_starts, iso_ends = ref['\t'.join([gene, iso, chrom, - strand])] - if not flag: # back spliced exons - if not secondary_flag: - s, e = [int(x) for x in index.split(',')] + for line in fusion_file: + secondary_flag = False + chrom = line.split()[0] + strand = line.split()[5] + start, end = [int(x) for x in line.split()[1:3]] + junction_info = '%s\t%d\t%d' % (chrom, start, end) + if not denovo_flag and junction_info in junctions: + continue + reads = int(line.split()[3].split('/')[1]) + if len(line.split()) == 8: + secondary_flag = True + flag = False + left_info, right_info = line.split()[6:8] + left_gene, left_iso, left_index = left_info.split(':') + right_gene, right_iso, right_index = right_info.split(':') + s = int(left_index) + e = int(right_index) + iso_starts = ref['\t'.join([left_gene, left_iso, chrom, + strand])][0] + iso_ends = ref['\t'.join([right_gene, right_iso, chrom, + strand])][1] + loc = '%s\t%d\t%d' % (chrom, iso_starts[s], iso_ends[e]) + name = '|'.join(['secondary', loc, strand, left_info, + right_info]) + else: + flag, gene, iso, index = line.split()[-4:] + flag = True if flag == 'ciRNA' else False + name = '\t'.join([gene, iso, chrom, strand, index]) + iso_starts, iso_ends = ref['\t'.join([gene, iso, chrom, + strand])] + if not flag: # back spliced exons + if not secondary_flag: + s, e = [int(x) for x in index.split(',')] + # not realign + if start == iso_starts[s] and end == iso_ends[e]: + fusions[name] += reads + if name not in fusion_set: + fusion_set.add(name) + fusion_names.append(name) + junctions.add(junction_info) + # no fix mode + elif no_fix: + fusions[name] += reads + if name not in fusion_set: + fusion_set.add(name) + fusion_names.append(name) + fixed_flag[name] += 1 + junctions.add(junction_info) + # realign + elif check_seq(chrom, [start, iso_starts[s], end, iso_ends[e]], + fa): + fusions[name] += reads + if name not in fusion_set: + fusion_set.add(name) + fusion_names.append(name) + fixed_flag[name] += 1 + junctions.add(junction_info) + else: # ciRNAs + index = int(index) + if strand == '+': # not realign - if start == iso_starts[s] and end == iso_ends[e]: + if start == iso_ends[index]: + name += '|'.join(['', str(start), str(end)]) fusions[name] += reads if name not in fusion_set: fusion_set.add(name) fusion_names.append(name) junctions.add(junction_info) - # no fix mode - elif no_fix: + # realign + elif check_seq(chrom, [start, iso_ends[index], end], fa, + intron_flag=True): + fixed_start = iso_ends[index] + fixed_end = end + fixed_start - start + name += '|'.join(['', str(fixed_start), + str(fixed_end)]) fusions[name] += reads if name not in fusion_set: fusion_set.add(name) fusion_names.append(name) fixed_flag[name] += 1 junctions.add(junction_info) - # realign - elif check_seq(chrom, [start, iso_starts[s], end, iso_ends[e]], - fa): + else: + if end == iso_starts[index + 1]: + # not realign + name += '|'.join(['', str(start), str(end)]) fusions[name] += reads if name not in fusion_set: fusion_set.add(name) fusion_names.append(name) - fixed_flag[name] += 1 junctions.add(junction_info) - else: # ciRNAs - index = int(index) - if strand == '+': - # not realign - if start == iso_ends[index]: - name += '|'.join(['', str(start), str(end)]) - fusions[name] += reads - if name not in fusion_set: - fusion_set.add(name) - fusion_names.append(name) - junctions.add(junction_info) # realign - elif check_seq(chrom, [start, iso_ends[index], end], fa, - intron_flag=True): - fixed_start = iso_ends[index] - fixed_end = end + fixed_start - start - name += '|'.join(['', str(fixed_start), - str(fixed_end)]) - fusions[name] += reads - if name not in fusion_set: - fusion_set.add(name) - fusion_names.append(name) - fixed_flag[name] += 1 - junctions.add(junction_info) - else: - if end == iso_starts[index + 1]: - # not realign - name += '|'.join(['', str(start), str(end)]) - fusions[name] += reads - if name not in fusion_set: - fusion_set.add(name) - fusion_names.append(name) - junctions.add(junction_info) - # realign - elif check_seq(chrom, [end, iso_starts[index + 1], start], - fa, intron_flag=True): - fixed_end = iso_starts[index + 1] - fixed_start = start + fixed_end - end - name += '|'.join(['', str(fixed_start), - str(fixed_end)]) - fusions[name] += reads - if name not in fusion_set: - fusion_set.add(name) - fusion_names.append(name) - fixed_flag[name] += 1 - junctions.add(junction_info) + elif check_seq(chrom, [end, iso_starts[index + 1], start], + fa, intron_flag=True): + fixed_end = iso_starts[index + 1] + fixed_start = start + fixed_end - end + name += '|'.join(['', str(fixed_start), + str(fixed_end)]) + fusions[name] += reads + if name not in fusion_set: + fusion_set.add(name) + fusion_names.append(name) + fixed_flag[name] += 1 + junctions.add(junction_info) return (fusions, fusion_names, fixed_flag) diff --git a/circ2/version.py b/circ2/version.py index 6ec85d6..85a7344 100644 --- a/circ2/version.py +++ b/circ2/version.py @@ -1 +1 @@ -__version__ = '2.3.4' +__version__ = '2.3.5'