Merge pull request #57 from hammerlab/use-assembly-and-trimming

overlap assembly and coverage trimming to create VariantSequence candidates
openvax · Nov 8, 2016 · 1dff463 · 1dff463
2 parents f60e654 + dcfd2ed
commit 1dff463
Show file tree

Hide file tree

Showing 22 changed files with 1,004 additions and 753 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -3,6 +3,12 @@ language: python
 python:
   - "2.7"
   - "3.4"
+cache:
+  pip: true
+  # cache directory used for Ensembl downloads of GTF and FASTA files
+  # along with the indexed db of intervals and ID mappings and pickles
+  # of sequence dictionaries
+  directories: /home/travis/.cache/pyensembl/
 before_install:
   # Commands below copied from: http://conda.pydata.org/docs/travis.html
   # We do this conditionally because it saves us some downloading if the

diff --git a/isovar/__init__.py b/isovar/__init__.py
@@ -14,4 +14,4 @@
 
 from __future__ import print_function, division, absolute_import
 
-__version__ = "0.2.4"
+__version__ = "0.3.0"
diff --git a/isovar/allele_reads.py b/isovar/allele_reads.py
@@ -34,18 +34,15 @@
 
 logger = logging.getLogger(__name__)
 
-
 # subclassing from namedtuple to get a lightweight object with built-in
 # hashing and comparison while also being able to add methods
-AlleleReadFields = namedtuple(
+AlleleReadBase = namedtuple(
     "AlleleRead",
     "prefix allele suffix name sequence")
 
-class AlleleRead(AlleleReadFields):
+class AlleleRead(AlleleReadBase):
     def __new__(cls, prefix, allele, suffix, name):
-        # construct sequence from prefix + alt + suffix
-        return AlleleReadFields.__new__(
-            cls,
+        return AlleleReadBase(
             prefix=prefix,
             allele=allele,
             suffix=suffix,
@@ -55,100 +52,102 @@ def __new__(cls, prefix, allele, suffix, name):
     def __len__(self):
         return len(self.prefix) + len(self.allele) + len(self.suffix)
 
-def allele_read_from_locus_read(locus_read, n_ref):
-    """
-    Given a single ReadAtLocus object, return either an AlleleRead or None
-
-    Parameters
-    ----------
-    locus_read : LocusRead
-        Read which overlaps a variant locus but doesn't necessarily contain the
-        alternate nucleotides
-
-    n_ref : int
-        Number of reference positions we are expecting to be modified or
-        deleted (for insertions this should be 0)
-    """
-    sequence = locus_read.sequence
-    reference_positions = locus_read.reference_positions
-
-    # positions of the nucleotides before and after the variant within
-    # the read sequence
-    read_pos_before = locus_read.base0_read_position_before_variant
-    read_pos_after = locus_read.base0_read_position_after_variant
-
-    # positions of the nucleotides before and after the variant on the
-    # reference genome
-    ref_pos_before = reference_positions[read_pos_before]
-
-    if ref_pos_before is None:
-        logger.warn(
-            "Missing reference pos for nucleotide before variant on read: %s",
+    @classmethod
+    def from_locus_read(cls, locus_read, n_ref):
+        """
+        Given a single LocusRead object, return either an AlleleRead or None
+
+        Parameters
+        ----------
+        locus_read : LocusRead
+            Read which overlaps a variant locus but doesn't necessarily contain the
+            alternate nucleotides
+
+        n_ref : int
+            Number of reference positions we are expecting to be modified or
+            deleted (for insertions this should be 0)
+        """
+        sequence = locus_read.sequence
+        reference_positions = locus_read.reference_positions
+
+        # positions of the nucleotides before and after the variant within
+        # the read sequence
+        read_pos_before = locus_read.base0_read_position_before_variant
+        read_pos_after = locus_read.base0_read_position_after_variant
+
+        # positions of the nucleotides before and after the variant on the
+        # reference genome
+        ref_pos_before = reference_positions[read_pos_before]
+
+        if ref_pos_before is None:
+            logger.warn(
+                "Missing reference pos for nucleotide before variant on read: %s",
                 locus_read)
-        return None
+            return None
 
-    ref_pos_after = reference_positions[read_pos_after]
+        ref_pos_after = reference_positions[read_pos_after]
 
-    if ref_pos_after is None:
-        logger.warn(
-            "Missing reference pos for nucleotide after variant on read: %s",
+        if ref_pos_after is None:
+            logger.warn(
+                "Missing reference pos for nucleotide after variant on read: %s",
                 locus_read)
-        return None
-
-    if n_ref == 0:
-        if ref_pos_after - ref_pos_before != 1:
-            # if the number of nucleotides skipped isn't the same
-            # as the number of reference nucleotides in the variant then
-            # don't use this read
-            logger.debug(
-                "Positions before (%d) and after (%d) variant should be adjacent on read %s",
+            return None
+
+        if n_ref == 0:
+            if ref_pos_after - ref_pos_before != 1:
+                # if the number of nucleotides skipped isn't the same
+                # as the number of reference nucleotides in the variant then
+                # don't use this read
+                logger.debug(
+                    "Positions before (%d) and after (%d) variant should be adjacent on read %s",
                     ref_pos_before,
                     ref_pos_after,
                     locus_read)
-            return None
-
-        # insertions require a sequence of non-aligned bases
-        # followed by the subsequence reference position
-        ref_positions_for_inserted = reference_positions[
-            read_pos_before + 1:read_pos_after]
-        if any(insert_pos is not None for insert_pos in ref_positions_for_inserted):
-            # all these inserted nucleotides should *not* align to the
-            # reference
-            logger.debug(
-                "Skipping read, inserted nucleotides shouldn't map to reference")
-            return None
-    else:
-        # substitutions and deletions
-        if ref_pos_after - ref_pos_before != n_ref + 1:
-            # if the number of nucleotides skipped isn't the same
-            # as the number of reference nucleotides in the variant then
-            # don't use this read
-            logger.debug(
-                "Positions before (%d) and after (%d) variant should be adjacent on read %s",
+                return None
+
+            # insertions require a sequence of non-aligned bases
+            # followed by the subsequence reference position
+            ref_positions_for_inserted = reference_positions[
+                read_pos_before + 1:read_pos_after]
+            if any(insert_pos is not None for insert_pos in ref_positions_for_inserted):
+                # all these inserted nucleotides should *not* align to the
+                # reference
+                logger.debug(
+                    "Skipping read, inserted nucleotides shouldn't map to reference")
+                return None
+        else:
+            # substitutions and deletions
+            if ref_pos_after - ref_pos_before != n_ref + 1:
+                # if the number of nucleotides skipped isn't the same
+                # as the number of reference nucleotides in the variant then
+                # don't use this read
+                logger.debug(
+                    ("Positions before (%d) and after (%d) variant should be "
+                     "adjacent on read %s"),
                     ref_pos_before,
                     ref_pos_after,
                     locus_read)
-            return None
+                return None
 
-    nucleotides_at_variant_locus = sequence[read_pos_before + 1:read_pos_after]
+        nucleotides_at_variant_locus = sequence[read_pos_before + 1:read_pos_after]
 
-    prefix = sequence[:read_pos_before + 1]
-    suffix = sequence[read_pos_after:]
+        prefix = sequence[:read_pos_before + 1]
+        suffix = sequence[read_pos_after:]
 
-    prefix, suffix = convert_from_bytes_if_necessary(prefix, suffix)
-    prefix, suffix = trim_N_nucleotides(prefix, suffix)
+        prefix, suffix = convert_from_bytes_if_necessary(prefix, suffix)
+        prefix, suffix = trim_N_nucleotides(prefix, suffix)
 
-    return AlleleRead(
-        prefix,
-        nucleotides_at_variant_locus,
-        suffix,
-        name=locus_read.name)
+        return cls(
+            prefix,
+            nucleotides_at_variant_locus,
+            suffix,
+            name=locus_read.name)
 
 def allele_reads_from_locus_reads(locus_reads, n_ref):
     """
-    Given a collection of ReadAtLocus objects, returns a
-    list of VariantRead objects (which are split into prefix/variant/suffix
-    nucleotides).
+    Given a collection of LocusRead objects, returns a
+    list of AlleleRead objects
+    (which are split into prefix/allele/suffix nucleotide strings).
 
     Parameters
     ----------
@@ -161,7 +160,7 @@ def allele_reads_from_locus_reads(locus_reads, n_ref):
     """
 
     for locus_read in locus_reads:
-        allele_read = allele_read_from_locus_read(locus_read, n_ref)
+        allele_read = AlleleRead.from_locus_read(locus_read, n_ref)
         if allele_read is None:
             continue
         else:
@@ -206,9 +205,11 @@ def reads_overlapping_variant(
     if chromosome is None:
         chromosome = variant.contig
 
-    logger.info("Gathering variant reads for variant %s (chromosome=%s)",
+    logger.info(
+        "Gathering variant reads for variant %s (chromosome = %s, gene names = %s)",
         variant,
-        chromosome)
+        chromosome,
+        variant.gene_names)
 
     base1_position, ref, alt = trim_variant(variant)
 
@@ -271,7 +272,9 @@ def reads_overlapping_variants(variants, samfile, **kwargs):
         else:
             logger.warn(
                 "Chromosome '%s' from variant %s not in alignment file %s",
-                    chromosome, variant, samfile.filename)
+                chromosome,
+                variant,
+                samfile.filename)
             continue
         allele_reads = reads_overlapping_variant(
             samfile=samfile,

diff --git a/isovar/assembly.py b/isovar/assembly.py
@@ -19,13 +19,9 @@
 # I'd rather just use list.copy but Python 2.7 doesn't support it
 from copy import copy
 
-from .read_helpers import group_unique_sequences
-from .variant_sequences import VariantSequence
-
 
 logger = logging.getLogger(__name__)
 
-
 MIN_OVERLAP_SIZE = 30
 
 def sort_by_decreasing_prefix_length(seq):
@@ -68,14 +64,6 @@ def sort_by_decreasing_total_length(seq):
     """
     return -len(seq.sequence)
 
-def combine_variant_sequences(variant_sequence1, variant_sequence2):
-    return VariantSequence(
-        prefix=variant_sequence1.prefix,
-        alt=variant_sequence1.alt,
-        suffix=variant_sequence2.suffix,
-        reads=set(
-            variant_sequence1.reads).union(set(variant_sequence2.reads)))
-
 def greedy_merge(variant_sequences, min_overlap_size=MIN_OVERLAP_SIZE):
     """
     Greedily merge overlapping sequences into longer sequences.
@@ -103,11 +91,7 @@ def greedy_merge(variant_sequences, min_overlap_size=MIN_OVERLAP_SIZE):
                     variant_sequence2,
                     min_overlap_size=min_overlap_size):
                 continue
-            combined = combine_variant_sequences(
-                variant_sequence1, variant_sequence2)
-            if len(combined) <= max(len(variant_sequence1), len(variant_sequence2)):
-                # if the combined sequence isn't any longer, then why bother?
-                continue
+            combined = variant_sequence1.combine(variant_sequence2)
             if combined.sequence in merged_variant_sequences:
                 # it's possible to get the same merged sequence from distinct
                 # input sequences
@@ -120,8 +104,7 @@ def greedy_merge(variant_sequences, min_overlap_size=MIN_OVERLAP_SIZE):
                 # sequence is supported by any one read.
                 existing_record_with_same_sequence = merged_variant_sequences[
                     combined.sequence]
-                combined_with_more_reads = combine_variant_sequences(
-                    existing_record_with_same_sequence,
+                combined_with_more_reads = existing_record_with_same_sequence.combine(
                     combined)
                 merged_variant_sequences[combined.sequence] = combined_with_more_reads
             else:
@@ -168,7 +151,7 @@ def sort_by_decreasing_read_count_and_sequence_lenth(variant_sequence):
     """
     return -len(variant_sequence.reads), -len(variant_sequence.sequence)
 
-def iterative_assembly(
+def iterative_overlap_assembly(
         variant_sequences,
         min_overlap_size=30,
         n_merge_iters=2):
@@ -185,9 +168,9 @@ def iterative_assembly(
 
         logger.info(
             "Iteration #%d of assembly: generated %d sequences (from %d)",
-                i + 1,
-                len(variant_sequences),
-                len(previous_sequences))
+            i + 1,
+            len(variant_sequences),
+            len(previous_sequences))
 
         if len(variant_sequences) == 0:
             # if the greedy merge procedure fails for all pairs of candidate
@@ -199,31 +182,9 @@ def iterative_assembly(
         variant_sequences = collapse_substrings(variant_sequences)
         logger.info(
             "After collapsing subsequences, %d distinct sequences left",
-                len(variant_sequences))
+            len(variant_sequences))
         if len(variant_sequences) == 1:
             # once we have only one sequence then there's no point trying
             # to further combine sequences
             break
     return variant_sequences
-
-def assemble_reads_into_variant_sequences(
-        variant_reads,
-        min_overlap_size=30,
-        n_merge_iters=2):
-    """
-    Turn a collection of AlleleRead objects into a collection of VariantSequence
-    objects, which are returned in order of (# supported reads, sequence length)
-    """
-    initial_variant_sequences = [
-        VariantSequence(
-            prefix=prefix,
-            alt=alt,
-            suffix=suffix,
-            reads=reads)
-        for (prefix, alt, suffix), reads in
-        group_unique_sequences(variant_reads).items()
-    ]
-    return iterative_assembly(
-        initial_variant_sequences,
-        min_overlap_size=min_overlap_size,
-        n_merge_iters=n_merge_iters)
diff --git a/isovar/cli/variant_sequences.py b/isovar/cli/variant_sequences.py
@@ -38,6 +38,11 @@ def add_variant_sequence_args(
         default=MIN_READS_SUPPORTING_VARIANT_CDNA_SEQUENCE,
         help="Minimum number of reads supporting a variant sequence (default %(default)s)")
 
+    rna_sequence_group.add_argument(
+        "--variant-sequence-assembly",
+        default=False,
+        action="store_true")
+
     # when cDNA sequence length can be inferred from a protein length then
     # we may want to omit this arg
     if add_sequence_length_arg:
@@ -76,7 +81,8 @@ def variant_sequences_generator_from_args(args):
     return reads_generator_to_sequences_generator(
         allele_reads_generator,
         min_reads_supporting_cdna_sequence=args.min_reads_supporting_variant_sequence,
-        preferred_sequence_length=args.cdna_sequence_length)
+        preferred_sequence_length=args.cdna_sequence_length,
+        variant_cdna_sequence_assembly=args.variant_sequence_assembly)
 
 def variant_sequences_dataframe_from_args(args):
     variant_sequences_generator = variant_sequences_generator_from_args(args)

diff --git a/isovar/default_parameters.py b/isovar/default_parameters.py
@@ -72,3 +72,8 @@
 
 # number of protein sequences we want to return per variant
 MAX_PROTEIN_SEQUENCES_PER_VARIANT = 1
+
+# run overlap assembly algorithm to construct variant
+# sequences from multiple reads which only partially
+# overlap (rather than fully spanning a coding sequence)
+VARIANT_CDNA_SEQUENCE_ASSEMBLY = False