Skip to content

Commit

Permalink
Merge pull request #57 from hammerlab/use-assembly-and-trimming
Browse files Browse the repository at this point in the history
overlap assembly and coverage trimming to create VariantSequence candidates
  • Loading branch information
iskandr authored Nov 8, 2016
2 parents f60e654 + dcfd2ed commit 1dff463
Show file tree
Hide file tree
Showing 22 changed files with 1,004 additions and 753 deletions.
6 changes: 6 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,12 @@ language: python
python:
- "2.7"
- "3.4"
cache:
pip: true
# cache directory used for Ensembl downloads of GTF and FASTA files
# along with the indexed db of intervals and ID mappings and pickles
# of sequence dictionaries
directories: /home/travis/.cache/pyensembl/
before_install:
# Commands below copied from: http://conda.pydata.org/docs/travis.html
# We do this conditionally because it saves us some downloading if the
Expand Down
2 changes: 1 addition & 1 deletion isovar/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,4 @@

from __future__ import print_function, division, absolute_import

__version__ = "0.2.4"
__version__ = "0.3.0"
177 changes: 90 additions & 87 deletions isovar/allele_reads.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,18 +34,15 @@

logger = logging.getLogger(__name__)


# subclassing from namedtuple to get a lightweight object with built-in
# hashing and comparison while also being able to add methods
AlleleReadFields = namedtuple(
AlleleReadBase = namedtuple(
"AlleleRead",
"prefix allele suffix name sequence")

class AlleleRead(AlleleReadFields):
class AlleleRead(AlleleReadBase):
def __new__(cls, prefix, allele, suffix, name):
# construct sequence from prefix + alt + suffix
return AlleleReadFields.__new__(
cls,
return AlleleReadBase(
prefix=prefix,
allele=allele,
suffix=suffix,
Expand All @@ -55,100 +52,102 @@ def __new__(cls, prefix, allele, suffix, name):
def __len__(self):
return len(self.prefix) + len(self.allele) + len(self.suffix)

def allele_read_from_locus_read(locus_read, n_ref):
"""
Given a single ReadAtLocus object, return either an AlleleRead or None
Parameters
----------
locus_read : LocusRead
Read which overlaps a variant locus but doesn't necessarily contain the
alternate nucleotides
n_ref : int
Number of reference positions we are expecting to be modified or
deleted (for insertions this should be 0)
"""
sequence = locus_read.sequence
reference_positions = locus_read.reference_positions

# positions of the nucleotides before and after the variant within
# the read sequence
read_pos_before = locus_read.base0_read_position_before_variant
read_pos_after = locus_read.base0_read_position_after_variant

# positions of the nucleotides before and after the variant on the
# reference genome
ref_pos_before = reference_positions[read_pos_before]

if ref_pos_before is None:
logger.warn(
"Missing reference pos for nucleotide before variant on read: %s",
@classmethod
def from_locus_read(cls, locus_read, n_ref):
"""
Given a single LocusRead object, return either an AlleleRead or None
Parameters
----------
locus_read : LocusRead
Read which overlaps a variant locus but doesn't necessarily contain the
alternate nucleotides
n_ref : int
Number of reference positions we are expecting to be modified or
deleted (for insertions this should be 0)
"""
sequence = locus_read.sequence
reference_positions = locus_read.reference_positions

# positions of the nucleotides before and after the variant within
# the read sequence
read_pos_before = locus_read.base0_read_position_before_variant
read_pos_after = locus_read.base0_read_position_after_variant

# positions of the nucleotides before and after the variant on the
# reference genome
ref_pos_before = reference_positions[read_pos_before]

if ref_pos_before is None:
logger.warn(
"Missing reference pos for nucleotide before variant on read: %s",
locus_read)
return None
return None

ref_pos_after = reference_positions[read_pos_after]
ref_pos_after = reference_positions[read_pos_after]

if ref_pos_after is None:
logger.warn(
"Missing reference pos for nucleotide after variant on read: %s",
if ref_pos_after is None:
logger.warn(
"Missing reference pos for nucleotide after variant on read: %s",
locus_read)
return None

if n_ref == 0:
if ref_pos_after - ref_pos_before != 1:
# if the number of nucleotides skipped isn't the same
# as the number of reference nucleotides in the variant then
# don't use this read
logger.debug(
"Positions before (%d) and after (%d) variant should be adjacent on read %s",
return None

if n_ref == 0:
if ref_pos_after - ref_pos_before != 1:
# if the number of nucleotides skipped isn't the same
# as the number of reference nucleotides in the variant then
# don't use this read
logger.debug(
"Positions before (%d) and after (%d) variant should be adjacent on read %s",
ref_pos_before,
ref_pos_after,
locus_read)
return None

# insertions require a sequence of non-aligned bases
# followed by the subsequence reference position
ref_positions_for_inserted = reference_positions[
read_pos_before + 1:read_pos_after]
if any(insert_pos is not None for insert_pos in ref_positions_for_inserted):
# all these inserted nucleotides should *not* align to the
# reference
logger.debug(
"Skipping read, inserted nucleotides shouldn't map to reference")
return None
else:
# substitutions and deletions
if ref_pos_after - ref_pos_before != n_ref + 1:
# if the number of nucleotides skipped isn't the same
# as the number of reference nucleotides in the variant then
# don't use this read
logger.debug(
"Positions before (%d) and after (%d) variant should be adjacent on read %s",
return None

# insertions require a sequence of non-aligned bases
# followed by the subsequence reference position
ref_positions_for_inserted = reference_positions[
read_pos_before + 1:read_pos_after]
if any(insert_pos is not None for insert_pos in ref_positions_for_inserted):
# all these inserted nucleotides should *not* align to the
# reference
logger.debug(
"Skipping read, inserted nucleotides shouldn't map to reference")
return None
else:
# substitutions and deletions
if ref_pos_after - ref_pos_before != n_ref + 1:
# if the number of nucleotides skipped isn't the same
# as the number of reference nucleotides in the variant then
# don't use this read
logger.debug(
("Positions before (%d) and after (%d) variant should be "
"adjacent on read %s"),
ref_pos_before,
ref_pos_after,
locus_read)
return None
return None

nucleotides_at_variant_locus = sequence[read_pos_before + 1:read_pos_after]
nucleotides_at_variant_locus = sequence[read_pos_before + 1:read_pos_after]

prefix = sequence[:read_pos_before + 1]
suffix = sequence[read_pos_after:]
prefix = sequence[:read_pos_before + 1]
suffix = sequence[read_pos_after:]

prefix, suffix = convert_from_bytes_if_necessary(prefix, suffix)
prefix, suffix = trim_N_nucleotides(prefix, suffix)
prefix, suffix = convert_from_bytes_if_necessary(prefix, suffix)
prefix, suffix = trim_N_nucleotides(prefix, suffix)

return AlleleRead(
prefix,
nucleotides_at_variant_locus,
suffix,
name=locus_read.name)
return cls(
prefix,
nucleotides_at_variant_locus,
suffix,
name=locus_read.name)

def allele_reads_from_locus_reads(locus_reads, n_ref):
"""
Given a collection of ReadAtLocus objects, returns a
list of VariantRead objects (which are split into prefix/variant/suffix
nucleotides).
Given a collection of LocusRead objects, returns a
list of AlleleRead objects
(which are split into prefix/allele/suffix nucleotide strings).
Parameters
----------
Expand All @@ -161,7 +160,7 @@ def allele_reads_from_locus_reads(locus_reads, n_ref):
"""

for locus_read in locus_reads:
allele_read = allele_read_from_locus_read(locus_read, n_ref)
allele_read = AlleleRead.from_locus_read(locus_read, n_ref)
if allele_read is None:
continue
else:
Expand Down Expand Up @@ -206,9 +205,11 @@ def reads_overlapping_variant(
if chromosome is None:
chromosome = variant.contig

logger.info("Gathering variant reads for variant %s (chromosome=%s)",
logger.info(
"Gathering variant reads for variant %s (chromosome = %s, gene names = %s)",
variant,
chromosome)
chromosome,
variant.gene_names)

base1_position, ref, alt = trim_variant(variant)

Expand Down Expand Up @@ -271,7 +272,9 @@ def reads_overlapping_variants(variants, samfile, **kwargs):
else:
logger.warn(
"Chromosome '%s' from variant %s not in alignment file %s",
chromosome, variant, samfile.filename)
chromosome,
variant,
samfile.filename)
continue
allele_reads = reads_overlapping_variant(
samfile=samfile,
Expand Down
53 changes: 7 additions & 46 deletions isovar/assembly.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,9 @@
# I'd rather just use list.copy but Python 2.7 doesn't support it
from copy import copy

from .read_helpers import group_unique_sequences
from .variant_sequences import VariantSequence


logger = logging.getLogger(__name__)


MIN_OVERLAP_SIZE = 30

def sort_by_decreasing_prefix_length(seq):
Expand Down Expand Up @@ -68,14 +64,6 @@ def sort_by_decreasing_total_length(seq):
"""
return -len(seq.sequence)

def combine_variant_sequences(variant_sequence1, variant_sequence2):
return VariantSequence(
prefix=variant_sequence1.prefix,
alt=variant_sequence1.alt,
suffix=variant_sequence2.suffix,
reads=set(
variant_sequence1.reads).union(set(variant_sequence2.reads)))

def greedy_merge(variant_sequences, min_overlap_size=MIN_OVERLAP_SIZE):
"""
Greedily merge overlapping sequences into longer sequences.
Expand Down Expand Up @@ -103,11 +91,7 @@ def greedy_merge(variant_sequences, min_overlap_size=MIN_OVERLAP_SIZE):
variant_sequence2,
min_overlap_size=min_overlap_size):
continue
combined = combine_variant_sequences(
variant_sequence1, variant_sequence2)
if len(combined) <= max(len(variant_sequence1), len(variant_sequence2)):
# if the combined sequence isn't any longer, then why bother?
continue
combined = variant_sequence1.combine(variant_sequence2)
if combined.sequence in merged_variant_sequences:
# it's possible to get the same merged sequence from distinct
# input sequences
Expand All @@ -120,8 +104,7 @@ def greedy_merge(variant_sequences, min_overlap_size=MIN_OVERLAP_SIZE):
# sequence is supported by any one read.
existing_record_with_same_sequence = merged_variant_sequences[
combined.sequence]
combined_with_more_reads = combine_variant_sequences(
existing_record_with_same_sequence,
combined_with_more_reads = existing_record_with_same_sequence.combine(
combined)
merged_variant_sequences[combined.sequence] = combined_with_more_reads
else:
Expand Down Expand Up @@ -168,7 +151,7 @@ def sort_by_decreasing_read_count_and_sequence_lenth(variant_sequence):
"""
return -len(variant_sequence.reads), -len(variant_sequence.sequence)

def iterative_assembly(
def iterative_overlap_assembly(
variant_sequences,
min_overlap_size=30,
n_merge_iters=2):
Expand All @@ -185,9 +168,9 @@ def iterative_assembly(

logger.info(
"Iteration #%d of assembly: generated %d sequences (from %d)",
i + 1,
len(variant_sequences),
len(previous_sequences))
i + 1,
len(variant_sequences),
len(previous_sequences))

if len(variant_sequences) == 0:
# if the greedy merge procedure fails for all pairs of candidate
Expand All @@ -199,31 +182,9 @@ def iterative_assembly(
variant_sequences = collapse_substrings(variant_sequences)
logger.info(
"After collapsing subsequences, %d distinct sequences left",
len(variant_sequences))
len(variant_sequences))
if len(variant_sequences) == 1:
# once we have only one sequence then there's no point trying
# to further combine sequences
break
return variant_sequences

def assemble_reads_into_variant_sequences(
variant_reads,
min_overlap_size=30,
n_merge_iters=2):
"""
Turn a collection of AlleleRead objects into a collection of VariantSequence
objects, which are returned in order of (# supported reads, sequence length)
"""
initial_variant_sequences = [
VariantSequence(
prefix=prefix,
alt=alt,
suffix=suffix,
reads=reads)
for (prefix, alt, suffix), reads in
group_unique_sequences(variant_reads).items()
]
return iterative_assembly(
initial_variant_sequences,
min_overlap_size=min_overlap_size,
n_merge_iters=n_merge_iters)
8 changes: 7 additions & 1 deletion isovar/cli/variant_sequences.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,11 @@ def add_variant_sequence_args(
default=MIN_READS_SUPPORTING_VARIANT_CDNA_SEQUENCE,
help="Minimum number of reads supporting a variant sequence (default %(default)s)")

rna_sequence_group.add_argument(
"--variant-sequence-assembly",
default=False,
action="store_true")

# when cDNA sequence length can be inferred from a protein length then
# we may want to omit this arg
if add_sequence_length_arg:
Expand Down Expand Up @@ -76,7 +81,8 @@ def variant_sequences_generator_from_args(args):
return reads_generator_to_sequences_generator(
allele_reads_generator,
min_reads_supporting_cdna_sequence=args.min_reads_supporting_variant_sequence,
preferred_sequence_length=args.cdna_sequence_length)
preferred_sequence_length=args.cdna_sequence_length,
variant_cdna_sequence_assembly=args.variant_sequence_assembly)

def variant_sequences_dataframe_from_args(args):
variant_sequences_generator = variant_sequences_generator_from_args(args)
Expand Down
5 changes: 5 additions & 0 deletions isovar/default_parameters.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,3 +72,8 @@

# number of protein sequences we want to return per variant
MAX_PROTEIN_SEQUENCES_PER_VARIANT = 1

# run overlap assembly algorithm to construct variant
# sequences from multiple reads which only partially
# overlap (rather than fully spanning a coding sequence)
VARIANT_CDNA_SEQUENCE_ASSEMBLY = False
Loading

0 comments on commit 1dff463

Please sign in to comment.