Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Develop #124

Merged
merged 12 commits into from
Sep 18, 2024
2 changes: 1 addition & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,4 @@ dependencies:
- pip:
- pysam
- frozendict
- poetry>=1.3.*
- poetry==1.3.*
9 changes: 3 additions & 6 deletions neat/cli/commands/model_sequencing_error.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,12 +48,9 @@ def add_arguments(self, parser: argparse.ArgumentParser):
dest="quality_scores",
nargs='+',
required=False,
default=[2, 11, 25, 37],
help="Quality score bins. Enter as a space separeted list for binned scores "
"(-Q 2 11 24 37), or enter a single maximum score for a full range (-Q 42 gives all"
"scores from 1-42). The default is binned quality scores: [2, 11, 24, 37]. Note that"
"using quality score bins on an unbinned fastq will result in a binned model, at the"
"cost of some inaccuracy.")
default=42,
help="Quality score max. The default is 42. The lowest possible score is 1. To use binned "
"scoring, enter a space separated list of scores, e.g., -Q 2 15 23 37")

parser.add_argument('-m',
type=int,
Expand Down
80 changes: 28 additions & 52 deletions neat/gen_mut_model/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,18 +85,23 @@ def runner(reference_index,

# Pre-parsing to find all the matching chromosomes between ref and vcf
_LOG.info('Processing VCF file...')
matching_variants, matching_chromosomes = read_and_filter_variants(vcf_to_process,
reference_index,
ignore)
matching_variants, matching_chromosomes = read_and_filter_variants(
vcf_to_process,
reference_index,
ignore
)

if not matching_variants or not matching_chromosomes:
_LOG.error("No valid variants detected. Check names in vcf versus reference and/or bed.")
sys.exit(1)

trinuc_ref_count, bed_track_length = count_trinucleotides(reference_index,
bed,
outcounts_file,
matching_chromosomes)
trinuc_ref_count, bed_track_length = count_trinucleotides(
reference_index,
bed,
outcounts_file,
matching_chromosomes,
save_trinuc
)

if not trinuc_ref_count:
_LOG.error("No valid trinucleotides detected in reference.")
Expand Down Expand Up @@ -296,47 +301,6 @@ def runner(reference_index,
for k in sorted(snp_trans_freq):
_LOG.info(f'p({k[0]} --> {k[1]} | SNP occurs) = {snp_trans_freq[k]}')

# Save counts, if requested
if save_trinuc:
trinuc_output_data = {
'p(snp)': average_snp_freq,
'p(insertion)': average_insertion_frequency,
'p(deletion)': average_deletion_frequency,
'overall average mut rate': avg_mut_rate,
'total variants processed': total_var,
'homozygous probability': homozygous_frequency
}

for k in sorted(trinuc_mut_prob):
trinuc_output_data[f'p({k} mutates)'] = trinuc_mut_prob[k]

for k in sorted(trinuc_trans_probs):
trinuc_output_data[f'p({k[0]} --> {k[1]} | {k[0]} mutates)'] = trinuc_trans_probs[k]

for k in sorted(deletion_counts):
trinuc_output_data[f'p(del length = {abs(k)} | deletion occurs)'] = deletion_frequency[k]

for k in sorted(insertion_counts):
trinuc_output_data[f'p(insert length = {abs(k)} | insertion occurs)'] = insertion_freqency[k]

for k in sorted(snp_trans_freq):
trinuc_output_data[f'p({k[0]} --> {k[1]} | SNP occurs)'] = snp_trans_freq[k]

trinuc_output_path = Path(output)
trinuc_output_path = trinuc_output_path.with_suffix('')
trinuc_output_path = trinuc_output_path.with_suffix('.trinuc.pickle.gz')
_LOG.info(f'Saving trinucleotide counts to: {trinuc_output_path}')

with open_output(trinuc_output_path, 'w+') as trinuc_outfile:

pickle.dump(trinuc_output_data, trinuc_outfile)

# Human-readable content of file below
trinuc_outfile.write('\n')

for key, value in trinuc_output_data.items():
trinuc_outfile.write(f'{key}: {value}\n')

_LOG.info(f'p(snp) = {average_snp_freq}')
_LOG.info(f'p(insertion) = {average_insertion_frequency}')
_LOG.info(f'p(deletion) = {average_deletion_frequency}')
Expand All @@ -360,8 +324,8 @@ def runner(reference_index,
transition_matrix=snp_transition_bias,
trinuc_trans_matrices=trinuc_transition_bias,
trinuc_mut_bias=trinuc_mutation_probs,
insert_len_model=insertion_counts,
deletion_len_model=deletion_counts
insert_len_model=insertion_freqency,
deletion_len_model=deletion_frequency
)

print('\nSaving model...')
Expand Down Expand Up @@ -405,6 +369,8 @@ def compute_mut_runner(reference,
if outcounts:
validate_input_path(outcounts)
outcounts = Path(outcounts)
elif save_trinuc:
outcounts = Path(output + '.trinuc.pickle.gz')

print('Processing reference...')
reference_index = SeqIO.index(reference, 'fasta')
Expand Down Expand Up @@ -477,8 +443,18 @@ def compute_mut_runner(reference,
output = Path(output + '.pickle.gz')
validate_output_path(output, overwrite=overwrite_output)

runner(reference_index, vcf_to_process, vcf_columns, outcounts, show_trinuc, save_trinuc,
output, bed, human_sample, skip_common)
runner(
reference_index,
vcf_to_process,
vcf_columns,
outcounts,
show_trinuc,
save_trinuc,
output,
bed,
human_sample,
skip_common
)

if os.path.exists('temp.vcf'):
os.remove('temp.vcf')
Expand Down
18 changes: 10 additions & 8 deletions neat/gen_mut_model/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

import json
import sys
import pickle
import gzip

import numpy as np

Expand Down Expand Up @@ -165,14 +167,16 @@ def cluster_list(list_to_cluster, delta):
def count_trinucleotides(reference_index,
bed,
trinuc_counts,
matching_chroms):
matching_chroms,
save_trinuc_file):
"""
Counts the frequency of the various trinucleotide combinations in the dataset

:param dict reference_index: The index of the fasta, from SeqIO
:param Path bed: Full path to bed file, if using
:param Path trinuc_counts: Full path to existing trinucleotide counts file, if input, or the output path
:param list matching_chroms: List of matching chromosomes
:param bool save_trinuc_file: Whether to save the trinucleotide counts file.
:return dict, int: A dictionary of trinucleotides and counts, and the number of bases spanned
"""
# Data format: TRINUC: COUNT (e.g., "AAA": 12), where COUNT is the number of times trinuc was observed
Expand All @@ -183,11 +187,11 @@ def count_trinucleotides(reference_index,
track_len = 0

trinuc_counts_exists = False
save_trinuc_file = False
if trinuc_counts:
if trinuc_counts.is_file():
_LOG.info("Trinucleotide counts file exists, skipping save to avoid overwriting data.")
trinuc_counts_exists = True
save_trinuc_file = True
save_trinuc_file = False

if bed:
_LOG.info("Counting trinucleotide combinations in bed regions")
Expand All @@ -207,7 +211,6 @@ def count_trinucleotides(reference_index,
trinuc_ref_count = count_trinuc(trinuc, trinuc_ref_count)

# Solution to attribute error (needs to be checked)
# TODO remove ref_name from this dict
elif not trinuc_counts_exists:
_LOG.info("Counting trinucleotide combinations in reference.")
for ref_name in matching_chroms:
Expand All @@ -220,13 +223,12 @@ def count_trinucleotides(reference_index,
with open_output(trinuc_counts, 'w') as countfile:
_LOG.info('Saving trinuc counts to file...')
# Convert all values to writable formats
trinuc_ref_count = {str(x): int(y) for x, y in trinuc_ref_count.items()}
countfile.write(json.dumps(trinuc_ref_count))
pickle.dump(trinuc_ref_count, countfile)

else:
_LOG.info(f'Loading file: {trinuc_counts}.')
with open_input(trinuc_counts) as counts:
trinuc_ref_count = json.load(counts)
with gzip.open(trinuc_counts, 'rb') as counts:
trinuc_ref_count = pickle.load(counts)
if save_trinuc_file:
_LOG.warning('Existing trinucleotide file will not be changed.')

Expand Down
15 changes: 10 additions & 5 deletions neat/model_sequencing_error/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,9 +62,11 @@ def model_seq_err_runner(
_LOG.debug(f"Quality offset: {offset}")

final_quality_scores: list
binned_scores = False
if len(qual_scores) == 1:
final_quality_scores = list(range(1, qual_scores[0] + 1))
else:
binned_scores = True
final_quality_scores = sorted(qual_scores)

_LOG.debug(f'Quality scores: {final_quality_scores}')
Expand Down Expand Up @@ -109,11 +111,14 @@ def model_seq_err_runner(
for file in files:
file_num += 1
_LOG.info(f'Reading file {file_num} of {len(files)}')
parameters_by_position, file_avg_error, read_length = parse_file(file,
final_quality_scores,
num_records_to_process,
offset,
read_length)
parameters_by_position, file_avg_error, read_length = parse_file(
file,
final_quality_scores,
num_records_to_process,
offset,
read_length,
binned_scores
)

read_parameters.append(parameters_by_position)
average_errors.append(file_avg_error)
Expand Down
6 changes: 1 addition & 5 deletions neat/model_sequencing_error/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@

from scipy.stats import mode
from ..common import open_input
from ..models import take_closest

__all__ = [
"parse_file"
Expand Down Expand Up @@ -139,10 +138,7 @@ def parse_file(input_file: str, quality_scores: list, max_reads: int, qual_offse

for j in range(read_length):
# The qualities of each read_position_scores
quality_bin = take_closest(quality_scores, qualities_to_check[j])
bin_index = quality_scores.index(quality_bin)
temp_q_count[j][bin_index] += 1
qual_score_counter[quality_bin] += 1
qual_score_counter[qualities_to_check[j]] += 1

if records_read % quarters == 0:
_LOG.info(f'reading data: {(records_read / max_reads) * 100:.0f}%')
Expand Down
1 change: 0 additions & 1 deletion neat/models/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1 @@
from .models import *
from .utils import *
4 changes: 2 additions & 2 deletions neat/models/default_sequencing_error_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@
[0.2505, 0.2552, 0.4943, 0.0]]
)

# This list may not be the final list
default_quality_scores = np.array([2, 11, 25, 37])
# All numbers from 1-42
default_quality_scores = np.arange(1, 43)

# This puts a high probability toward getting a maximum quality score. The current values
# should be considered temporary. We're working on final values.
Expand Down
35 changes: 22 additions & 13 deletions neat/models/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
from ..common import TRINUC_IND, ALLOWED_NUCL, NUC_IND, DINUC_IND
from .default_mutation_model import *
from .default_sequencing_error_model import *
from .utils import bin_scores, take_closest

__all__ = [
"MutationModel",
Expand Down Expand Up @@ -60,7 +59,9 @@ class InsertionModel(VariantModel):
def __init__(self,
insert_len_model: dict[int: float, ...],
rng: Generator = None):
self.insert_len_model = insert_len_model
# Creating probabilities from the weights
tot = sum(insert_len_model.values())
self.insertion_len_model = {key: val / tot for key, val in insert_len_model.items()}
self.rng = rng

def get_insertion_length(self, size: int = None) -> int | list[int, ...]:
Expand All @@ -71,8 +72,8 @@ def get_insertion_length(self, size: int = None) -> int | list[int, ...]:
Greater than 1 returns a list of ints.
:return: int or list of ints.
"""
return self.rng.choice(a=list(self.insert_len_model),
p=[*self.insert_len_model.values()],
return self.rng.choice(a=list(self.insertion_len_model),
p=[*self.insertion_len_model.values()],
size=size, shuffle=False)


Expand All @@ -91,7 +92,9 @@ class DeletionModel(VariantModel):
def __init__(self,
deletion_len_model: dict[int: float, ...],
rng: Generator = None):
self.deletion_len_model = deletion_len_model
# Creating probabilities from the weights
tot = sum(deletion_len_model.values())
self.deletion_len_model = {key: val/tot for key, val in deletion_len_model.items()}
self.rng = rng

def get_deletion_length(self, size: int = None) -> int | list[int, ...]:
Expand Down Expand Up @@ -281,6 +284,9 @@ def generate_snv(self, trinucleotide: Seq, reference_location: int) -> SingleNuc
transition_matrix = self.trinuc_trans_matrices[DINUC_IND[trinucleotide[0] + "_" + trinucleotide[2]]]
# then determine the trans probs based on the middle nucleotide
transition_probs = transition_matrix[NUC_IND[trinucleotide[1]]]
# Creating probabilities from the weights
transition_sum = sum(transition_probs)
transition_probs = [x/transition_sum for x in transition_probs]
# Now pick a random alternate, weighted by the probabilities
alt = self.rng.choice(ALLOWED_NUCL, p=transition_probs)
temp_snv = SingleNucleotideVariant(reference_location, alt=alt)
Expand Down Expand Up @@ -376,11 +382,8 @@ def __init__(self,
self.insertion_model = insertion_model
self.uniform_quality_score = None
if self.is_uniform:
# bin scores returns a list, so we need the first (only) element of the list
converted_avg_err = bin_scores(self.quality_scores,
[int(-10. * np.log10(self.average_error))])[0]
# Set score to the lowest of the max of the quality scores and the bin closest to the input avg error.
self.uniform_quality_score = min([max(self.quality_scores), converted_avg_err])
# Set score to the lowest of the max of the quality scores and the input avg error.
self.uniform_quality_score = min([max(self.quality_scores), int(-10. * np.log10(self.average_error) + 0.5)])
self.rng = rng

def get_sequencing_errors(self,
Expand Down Expand Up @@ -498,7 +501,13 @@ def get_quality_scores(self,
for i in quality_index_map:
score = self.rng.normal(self.quality_score_probabilities[i][0],
scale=self.quality_score_probabilities[i][1])
score = take_closest(self.quality_scores, score)
# make sure score is in range and an int
score = round(score)
if score > 42:
score = 42
if score < 1:
score = 1

temp_qual_array.append(score)

if self.rescale_qualities:
Expand All @@ -509,9 +518,9 @@ def get_quality_scores(self,
self.quality_score_error_rate[n]) + 0.5)])
for n in temp_qual_array]
# Now rebin the quality scores.
temp_qual_array = np.array(bin_scores(self.quality_scores, rescaled_quals))
temp_qual_array = np.array(rescaled_quals)
else:
temp_qual_array = np.array(bin_scores(self.quality_scores, temp_qual_array))
temp_qual_array = np.array(temp_qual_array)

return temp_qual_array[:input_read_length]

Expand Down
Loading
Loading