From 3797ca9fb3de1d2faea1774d68cc41138a706c3b Mon Sep 17 00:00:00 2001 From: Joshua Allen Date: Sat, 13 Jul 2024 14:55:54 -0500 Subject: [PATCH 01/11] Fixing how neat was counting input variants --- neat/read_simulator/utils/vcf_func.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/neat/read_simulator/utils/vcf_func.py b/neat/read_simulator/utils/vcf_func.py index a2c0bb2..3d900a9 100755 --- a/neat/read_simulator/utils/vcf_func.py +++ b/neat/read_simulator/utils/vcf_func.py @@ -73,6 +73,7 @@ def parse_input_vcf(input_dict: dict[str: ContigVariants], n_skipped = 0 mismatched = 0 + records_found = 0 # maximum number of columns we are interested in. Used for trimming unwanted samples. max_col = 7 with open_input(vcf_path) as f: @@ -276,6 +277,9 @@ def parse_input_vcf(input_dict: dict[str: ContigVariants], if rc == 1: _LOG.warning(f"Input variant skipped because a variant already existed at that location:" f"{chrom}: {location} ({temp_variant})") + n_skipped += 1 + else: + records_found += 1 if tumor_normal: """ @@ -283,7 +287,8 @@ def parse_input_vcf(input_dict: dict[str: ContigVariants], """ pass - _LOG.info(f'Found {len(input_dict)} variants in input VCF.') + _LOG.info(f'Found {records_found} variants in input VCF.') _LOG.info(f'Skipped {n_skipped} variants because of multiples at the same location') + _LOG.info(f'Skipped {mismatched} variants because of a mismatch between Ref and reference.') return sample_columns From 5d23806c3c1989a729daba2cb7a41918f0cc9f98 Mon Sep 17 00:00:00 2001 From: Joshua Allen Date: Sat, 13 Jul 2024 15:53:30 -0500 Subject: [PATCH 02/11] Updating gen_mut_model to output and read proper trinucleotide count files --- neat/gen_mut_model/runner.py | 76 ++++++++++++------------------------ neat/gen_mut_model/utils.py | 18 +++++---- 2 files changed, 36 insertions(+), 58 deletions(-) diff --git a/neat/gen_mut_model/runner.py b/neat/gen_mut_model/runner.py index 5fb1b22..2394c66 100644 --- a/neat/gen_mut_model/runner.py +++ b/neat/gen_mut_model/runner.py @@ -85,18 +85,23 @@ def runner(reference_index, # Pre-parsing to find all the matching chromosomes between ref and vcf _LOG.info('Processing VCF file...') - matching_variants, matching_chromosomes = read_and_filter_variants(vcf_to_process, - reference_index, - ignore) + matching_variants, matching_chromosomes = read_and_filter_variants( + vcf_to_process, + reference_index, + ignore + ) if not matching_variants or not matching_chromosomes: _LOG.error("No valid variants detected. Check names in vcf versus reference and/or bed.") sys.exit(1) - trinuc_ref_count, bed_track_length = count_trinucleotides(reference_index, - bed, - outcounts_file, - matching_chromosomes) + trinuc_ref_count, bed_track_length = count_trinucleotides( + reference_index, + bed, + outcounts_file, + matching_chromosomes, + save_trinuc + ) if not trinuc_ref_count: _LOG.error("No valid trinucleotides detected in reference.") @@ -296,47 +301,6 @@ def runner(reference_index, for k in sorted(snp_trans_freq): _LOG.info(f'p({k[0]} --> {k[1]} | SNP occurs) = {snp_trans_freq[k]}') - # Save counts, if requested - if save_trinuc: - trinuc_output_data = { - 'p(snp)': average_snp_freq, - 'p(insertion)': average_insertion_frequency, - 'p(deletion)': average_deletion_frequency, - 'overall average mut rate': avg_mut_rate, - 'total variants processed': total_var, - 'homozygous probability': homozygous_frequency - } - - for k in sorted(trinuc_mut_prob): - trinuc_output_data[f'p({k} mutates)'] = trinuc_mut_prob[k] - - for k in sorted(trinuc_trans_probs): - trinuc_output_data[f'p({k[0]} --> {k[1]} | {k[0]} mutates)'] = trinuc_trans_probs[k] - - for k in sorted(deletion_counts): - trinuc_output_data[f'p(del length = {abs(k)} | deletion occurs)'] = deletion_frequency[k] - - for k in sorted(insertion_counts): - trinuc_output_data[f'p(insert length = {abs(k)} | insertion occurs)'] = insertion_freqency[k] - - for k in sorted(snp_trans_freq): - trinuc_output_data[f'p({k[0]} --> {k[1]} | SNP occurs)'] = snp_trans_freq[k] - - trinuc_output_path = Path(output) - trinuc_output_path = trinuc_output_path.with_suffix('') - trinuc_output_path = trinuc_output_path.with_suffix('.trinuc.pickle.gz') - _LOG.info(f'Saving trinucleotide counts to: {trinuc_output_path}') - - with open_output(trinuc_output_path, 'w+') as trinuc_outfile: - - pickle.dump(trinuc_output_data, trinuc_outfile) - - # Human-readable content of file below - trinuc_outfile.write('\n') - - for key, value in trinuc_output_data.items(): - trinuc_outfile.write(f'{key}: {value}\n') - _LOG.info(f'p(snp) = {average_snp_freq}') _LOG.info(f'p(insertion) = {average_insertion_frequency}') _LOG.info(f'p(deletion) = {average_deletion_frequency}') @@ -405,6 +369,8 @@ def compute_mut_runner(reference, if outcounts: validate_input_path(outcounts) outcounts = Path(outcounts) + elif save_trinuc: + outcounts = Path(output + '.trinuc.pickle.gz') print('Processing reference...') reference_index = SeqIO.index(reference, 'fasta') @@ -477,8 +443,18 @@ def compute_mut_runner(reference, output = Path(output + '.pickle.gz') validate_output_path(output, overwrite=overwrite_output) - runner(reference_index, vcf_to_process, vcf_columns, outcounts, show_trinuc, save_trinuc, - output, bed, human_sample, skip_common) + runner( + reference_index, + vcf_to_process, + vcf_columns, + outcounts, + show_trinuc, + save_trinuc, + output, + bed, + human_sample, + skip_common + ) if os.path.exists('temp.vcf'): os.remove('temp.vcf') diff --git a/neat/gen_mut_model/utils.py b/neat/gen_mut_model/utils.py index 78356d8..3680753 100644 --- a/neat/gen_mut_model/utils.py +++ b/neat/gen_mut_model/utils.py @@ -4,6 +4,8 @@ import json import sys +import pickle +import gzip import numpy as np @@ -165,7 +167,8 @@ def cluster_list(list_to_cluster, delta): def count_trinucleotides(reference_index, bed, trinuc_counts, - matching_chroms): + matching_chroms, + save_trinuc_file): """ Counts the frequency of the various trinucleotide combinations in the dataset @@ -173,6 +176,7 @@ def count_trinucleotides(reference_index, :param Path bed: Full path to bed file, if using :param Path trinuc_counts: Full path to existing trinculeotide counts file, if input, or the output path :param list matching_chroms: List of matching chromosomes + :param save_trinuc_file: Boolean determines whether to save trinucleotide counts file. :return dict, int: A dictionary of trinculeotides and counts, the number of bases spanned """ # Data format: TRINUC: COUNT (e.g., "AAA": 12), where COUNT is the number of times trinuc was observed @@ -183,11 +187,11 @@ def count_trinucleotides(reference_index, track_len = 0 trinuc_counts_exists = False - save_trinuc_file = False if trinuc_counts: if trinuc_counts.is_file(): + _LOG.info("Trinucleotide counts file exists, skipping save to avoid overwriting data.") trinuc_counts_exists = True - save_trinuc_file = True + save_trinuc_file = False if bed: _LOG.info("Counting trinucleotide combinations in bed regions") @@ -207,7 +211,6 @@ def count_trinucleotides(reference_index, trinuc_ref_count = count_trinuc(trinuc, trinuc_ref_count) # Solution to attribute error (needs to be checked) - # TODO remove ref_name from this dict elif not trinuc_counts_exists: _LOG.info("Counting trinucleotide combinations in reference.") for ref_name in matching_chroms: @@ -220,13 +223,12 @@ def count_trinucleotides(reference_index, with open_output(trinuc_counts, 'w') as countfile: _LOG.info('Saving trinuc counts to file...') # Convert all values to writable formats - trinuc_ref_count = {str(x): int(y) for x, y in trinuc_ref_count.items()} - countfile.write(json.dumps(trinuc_ref_count)) + pickle.dump(trinuc_ref_count, countfile) else: _LOG.info(f'Loading file: {trinuc_counts}.') - with open_input(trinuc_counts) as counts: - trinuc_ref_count = json.load(counts) + with gzip.open(trinuc_counts, 'rb') as counts: + trinuc_ref_count = pickle.load(counts) if save_trinuc_file: _LOG.warning('Existing trinucelotide file will not be changed.') From 0793f4dd0e75f56dc55a455a72b25f92a97166fd Mon Sep 17 00:00:00 2001 From: Joshua Allen Date: Mon, 15 Jul 2024 18:03:30 -0500 Subject: [PATCH 03/11] Removed binned scoring --- neat/cli/commands/model_sequencing_error.py | 8 +- neat/model_sequencing_error/runner.py | 15 ++-- neat/model_sequencing_error/utils.py | 6 +- neat/models/__init__.py | 1 - neat/models/default_sequencing_error_model.py | 4 +- neat/models/models.py | 20 ++--- neat/models/utils.py | 75 ------------------- neat/read_simulator/utils/generate_reads.py | 17 ++--- 8 files changed, 32 insertions(+), 114 deletions(-) delete mode 100644 neat/models/utils.py diff --git a/neat/cli/commands/model_sequencing_error.py b/neat/cli/commands/model_sequencing_error.py index 1c3ef2a..0114a6a 100644 --- a/neat/cli/commands/model_sequencing_error.py +++ b/neat/cli/commands/model_sequencing_error.py @@ -48,12 +48,8 @@ def add_arguments(self, parser: argparse.ArgumentParser): dest="quality_scores", nargs='+', required=False, - default=[2, 11, 25, 37], - help="Quality score bins. Enter as a space separeted list for binned scores " - "(-Q 2 11 24 37), or enter a single maximum score for a full range (-Q 42 gives all" - "scores from 1-42). The default is binned quality scores: [2, 11, 24, 37]. Note that" - "using quality score bins on an unbinned fastq will result in a binned model, at the" - "cost of some inaccuracy.") + default=42, + help="Quality score max. The default 42. The lowest possible score is 1") parser.add_argument('-m', type=int, diff --git a/neat/model_sequencing_error/runner.py b/neat/model_sequencing_error/runner.py index 0326f0f..b12ac24 100644 --- a/neat/model_sequencing_error/runner.py +++ b/neat/model_sequencing_error/runner.py @@ -62,9 +62,11 @@ def model_seq_err_runner( _LOG.debug(f"Quality offset: {offset}") final_quality_scores: list + binned_scores = False if len(qual_scores) == 1: final_quality_scores = list(range(1, qual_scores[0] + 1)) else: + binned_scores = True final_quality_scores = sorted(qual_scores) _LOG.debug(f'Quality scores: {final_quality_scores}') @@ -109,11 +111,14 @@ def model_seq_err_runner( for file in files: file_num += 1 _LOG.info(f'Reading file {file_num} of {len(files)}') - parameters_by_position, file_avg_error, read_length = parse_file(file, - final_quality_scores, - num_records_to_process, - offset, - read_length) + parameters_by_position, file_avg_error, read_length = parse_file( + file, + final_quality_scores, + num_records_to_process, + offset, + read_length, + binned_scores + ) read_parameters.append(parameters_by_position) average_errors.append(file_avg_error) diff --git a/neat/model_sequencing_error/utils.py b/neat/model_sequencing_error/utils.py index b494d4d..2d99853 100644 --- a/neat/model_sequencing_error/utils.py +++ b/neat/model_sequencing_error/utils.py @@ -10,7 +10,6 @@ from scipy.stats import mode from ..common import open_input -from ..models import take_closest __all__ = [ "parse_file" @@ -139,10 +138,7 @@ def parse_file(input_file: str, quality_scores: list, max_reads: int, qual_offse for j in range(read_length): # The qualities of each read_position_scores - quality_bin = take_closest(quality_scores, qualities_to_check[j]) - bin_index = quality_scores.index(quality_bin) - temp_q_count[j][bin_index] += 1 - qual_score_counter[quality_bin] += 1 + qual_score_counter[qualities_to_check[j]] += 1 if records_read % quarters == 0: _LOG.info(f'reading data: {(records_read / max_reads) * 100:.0f}%') diff --git a/neat/models/__init__.py b/neat/models/__init__.py index f4c7f0f..aed4fa3 100644 --- a/neat/models/__init__.py +++ b/neat/models/__init__.py @@ -1,2 +1 @@ from .models import * -from .utils import * diff --git a/neat/models/default_sequencing_error_model.py b/neat/models/default_sequencing_error_model.py index 9924edd..0883718 100644 --- a/neat/models/default_sequencing_error_model.py +++ b/neat/models/default_sequencing_error_model.py @@ -17,8 +17,8 @@ [0.2505, 0.2552, 0.4943, 0.0]] ) -# This list may not be the final list -default_quality_scores = np.array([2, 11, 25, 37]) +# All numbers from 1-42 +default_quality_scores = np.arange(1, 43) # This puts a high probability toward getting a maximum quality score. The current values # should be considered temporary. We're working on final values. diff --git a/neat/models/models.py b/neat/models/models.py index 5f19a80..be3b1bd 100644 --- a/neat/models/models.py +++ b/neat/models/models.py @@ -18,7 +18,6 @@ from ..common import TRINUC_IND, ALLOWED_NUCL, NUC_IND, DINUC_IND from .default_mutation_model import * from .default_sequencing_error_model import * -from .utils import bin_scores, take_closest __all__ = [ "MutationModel", @@ -376,11 +375,8 @@ def __init__(self, self.insertion_model = insertion_model self.uniform_quality_score = None if self.is_uniform: - # bin scores returns a list, so we need the first (only) element of the list - converted_avg_err = bin_scores(self.quality_scores, - [int(-10. * np.log10(self.average_error))])[0] - # Set score to the lowest of the max of the quality scores and the bin closest to the input avg error. - self.uniform_quality_score = min([max(self.quality_scores), converted_avg_err]) + # Set score to the lowest of the max of the quality scores and the input avg error. + self.uniform_quality_score = min([max(self.quality_scores), int(-10. * np.log10(self.average_error) + 0.5)]) self.rng = rng def get_sequencing_errors(self, @@ -498,7 +494,13 @@ def get_quality_scores(self, for i in quality_index_map: score = self.rng.normal(self.quality_score_probabilities[i][0], scale=self.quality_score_probabilities[i][1]) - score = take_closest(self.quality_scores, score) + # make sure score is in range and an int + score = round(score) + if score > 42: + score = 42 + if score < 1: + score = 1 + temp_qual_array.append(score) if self.rescale_qualities: @@ -509,9 +511,9 @@ def get_quality_scores(self, self.quality_score_error_rate[n]) + 0.5)]) for n in temp_qual_array] # Now rebin the quality scores. - temp_qual_array = np.array(bin_scores(self.quality_scores, rescaled_quals)) + temp_qual_array = np.array(rescaled_quals) else: - temp_qual_array = np.array(bin_scores(self.quality_scores, temp_qual_array)) + temp_qual_array = np.array(temp_qual_array) return temp_qual_array[:input_read_length] diff --git a/neat/models/utils.py b/neat/models/utils.py deleted file mode 100644 index 049ca98..0000000 --- a/neat/models/utils.py +++ /dev/null @@ -1,75 +0,0 @@ -""" -Utilities for the sequencing error model -""" -import numpy as np - -from bisect import bisect_left - - -__all__ = [ - "bin_scores", - "take_closest" -] - -DATA_BINS = {} - - -def bin_scores(bins: list | np.ndarray, quality_array: list | np.ndarray, qual_offset: int = None): - """ - Assumes bins list is sorted. Returns the closest value to quality. - - If two numbers are equally close, return the smallest number. Note that in the case of quality scores - not being binned, the "bin" will end up just being the quality score. - - :param bins: the possible values of the quality scores for the simulation - :param quality_array: the quality array from data which we will bin. - :param qual_offset: the quality offset for these quality scores. Only needed if input is a str. - """ - # Convert to array, if string - if type(quality_array[0]) == str: - temp_array = [] - for char in quality_array: - temp_array.append(ord(char) - qual_offset) - quality_array = temp_array - - return_array = [] - for score in quality_array: - if score in DATA_BINS: - return_array.append(DATA_BINS[score]) - else: - pos = bisect_left(bins, score) - if pos == 0: - DATA_BINS[score] = bins[0] - return_array.append(bins[0]) - elif pos == len(bins): - DATA_BINS[score] = bins[-1] - return_array.append(bins[-1]) - else: - before = bins[pos - 1] - after = bins[pos] - if after - score < score - before: - DATA_BINS[score] = after - return_array.append(after) - else: - DATA_BINS[score] = before - return_array.append(before) - - return return_array - -def take_closest(bins, quality): - """ - Assumes bins is sorted. Returns the closest value to quality. - - If two numbers are equally close, return the smallest number. - """ - pos = bisect_left(bins, quality) - if pos == 0: - return bins[0] - if pos == len(bins): - return bins[-1] - before = bins[pos - 1] - after = bins[pos] - if after - quality < quality - before: - return after - else: - return before diff --git a/neat/read_simulator/utils/generate_reads.py b/neat/read_simulator/utils/generate_reads.py index f045d61..fc5282e 100644 --- a/neat/read_simulator/utils/generate_reads.py +++ b/neat/read_simulator/utils/generate_reads.py @@ -71,20 +71,15 @@ def cover_dataset( # trying to get enough variability to harden NEAT against edge cases. if loop_count % 10 == 0: fragment_model.rng.shuffle(fragment_pool) - # Breaking the gename into fragments + # Mapping random fragments onto genome + i = 0 while start < span_length: # We take the first element and put it back on the end to create an endless pool of fragments to draw from - fragment = fragment_pool.pop(0) + fragment = fragment_pool[i] + i = (i + 1) % len(fragment_pool) end = min(start + fragment, span_length) - # these are equivalent of reads we expect the machine to filter out, but we won't actually use it - if end - start < options.read_len: - # add some random flavor to try to keep it to falling into a loop - if fragment_model.rng.normal() < 0.5: - fragment_pool.insert(len(fragment_pool)//2, fragment) - else: - fragment_pool.insert(len(fragment_pool) - 3, fragment) - else: - fragment_pool.append(fragment) + # Ensure the read is long enough to form a read, else we will not use it. + if end - start > options.read_len: temp_fragments.append((start, end)) start = end From 9a84e12e777e4a2e9ddc866ea8ab447040845ea0 Mon Sep 17 00:00:00 2001 From: Joshua Allen Date: Wed, 21 Aug 2024 21:19:25 -0500 Subject: [PATCH 04/11] Couple small edits to vcf_func to improve time. Seems the contig variants container is eating up all my memory, trying to debug that --- neat/cli/commands/model_sequencing_error.py | 3 ++- neat/read_simulator/utils/vcf_func.py | 10 ++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/neat/cli/commands/model_sequencing_error.py b/neat/cli/commands/model_sequencing_error.py index 0114a6a..aa06831 100644 --- a/neat/cli/commands/model_sequencing_error.py +++ b/neat/cli/commands/model_sequencing_error.py @@ -49,7 +49,8 @@ def add_arguments(self, parser: argparse.ArgumentParser): nargs='+', required=False, default=42, - help="Quality score max. The default 42. The lowest possible score is 1") + help="Quality score max. The default 42. The lowest possible score is 1. To used binned" + "scoring, enter a space separated list of scores, e.g., -Q 2 15 23 37") parser.add_argument('-m', type=int, diff --git a/neat/read_simulator/utils/vcf_func.py b/neat/read_simulator/utils/vcf_func.py index 3d900a9..f6420da 100755 --- a/neat/read_simulator/utils/vcf_func.py +++ b/neat/read_simulator/utils/vcf_func.py @@ -148,18 +148,20 @@ def parse_input_vcf(input_dict: dict[str: ContigVariants], SAMPLE1 [9, optional] SAMPLE2 [10, optional, cancer only] """ - # First, let's check if the chromosome for this record is even in the reference: - in_ref = record[0] in reference + # First, let's check if the chromosome for this record is even in the reference. Since input_dict is + # constructed from the reference, the keys list is the same. + in_ref = record[0] in input_dict.keys() if not in_ref: _LOG.warning(f'Skipping variant because the chromosome is not in the reference:\n{line}') continue + reference_string = reference[record[0]][int(record[1]): int(record[1]) + len(record[3])].seq.upper() # We already accounted for shifting to 0-based coordinates, so this should work. - if record[3] != str(reference[record[0]][int(record[1]): int(record[1]) + len(record[3])].seq): + if record[3] != str(reference_string): mismatched += 1 _LOG.warning(f'Skipping variant because the ref field did not match the reference:' f'{record[0]}: {record[1]}, {record[3]} v ' - f'{reference[record[0]][int(record[1]): int(record[1]) + len(record[3])].seq}') + f'{reference_string}') continue # We'll need the genotype when we generate reads, and output the records, if applicable From 9892d5980b8d0323bd70957954841688a998c37c Mon Sep 17 00:00:00 2001 From: Joshua Allen Date: Thu, 22 Aug 2024 14:01:29 -0500 Subject: [PATCH 05/11] Normalizing deletion model --- neat/models/models.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/neat/models/models.py b/neat/models/models.py index be3b1bd..c93a096 100644 --- a/neat/models/models.py +++ b/neat/models/models.py @@ -90,7 +90,9 @@ class DeletionModel(VariantModel): def __init__(self, deletion_len_model: dict[int: float, ...], rng: Generator = None): - self.deletion_len_model = deletion_len_model + max_value = max(deletion_len_model.values()) + # normalize the values + self.deletion_len_model = {key: val/max_value for key, val in deletion_len_model.items()} self.rng = rng def get_deletion_length(self, size: int = None) -> int | list[int, ...]: From 37130f1d214dfcff15260a5d8c3075e20cee6087 Mon Sep 17 00:00:00 2001 From: Joshua Allen Date: Thu, 22 Aug 2024 14:04:50 -0500 Subject: [PATCH 06/11] Normalizing insertion model --- neat/models/models.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/neat/models/models.py b/neat/models/models.py index c93a096..08cf5b1 100644 --- a/neat/models/models.py +++ b/neat/models/models.py @@ -59,7 +59,9 @@ class InsertionModel(VariantModel): def __init__(self, insert_len_model: dict[int: float, ...], rng: Generator = None): - self.insert_len_model = insert_len_model + # normalize the values + tot = sum(insert_len_model.values()) + self.insertion_len_model = {key: val / tot for key, val in insert_len_model.items()} self.rng = rng def get_insertion_length(self, size: int = None) -> int | list[int, ...]: @@ -90,9 +92,9 @@ class DeletionModel(VariantModel): def __init__(self, deletion_len_model: dict[int: float, ...], rng: Generator = None): - max_value = max(deletion_len_model.values()) # normalize the values - self.deletion_len_model = {key: val/max_value for key, val in deletion_len_model.items()} + tot = sum(deletion_len_model.values()) + self.deletion_len_model = {key: val/tot for key, val in deletion_len_model.items()} self.rng = rng def get_deletion_length(self, size: int = None) -> int | list[int, ...]: From a9ba1ff565446c0b4eda246d0c5131e12b415521 Mon Sep 17 00:00:00 2001 From: Joshua Allen Date: Thu, 22 Aug 2024 16:13:17 -0500 Subject: [PATCH 07/11] Converting weights to probabilities --- neat/models/models.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/neat/models/models.py b/neat/models/models.py index 08cf5b1..5169d75 100644 --- a/neat/models/models.py +++ b/neat/models/models.py @@ -59,7 +59,7 @@ class InsertionModel(VariantModel): def __init__(self, insert_len_model: dict[int: float, ...], rng: Generator = None): - # normalize the values + # Creating probabilities from the weights tot = sum(insert_len_model.values()) self.insertion_len_model = {key: val / tot for key, val in insert_len_model.items()} self.rng = rng @@ -92,7 +92,7 @@ class DeletionModel(VariantModel): def __init__(self, deletion_len_model: dict[int: float, ...], rng: Generator = None): - # normalize the values + # Creating probabilities from the weights tot = sum(deletion_len_model.values()) self.deletion_len_model = {key: val/tot for key, val in deletion_len_model.items()} self.rng = rng @@ -284,6 +284,9 @@ def generate_snv(self, trinucleotide: Seq, reference_location: int) -> SingleNuc transition_matrix = self.trinuc_trans_matrices[DINUC_IND[trinucleotide[0] + "_" + trinucleotide[2]]] # then determine the trans probs based on the middle nucleotide transition_probs = transition_matrix[NUC_IND[trinucleotide[1]]] + # Creating probabilities from the weights + transition_sum = sum(transition_probs) + transition_probs = [x/transition_sum for x in transition_probs] # Now pick a random alternate, weighted by the probabilities alt = self.rng.choice(ALLOWED_NUCL, p=transition_probs) temp_snv = SingleNucleotideVariant(reference_location, alt=alt) From 96df2339c9f187a0ec5bea301dc624018d151251 Mon Sep 17 00:00:00 2001 From: Joshua Allen Date: Thu, 22 Aug 2024 20:56:05 -0500 Subject: [PATCH 08/11] Fixing an issue in gen_mut_model where it was exporting the wrong valuse. We needed the probabilities, not the counts --- neat/gen_mut_model/runner.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/neat/gen_mut_model/runner.py b/neat/gen_mut_model/runner.py index 2394c66..5165648 100644 --- a/neat/gen_mut_model/runner.py +++ b/neat/gen_mut_model/runner.py @@ -324,8 +324,8 @@ def runner(reference_index, transition_matrix=snp_transition_bias, trinuc_trans_matrices=trinuc_transition_bias, trinuc_mut_bias=trinuc_mutation_probs, - insert_len_model=insertion_counts, - deletion_len_model=deletion_counts + insert_len_model=insertion_freqency, + deletion_len_model=deletion_frequency ) print('\nSaving model...') From 5e7c81fe0d3d102ec0d209f1e14500f98f98cea9 Mon Sep 17 00:00:00 2001 From: Joshua Allen Date: Thu, 22 Aug 2024 21:07:33 -0500 Subject: [PATCH 09/11] found a slight environment issue --- environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index d18ba21..8caab8b 100644 --- a/environment.yml +++ b/environment.yml @@ -17,4 +17,4 @@ dependencies: - pip: - pysam - frozendict - - poetry>=1.3.* + - poetry=1.3.* From a0a5e13683bcd8af2f61ffea1a17e48ea92897ee Mon Sep 17 00:00:00 2001 From: Joshua Allen Date: Wed, 28 Aug 2024 08:22:32 -0500 Subject: [PATCH 10/11] Cleaning up test bugs --- environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index 8caab8b..d9ef1e8 100644 --- a/environment.yml +++ b/environment.yml @@ -17,4 +17,4 @@ dependencies: - pip: - pysam - frozendict - - poetry=1.3.* + - poetry==1.3.* From 59b9bdb8f9efb58a0aff0b57864aa2137f2ac1a9 Mon Sep 17 00:00:00 2001 From: Joshua Allen Date: Wed, 28 Aug 2024 08:25:27 -0500 Subject: [PATCH 11/11] accidentally changed a variable name --- neat/models/models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/neat/models/models.py b/neat/models/models.py index 5169d75..83de744 100644 --- a/neat/models/models.py +++ b/neat/models/models.py @@ -72,8 +72,8 @@ def get_insertion_length(self, size: int = None) -> int | list[int, ...]: Greater than 1 returns a list of ints. :return: int or list of ints. """ - return self.rng.choice(a=list(self.insert_len_model), - p=[*self.insert_len_model.values()], + return self.rng.choice(a=list(self.insertion_len_model), + p=[*self.insertion_len_model.values()], size=size, shuffle=False)