From 3797ca9fb3de1d2faea1774d68cc41138a706c3b Mon Sep 17 00:00:00 2001
From: Joshua Allen <jallen17@illinois.edu>
Date: Sat, 13 Jul 2024 14:55:54 -0500
Subject: [PATCH 01/11] Fixing how neat was counting input variants

---
 neat/read_simulator/utils/vcf_func.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/neat/read_simulator/utils/vcf_func.py b/neat/read_simulator/utils/vcf_func.py
index a2c0bb2..3d900a9 100755
--- a/neat/read_simulator/utils/vcf_func.py
+++ b/neat/read_simulator/utils/vcf_func.py
@@ -73,6 +73,7 @@ def parse_input_vcf(input_dict: dict[str: ContigVariants],
 
     n_skipped = 0
     mismatched = 0
+    records_found = 0
     # maximum number of columns we are interested in. Used for trimming unwanted samples.
     max_col = 7
     with open_input(vcf_path) as f:
@@ -276,6 +277,9 @@ def parse_input_vcf(input_dict: dict[str: ContigVariants],
                     if rc == 1:
                         _LOG.warning(f"Input variant skipped because a variant already existed at that location:"
                                      f"{chrom}: {location} ({temp_variant})")
+                        n_skipped += 1
+                    else:
+                        records_found += 1
 
                     if tumor_normal:
                         """
@@ -283,7 +287,8 @@ def parse_input_vcf(input_dict: dict[str: ContigVariants],
                         """
                         pass
 
-    _LOG.info(f'Found {len(input_dict)} variants in input VCF.')
+    _LOG.info(f'Found {records_found} variants in input VCF.')
     _LOG.info(f'Skipped {n_skipped} variants because of multiples at the same location')
+    _LOG.info(f'Skipped {mismatched} variants because of a mismatch between Ref and reference.')
 
     return sample_columns

From 5d23806c3c1989a729daba2cb7a41918f0cc9f98 Mon Sep 17 00:00:00 2001
From: Joshua Allen <jallen17@illinois.edu>
Date: Sat, 13 Jul 2024 15:53:30 -0500
Subject: [PATCH 02/11] Updating gen_mut_model to output and read proper
 trinucleotide count files

---
 neat/gen_mut_model/runner.py | 76 ++++++++++++------------------------
 neat/gen_mut_model/utils.py  | 18 +++++----
 2 files changed, 36 insertions(+), 58 deletions(-)

diff --git a/neat/gen_mut_model/runner.py b/neat/gen_mut_model/runner.py
index 5fb1b22..2394c66 100644
--- a/neat/gen_mut_model/runner.py
+++ b/neat/gen_mut_model/runner.py
@@ -85,18 +85,23 @@ def runner(reference_index,
 
     # Pre-parsing to find all the matching chromosomes between ref and vcf
     _LOG.info('Processing VCF file...')
-    matching_variants, matching_chromosomes = read_and_filter_variants(vcf_to_process,
-                                                                       reference_index,
-                                                                       ignore)
+    matching_variants, matching_chromosomes = read_and_filter_variants(
+        vcf_to_process,
+        reference_index,
+        ignore
+    )
 
     if not matching_variants or not matching_chromosomes:
         _LOG.error("No valid variants detected. Check names in vcf versus reference and/or bed.")
         sys.exit(1)
 
-    trinuc_ref_count, bed_track_length = count_trinucleotides(reference_index,
-                                                              bed,
-                                                              outcounts_file,
-                                                              matching_chromosomes)
+    trinuc_ref_count, bed_track_length = count_trinucleotides(
+        reference_index,
+        bed,
+        outcounts_file,
+        matching_chromosomes,
+        save_trinuc
+    )
 
     if not trinuc_ref_count:
         _LOG.error("No valid trinucleotides detected in reference.")
@@ -296,47 +301,6 @@ def runner(reference_index,
         for k in sorted(snp_trans_freq):
             _LOG.info(f'p({k[0]} --> {k[1]} | SNP occurs) = {snp_trans_freq[k]}')
 
-    # Save counts, if requested
-    if save_trinuc:
-        trinuc_output_data = {
-            'p(snp)': average_snp_freq,
-            'p(insertion)': average_insertion_frequency,
-            'p(deletion)': average_deletion_frequency,
-            'overall average mut rate': avg_mut_rate,
-            'total variants processed': total_var,
-            'homozygous probability': homozygous_frequency
-        }
-
-        for k in sorted(trinuc_mut_prob):
-            trinuc_output_data[f'p({k} mutates)'] = trinuc_mut_prob[k]
-
-        for k in sorted(trinuc_trans_probs):
-            trinuc_output_data[f'p({k[0]} --> {k[1]} | {k[0]} mutates)'] = trinuc_trans_probs[k]
-
-        for k in sorted(deletion_counts):
-            trinuc_output_data[f'p(del length = {abs(k)} | deletion occurs)'] = deletion_frequency[k]
-
-        for k in sorted(insertion_counts):
-            trinuc_output_data[f'p(insert length = {abs(k)} | insertion occurs)'] = insertion_freqency[k]
-
-        for k in sorted(snp_trans_freq):
-            trinuc_output_data[f'p({k[0]} --> {k[1]} | SNP occurs)'] = snp_trans_freq[k]
-
-        trinuc_output_path = Path(output)
-        trinuc_output_path = trinuc_output_path.with_suffix('')
-        trinuc_output_path = trinuc_output_path.with_suffix('.trinuc.pickle.gz')
-        _LOG.info(f'Saving trinucleotide counts to: {trinuc_output_path}')
-
-        with open_output(trinuc_output_path, 'w+') as trinuc_outfile:
-
-            pickle.dump(trinuc_output_data, trinuc_outfile)
-
-            # Human-readable content of file below
-            trinuc_outfile.write('\n')
-
-            for key, value in trinuc_output_data.items():
-                trinuc_outfile.write(f'{key}: {value}\n')
-
     _LOG.info(f'p(snp)   = {average_snp_freq}')
     _LOG.info(f'p(insertion) = {average_insertion_frequency}')
     _LOG.info(f'p(deletion) = {average_deletion_frequency}')
@@ -405,6 +369,8 @@ def compute_mut_runner(reference,
     if outcounts:
         validate_input_path(outcounts)
         outcounts = Path(outcounts)
+    elif save_trinuc:
+        outcounts = Path(output + '.trinuc.pickle.gz')
 
     print('Processing reference...')
     reference_index = SeqIO.index(reference, 'fasta')
@@ -477,8 +443,18 @@ def compute_mut_runner(reference,
     output = Path(output + '.pickle.gz')
     validate_output_path(output, overwrite=overwrite_output)
 
-    runner(reference_index, vcf_to_process, vcf_columns, outcounts, show_trinuc, save_trinuc,
-           output, bed, human_sample, skip_common)
+    runner(
+        reference_index,
+        vcf_to_process,
+        vcf_columns,
+        outcounts,
+        show_trinuc,
+        save_trinuc,
+        output,
+        bed,
+        human_sample,
+        skip_common
+    )
 
     if os.path.exists('temp.vcf'):
         os.remove('temp.vcf')
diff --git a/neat/gen_mut_model/utils.py b/neat/gen_mut_model/utils.py
index 78356d8..3680753 100644
--- a/neat/gen_mut_model/utils.py
+++ b/neat/gen_mut_model/utils.py
@@ -4,6 +4,8 @@
 
 import json
 import sys
+import pickle
+import gzip
 
 import numpy as np
 
@@ -165,7 +167,8 @@ def cluster_list(list_to_cluster, delta):
 def count_trinucleotides(reference_index,
                          bed,
                          trinuc_counts,
-                         matching_chroms):
+                         matching_chroms,
+                         save_trinuc_file):
     """
     Counts the frequency of the various trinucleotide combinations in the dataset
 
@@ -173,6 +176,7 @@ def count_trinucleotides(reference_index,
     :param Path bed: Full path to bed file, if using
     :param Path trinuc_counts: Full path to existing trinculeotide counts file, if input, or the output path
     :param list matching_chroms: List of matching chromosomes
+    :param save_trinuc_file: Boolean determines whether to save trinucleotide counts file.
     :return dict, int: A dictionary of trinculeotides and counts, the number of bases spanned
     """
     # Data format: TRINUC: COUNT (e.g., "AAA": 12), where COUNT is the number of times trinuc was observed
@@ -183,11 +187,11 @@ def count_trinucleotides(reference_index,
     track_len = 0
 
     trinuc_counts_exists = False
-    save_trinuc_file = False
     if trinuc_counts:
         if trinuc_counts.is_file():
+            _LOG.info("Trinucleotide counts file exists, skipping save to avoid overwriting data.")
             trinuc_counts_exists = True
-        save_trinuc_file = True
+            save_trinuc_file = False
 
     if bed:
         _LOG.info("Counting trinucleotide combinations in bed regions")
@@ -207,7 +211,6 @@ def count_trinucleotides(reference_index,
                         trinuc_ref_count = count_trinuc(trinuc, trinuc_ref_count)
 
     # Solution to attribute error (needs to be checked)
-    # TODO remove ref_name from this dict
     elif not trinuc_counts_exists:
         _LOG.info("Counting trinucleotide combinations in reference.")
         for ref_name in matching_chroms:
@@ -220,13 +223,12 @@ def count_trinucleotides(reference_index,
             with open_output(trinuc_counts, 'w') as countfile:
                 _LOG.info('Saving trinuc counts to file...')
                 # Convert all values to writable formats
-                trinuc_ref_count = {str(x): int(y) for x, y in trinuc_ref_count.items()}
-                countfile.write(json.dumps(trinuc_ref_count))
+                pickle.dump(trinuc_ref_count, countfile)
 
     else:
         _LOG.info(f'Loading file: {trinuc_counts}.')
-        with open_input(trinuc_counts) as counts:
-            trinuc_ref_count = json.load(counts)
+        with gzip.open(trinuc_counts, 'rb') as counts:
+            trinuc_ref_count = pickle.load(counts)
         if save_trinuc_file:
             _LOG.warning('Existing trinucelotide file will not be changed.')
 

From 0793f4dd0e75f56dc55a455a72b25f92a97166fd Mon Sep 17 00:00:00 2001
From: Joshua Allen <jallen17@illinois.edu>
Date: Mon, 15 Jul 2024 18:03:30 -0500
Subject: [PATCH 03/11] Removed binned scoring

---
 neat/cli/commands/model_sequencing_error.py   |  8 +-
 neat/model_sequencing_error/runner.py         | 15 ++--
 neat/model_sequencing_error/utils.py          |  6 +-
 neat/models/__init__.py                       |  1 -
 neat/models/default_sequencing_error_model.py |  4 +-
 neat/models/models.py                         | 20 ++---
 neat/models/utils.py                          | 75 -------------------
 neat/read_simulator/utils/generate_reads.py   | 17 ++---
 8 files changed, 32 insertions(+), 114 deletions(-)
 delete mode 100644 neat/models/utils.py

diff --git a/neat/cli/commands/model_sequencing_error.py b/neat/cli/commands/model_sequencing_error.py
index 1c3ef2a..0114a6a 100644
--- a/neat/cli/commands/model_sequencing_error.py
+++ b/neat/cli/commands/model_sequencing_error.py
@@ -48,12 +48,8 @@ def add_arguments(self, parser: argparse.ArgumentParser):
                             dest="quality_scores",
                             nargs='+',
                             required=False,
-                            default=[2, 11, 25, 37],
-                            help="Quality score bins. Enter as a space separeted list for binned scores "
-                                 "(-Q 2 11 24 37), or enter a single maximum score for a full range (-Q 42 gives all"
-                                 "scores from 1-42). The default is binned quality scores: [2, 11, 24, 37]. Note that"
-                                 "using quality score bins on an unbinned fastq will result in a binned model, at the"
-                                 "cost of some inaccuracy.")
+                            default=42,
+                            help="Quality score max. The default 42. The lowest possible score is 1")
 
         parser.add_argument('-m',
                             type=int,
diff --git a/neat/model_sequencing_error/runner.py b/neat/model_sequencing_error/runner.py
index 0326f0f..b12ac24 100644
--- a/neat/model_sequencing_error/runner.py
+++ b/neat/model_sequencing_error/runner.py
@@ -62,9 +62,11 @@ def model_seq_err_runner(
     _LOG.debug(f"Quality offset: {offset}")
 
     final_quality_scores: list
+    binned_scores = False
     if len(qual_scores) == 1:
         final_quality_scores = list(range(1, qual_scores[0] + 1))
     else:
+        binned_scores = True
         final_quality_scores = sorted(qual_scores)
 
     _LOG.debug(f'Quality scores: {final_quality_scores}')
@@ -109,11 +111,14 @@ def model_seq_err_runner(
     for file in files:
         file_num += 1
         _LOG.info(f'Reading file {file_num} of {len(files)}')
-        parameters_by_position, file_avg_error, read_length = parse_file(file,
-                                                                         final_quality_scores,
-                                                                         num_records_to_process,
-                                                                         offset,
-                                                                         read_length)
+        parameters_by_position, file_avg_error, read_length = parse_file(
+            file,
+            final_quality_scores,
+            num_records_to_process,
+            offset,
+            read_length,
+            binned_scores
+        )
 
         read_parameters.append(parameters_by_position)
         average_errors.append(file_avg_error)
diff --git a/neat/model_sequencing_error/utils.py b/neat/model_sequencing_error/utils.py
index b494d4d..2d99853 100644
--- a/neat/model_sequencing_error/utils.py
+++ b/neat/model_sequencing_error/utils.py
@@ -10,7 +10,6 @@
 
 from scipy.stats import mode
 from ..common import open_input
-from ..models import take_closest
 
 __all__ = [
     "parse_file"
@@ -139,10 +138,7 @@ def parse_file(input_file: str, quality_scores: list, max_reads: int, qual_offse
 
             for j in range(read_length):
                 # The qualities of each read_position_scores
-                quality_bin = take_closest(quality_scores, qualities_to_check[j])
-                bin_index = quality_scores.index(quality_bin)
-                temp_q_count[j][bin_index] += 1
-                qual_score_counter[quality_bin] += 1
+                qual_score_counter[qualities_to_check[j]] += 1
 
             if records_read % quarters == 0:
                 _LOG.info(f'reading data: {(records_read / max_reads) * 100:.0f}%')
diff --git a/neat/models/__init__.py b/neat/models/__init__.py
index f4c7f0f..aed4fa3 100644
--- a/neat/models/__init__.py
+++ b/neat/models/__init__.py
@@ -1,2 +1 @@
 from .models import *
-from .utils import *
diff --git a/neat/models/default_sequencing_error_model.py b/neat/models/default_sequencing_error_model.py
index 9924edd..0883718 100644
--- a/neat/models/default_sequencing_error_model.py
+++ b/neat/models/default_sequencing_error_model.py
@@ -17,8 +17,8 @@
      [0.2505, 0.2552, 0.4943, 0.0]]
 )
 
-# This list may not be the final list
-default_quality_scores = np.array([2, 11, 25, 37])
+# All numbers from 1-42
+default_quality_scores = np.arange(1, 43)
 
 # This puts a high probability toward getting a maximum quality score. The current values
 # should be considered temporary. We're working on final values.
diff --git a/neat/models/models.py b/neat/models/models.py
index 5f19a80..be3b1bd 100644
--- a/neat/models/models.py
+++ b/neat/models/models.py
@@ -18,7 +18,6 @@
 from ..common import TRINUC_IND, ALLOWED_NUCL, NUC_IND, DINUC_IND
 from .default_mutation_model import *
 from .default_sequencing_error_model import *
-from .utils import bin_scores, take_closest
 
 __all__ = [
     "MutationModel",
@@ -376,11 +375,8 @@ def __init__(self,
         self.insertion_model = insertion_model
         self.uniform_quality_score = None
         if self.is_uniform:
-            # bin scores returns a list, so we need the first (only) element of the list
-            converted_avg_err = bin_scores(self.quality_scores,
-                                           [int(-10. * np.log10(self.average_error))])[0]
-            # Set score to the lowest of the max of the quality scores and the bin closest to the input avg error.
-            self.uniform_quality_score = min([max(self.quality_scores), converted_avg_err])
+            # Set score to the lowest of the max of the quality scores and the input avg error.
+            self.uniform_quality_score = min([max(self.quality_scores), int(-10. * np.log10(self.average_error) + 0.5)])
         self.rng = rng
 
     def get_sequencing_errors(self,
@@ -498,7 +494,13 @@ def get_quality_scores(self,
             for i in quality_index_map:
                 score = self.rng.normal(self.quality_score_probabilities[i][0],
                                         scale=self.quality_score_probabilities[i][1])
-                score = take_closest(self.quality_scores, score)
+                # make sure score is in range and an int
+                score = round(score)
+                if score > 42:
+                    score = 42
+                if score < 1:
+                    score = 1
+
                 temp_qual_array.append(score)
 
         if self.rescale_qualities:
@@ -509,9 +511,9 @@ def get_quality_scores(self,
                                                           self.quality_score_error_rate[n]) + 0.5)])
                               for n in temp_qual_array]
             # Now rebin the quality scores.
-            temp_qual_array = np.array(bin_scores(self.quality_scores, rescaled_quals))
+            temp_qual_array = np.array(rescaled_quals)
         else:
-            temp_qual_array = np.array(bin_scores(self.quality_scores, temp_qual_array))
+            temp_qual_array = np.array(temp_qual_array)
 
         return temp_qual_array[:input_read_length]
 
diff --git a/neat/models/utils.py b/neat/models/utils.py
deleted file mode 100644
index 049ca98..0000000
--- a/neat/models/utils.py
+++ /dev/null
@@ -1,75 +0,0 @@
-"""
-Utilities for the sequencing error model
-"""
-import numpy as np
-
-from bisect import bisect_left
-
-
-__all__ = [
-    "bin_scores",
-    "take_closest"
-]
-
-DATA_BINS = {}
-
-
-def bin_scores(bins: list | np.ndarray, quality_array: list | np.ndarray, qual_offset: int = None):
-    """
-    Assumes bins list is sorted. Returns the closest value to quality.
-
-    If two numbers are equally close, return the smallest number. Note that in the case of quality scores
-    not being binned, the "bin" will end up just being the quality score.
-
-    :param bins: the possible values of the quality scores for the simulation
-    :param quality_array: the quality array from data which we will bin.
-    :param qual_offset: the quality offset for these quality scores. Only needed if input is a str.
-    """
-    # Convert to array, if string
-    if type(quality_array[0]) == str:
-        temp_array = []
-        for char in quality_array:
-            temp_array.append(ord(char) - qual_offset)
-        quality_array = temp_array
-
-    return_array = []
-    for score in quality_array:
-        if score in DATA_BINS:
-            return_array.append(DATA_BINS[score])
-        else:
-            pos = bisect_left(bins, score)
-            if pos == 0:
-                DATA_BINS[score] = bins[0]
-                return_array.append(bins[0])
-            elif pos == len(bins):
-                DATA_BINS[score] = bins[-1]
-                return_array.append(bins[-1])
-            else:
-                before = bins[pos - 1]
-                after = bins[pos]
-                if after - score < score - before:
-                    DATA_BINS[score] = after
-                    return_array.append(after)
-                else:
-                    DATA_BINS[score] = before
-                    return_array.append(before)
-
-    return return_array
-
-def take_closest(bins, quality):
-    """
-    Assumes bins is sorted. Returns the closest value to quality.
-
-    If two numbers are equally close, return the smallest number.
-    """
-    pos = bisect_left(bins, quality)
-    if pos == 0:
-        return bins[0]
-    if pos == len(bins):
-        return bins[-1]
-    before = bins[pos - 1]
-    after = bins[pos]
-    if after - quality < quality - before:
-        return after
-    else:
-        return before
diff --git a/neat/read_simulator/utils/generate_reads.py b/neat/read_simulator/utils/generate_reads.py
index f045d61..fc5282e 100644
--- a/neat/read_simulator/utils/generate_reads.py
+++ b/neat/read_simulator/utils/generate_reads.py
@@ -71,20 +71,15 @@ def cover_dataset(
         # trying to get enough variability to harden NEAT against edge cases.
         if loop_count % 10 == 0:
             fragment_model.rng.shuffle(fragment_pool)
-        # Breaking the gename into fragments
+        # Mapping random fragments onto genome
+        i = 0
         while start < span_length:
             # We take the first element and put it back on the end to create an endless pool of fragments to draw from
-            fragment = fragment_pool.pop(0)
+            fragment = fragment_pool[i]
+            i = (i + 1) % len(fragment_pool)
             end = min(start + fragment, span_length)
-            # these are equivalent of reads we expect the machine to filter out, but we won't actually use it
-            if end - start < options.read_len:
-                # add some random flavor to try to keep it to falling into a loop
-                if fragment_model.rng.normal() < 0.5:
-                    fragment_pool.insert(len(fragment_pool)//2, fragment)
-                else:
-                    fragment_pool.insert(len(fragment_pool) - 3, fragment)
-            else:
-                fragment_pool.append(fragment)
+            # Ensure the read is long enough to form a read, else we will not use it.
+            if end - start > options.read_len:
                 temp_fragments.append((start, end))
             start = end
 

From 9a84e12e777e4a2e9ddc866ea8ab447040845ea0 Mon Sep 17 00:00:00 2001
From: Joshua Allen <jallen17@illinois.edu>
Date: Wed, 21 Aug 2024 21:19:25 -0500
Subject: [PATCH 04/11] Couple small edits to vcf_func to improve time. Seems
 the contig variants container is eating up all my memory, trying to debug
 that

---
 neat/cli/commands/model_sequencing_error.py |  3 ++-
 neat/read_simulator/utils/vcf_func.py       | 10 ++++++----
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/neat/cli/commands/model_sequencing_error.py b/neat/cli/commands/model_sequencing_error.py
index 0114a6a..aa06831 100644
--- a/neat/cli/commands/model_sequencing_error.py
+++ b/neat/cli/commands/model_sequencing_error.py
@@ -49,7 +49,8 @@ def add_arguments(self, parser: argparse.ArgumentParser):
                             nargs='+',
                             required=False,
                             default=42,
-                            help="Quality score max. The default 42. The lowest possible score is 1")
+                            help="Quality score max. The default 42. The lowest possible score is 1. To used binned"
+                                 "scoring, enter a space separated list of scores, e.g., -Q 2 15 23 37")
 
         parser.add_argument('-m',
                             type=int,
diff --git a/neat/read_simulator/utils/vcf_func.py b/neat/read_simulator/utils/vcf_func.py
index 3d900a9..f6420da 100755
--- a/neat/read_simulator/utils/vcf_func.py
+++ b/neat/read_simulator/utils/vcf_func.py
@@ -148,18 +148,20 @@ def parse_input_vcf(input_dict: dict[str: ContigVariants],
                     SAMPLE1 [9, optional]
                     SAMPLE2 [10, optional, cancer only]
                 """
-                # First, let's check if the chromosome for this record is even in the reference:
-                in_ref = record[0] in reference
+                # First, let's check if the chromosome for this record is even in the reference. Since input_dict is
+                # constructed from the reference, the keys list is the same.
+                in_ref = record[0] in input_dict.keys()
                 if not in_ref:
                     _LOG.warning(f'Skipping variant because the chromosome is not in the reference:\n{line}')
                     continue
 
+                reference_string = reference[record[0]][int(record[1]): int(record[1]) + len(record[3])].seq.upper()
                 # We already accounted for shifting to 0-based coordinates, so this should work.
-                if record[3] != str(reference[record[0]][int(record[1]): int(record[1]) + len(record[3])].seq):
+                if record[3] != str(reference_string):
                     mismatched += 1
                     _LOG.warning(f'Skipping variant because the ref field did not match the reference:'
                                  f'{record[0]}: {record[1]}, {record[3]} v '
-                                 f'{reference[record[0]][int(record[1]): int(record[1]) + len(record[3])].seq}')
+                                 f'{reference_string}')
                     continue
 
                 # We'll need the genotype when we generate reads, and output the records, if applicable

From 9892d5980b8d0323bd70957954841688a998c37c Mon Sep 17 00:00:00 2001
From: Joshua Allen <jallen17@illinois.edu>
Date: Thu, 22 Aug 2024 14:01:29 -0500
Subject: [PATCH 05/11] Normalizing deletion model

---
 neat/models/models.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/neat/models/models.py b/neat/models/models.py
index be3b1bd..c93a096 100644
--- a/neat/models/models.py
+++ b/neat/models/models.py
@@ -90,7 +90,9 @@ class DeletionModel(VariantModel):
     def __init__(self,
                  deletion_len_model: dict[int: float, ...],
                  rng: Generator = None):
-        self.deletion_len_model = deletion_len_model
+        max_value = max(deletion_len_model.values())
+        # normalize the values
+        self.deletion_len_model = {key: val/max_value for key, val in deletion_len_model.items()}
         self.rng = rng
 
     def get_deletion_length(self, size: int = None) -> int | list[int, ...]:

From 37130f1d214dfcff15260a5d8c3075e20cee6087 Mon Sep 17 00:00:00 2001
From: Joshua Allen <jallen17@illinois.edu>
Date: Thu, 22 Aug 2024 14:04:50 -0500
Subject: [PATCH 06/11] Normalizing insertion model

---
 neat/models/models.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/neat/models/models.py b/neat/models/models.py
index c93a096..08cf5b1 100644
--- a/neat/models/models.py
+++ b/neat/models/models.py
@@ -59,7 +59,9 @@ class InsertionModel(VariantModel):
     def __init__(self,
                  insert_len_model: dict[int: float, ...],
                  rng: Generator = None):
-        self.insert_len_model = insert_len_model
+        # normalize the values
+        tot = sum(insert_len_model.values())
+        self.insertion_len_model = {key: val / tot for key, val in insert_len_model.items()}
         self.rng = rng
 
     def get_insertion_length(self, size: int = None) -> int | list[int, ...]:
@@ -90,9 +92,9 @@ class DeletionModel(VariantModel):
     def __init__(self,
                  deletion_len_model: dict[int: float, ...],
                  rng: Generator = None):
-        max_value = max(deletion_len_model.values())
         # normalize the values
-        self.deletion_len_model = {key: val/max_value for key, val in deletion_len_model.items()}
+        tot = sum(deletion_len_model.values())
+        self.deletion_len_model = {key: val/tot for key, val in deletion_len_model.items()}
         self.rng = rng
 
     def get_deletion_length(self, size: int = None) -> int | list[int, ...]:

From a9ba1ff565446c0b4eda246d0c5131e12b415521 Mon Sep 17 00:00:00 2001
From: Joshua Allen <jallen17@illinois.edu>
Date: Thu, 22 Aug 2024 16:13:17 -0500
Subject: [PATCH 07/11] Converting weights to probabilities

---
 neat/models/models.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/neat/models/models.py b/neat/models/models.py
index 08cf5b1..5169d75 100644
--- a/neat/models/models.py
+++ b/neat/models/models.py
@@ -59,7 +59,7 @@ class InsertionModel(VariantModel):
     def __init__(self,
                  insert_len_model: dict[int: float, ...],
                  rng: Generator = None):
-        # normalize the values
+        # Creating probabilities from the weights
         tot = sum(insert_len_model.values())
         self.insertion_len_model = {key: val / tot for key, val in insert_len_model.items()}
         self.rng = rng
@@ -92,7 +92,7 @@ class DeletionModel(VariantModel):
     def __init__(self,
                  deletion_len_model: dict[int: float, ...],
                  rng: Generator = None):
-        # normalize the values
+        # Creating probabilities from the weights
         tot = sum(deletion_len_model.values())
         self.deletion_len_model = {key: val/tot for key, val in deletion_len_model.items()}
         self.rng = rng
@@ -284,6 +284,9 @@ def generate_snv(self, trinucleotide: Seq, reference_location: int) -> SingleNuc
         transition_matrix = self.trinuc_trans_matrices[DINUC_IND[trinucleotide[0] + "_" + trinucleotide[2]]]
         # then determine the trans probs based on the middle nucleotide
         transition_probs = transition_matrix[NUC_IND[trinucleotide[1]]]
+        # Creating probabilities from the weights
+        transition_sum = sum(transition_probs)
+        transition_probs = [x/transition_sum for x in transition_probs]
         # Now pick a random alternate, weighted by the probabilities
         alt = self.rng.choice(ALLOWED_NUCL, p=transition_probs)
         temp_snv = SingleNucleotideVariant(reference_location, alt=alt)

From 96df2339c9f187a0ec5bea301dc624018d151251 Mon Sep 17 00:00:00 2001
From: Joshua Allen <jallen17@illinois.edu>
Date: Thu, 22 Aug 2024 20:56:05 -0500
Subject: [PATCH 08/11] Fixing an issue in gen_mut_model where it was exporting
 the wrong valuse. We needed the probabilities, not the counts

---
 neat/gen_mut_model/runner.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/neat/gen_mut_model/runner.py b/neat/gen_mut_model/runner.py
index 2394c66..5165648 100644
--- a/neat/gen_mut_model/runner.py
+++ b/neat/gen_mut_model/runner.py
@@ -324,8 +324,8 @@ def runner(reference_index,
         transition_matrix=snp_transition_bias,
         trinuc_trans_matrices=trinuc_transition_bias,
         trinuc_mut_bias=trinuc_mutation_probs,
-        insert_len_model=insertion_counts,
-        deletion_len_model=deletion_counts
+        insert_len_model=insertion_freqency,
+        deletion_len_model=deletion_frequency
     )
 
     print('\nSaving model...')

From 5e7c81fe0d3d102ec0d209f1e14500f98f98cea9 Mon Sep 17 00:00:00 2001
From: Joshua Allen <jallen17@illinois.edu>
Date: Thu, 22 Aug 2024 21:07:33 -0500
Subject: [PATCH 09/11] found a slight environment issue

---
 environment.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/environment.yml b/environment.yml
index d18ba21..8caab8b 100644
--- a/environment.yml
+++ b/environment.yml
@@ -17,4 +17,4 @@ dependencies:
   - pip:
     - pysam
     - frozendict
-    - poetry>=1.3.*
+    - poetry=1.3.*

From a0a5e13683bcd8af2f61ffea1a17e48ea92897ee Mon Sep 17 00:00:00 2001
From: Joshua Allen <jallen17@illinois.edu>
Date: Wed, 28 Aug 2024 08:22:32 -0500
Subject: [PATCH 10/11] Cleaning up test bugs

---
 environment.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/environment.yml b/environment.yml
index 8caab8b..d9ef1e8 100644
--- a/environment.yml
+++ b/environment.yml
@@ -17,4 +17,4 @@ dependencies:
   - pip:
     - pysam
     - frozendict
-    - poetry=1.3.*
+    - poetry==1.3.*

From 59b9bdb8f9efb58a0aff0b57864aa2137f2ac1a9 Mon Sep 17 00:00:00 2001
From: Joshua Allen <jallen17@illinois.edu>
Date: Wed, 28 Aug 2024 08:25:27 -0500
Subject: [PATCH 11/11] accidentally changed a variable name

---
 neat/models/models.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/neat/models/models.py b/neat/models/models.py
index 5169d75..83de744 100644
--- a/neat/models/models.py
+++ b/neat/models/models.py
@@ -72,8 +72,8 @@ def get_insertion_length(self, size: int = None) -> int | list[int, ...]:
                      Greater than 1 returns a list of ints.
         :return: int or list of ints.
         """
-        return self.rng.choice(a=list(self.insert_len_model),
-                               p=[*self.insert_len_model.values()],
+        return self.rng.choice(a=list(self.insertion_len_model),
+                               p=[*self.insertion_len_model.values()],
                                size=size, shuffle=False)