Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Develop #124

Merged
merged 12 commits into from
Sep 18, 2024
2 changes: 1 addition & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,4 @@ dependencies:
- pip:
- pysam
- frozendict
- poetry>=1.3.*
- poetry==1.3.*
9 changes: 3 additions & 6 deletions neat/cli/commands/model_sequencing_error.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,12 +48,9 @@ def add_arguments(self, parser: argparse.ArgumentParser):
dest="quality_scores",
nargs='+',
required=False,
default=[2, 11, 25, 37],
help="Quality score bins. Enter as a space separeted list for binned scores "
"(-Q 2 11 24 37), or enter a single maximum score for a full range (-Q 42 gives all"
"scores from 1-42). The default is binned quality scores: [2, 11, 24, 37]. Note that"
"using quality score bins on an unbinned fastq will result in a binned model, at the"
"cost of some inaccuracy.")
default=42,
help="Quality score max. The default is 42. The lowest possible score is 1. To use binned "
"scoring, enter a space separated list of scores, e.g., -Q 2 15 23 37")

parser.add_argument('-m',
type=int,
Expand Down
80 changes: 28 additions & 52 deletions neat/gen_mut_model/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,18 +85,23 @@ def runner(reference_index,

# Pre-parsing to find all the matching chromosomes between ref and vcf
_LOG.info('Processing VCF file...')
matching_variants, matching_chromosomes = read_and_filter_variants(vcf_to_process,
reference_index,
ignore)
matching_variants, matching_chromosomes = read_and_filter_variants(
vcf_to_process,
reference_index,
ignore
)

if not matching_variants or not matching_chromosomes:
_LOG.error("No valid variants detected. Check names in vcf versus reference and/or bed.")
sys.exit(1)

trinuc_ref_count, bed_track_length = count_trinucleotides(reference_index,
bed,
outcounts_file,
matching_chromosomes)
trinuc_ref_count, bed_track_length = count_trinucleotides(
reference_index,
bed,
outcounts_file,
matching_chromosomes,
save_trinuc
)

if not trinuc_ref_count:
_LOG.error("No valid trinucleotides detected in reference.")
Expand Down Expand Up @@ -296,47 +301,6 @@ def runner(reference_index,
for k in sorted(snp_trans_freq):
_LOG.info(f'p({k[0]} --> {k[1]} | SNP occurs) = {snp_trans_freq[k]}')

# Save counts, if requested
if save_trinuc:
trinuc_output_data = {
'p(snp)': average_snp_freq,
'p(insertion)': average_insertion_frequency,
'p(deletion)': average_deletion_frequency,
'overall average mut rate': avg_mut_rate,
'total variants processed': total_var,
'homozygous probability': homozygous_frequency
}

for k in sorted(trinuc_mut_prob):
trinuc_output_data[f'p({k} mutates)'] = trinuc_mut_prob[k]

for k in sorted(trinuc_trans_probs):
trinuc_output_data[f'p({k[0]} --> {k[1]} | {k[0]} mutates)'] = trinuc_trans_probs[k]

for k in sorted(deletion_counts):
trinuc_output_data[f'p(del length = {abs(k)} | deletion occurs)'] = deletion_frequency[k]

for k in sorted(insertion_counts):
trinuc_output_data[f'p(insert length = {abs(k)} | insertion occurs)'] = insertion_freqency[k]

for k in sorted(snp_trans_freq):
trinuc_output_data[f'p({k[0]} --> {k[1]} | SNP occurs)'] = snp_trans_freq[k]

trinuc_output_path = Path(output)
trinuc_output_path = trinuc_output_path.with_suffix('')
trinuc_output_path = trinuc_output_path.with_suffix('.trinuc.pickle.gz')
_LOG.info(f'Saving trinucleotide counts to: {trinuc_output_path}')

with open_output(trinuc_output_path, 'w+') as trinuc_outfile:

pickle.dump(trinuc_output_data, trinuc_outfile)

# Human-readable content of file below
trinuc_outfile.write('\n')

for key, value in trinuc_output_data.items():
trinuc_outfile.write(f'{key}: {value}\n')

_LOG.info(f'p(snp) = {average_snp_freq}')
_LOG.info(f'p(insertion) = {average_insertion_frequency}')
_LOG.info(f'p(deletion) = {average_deletion_frequency}')
Expand All @@ -360,8 +324,8 @@ def runner(reference_index,
transition_matrix=snp_transition_bias,
trinuc_trans_matrices=trinuc_transition_bias,
trinuc_mut_bias=trinuc_mutation_probs,
insert_len_model=insertion_counts,
deletion_len_model=deletion_counts
insert_len_model=insertion_freqency,
deletion_len_model=deletion_frequency
)

print('\nSaving model...')
Expand Down Expand Up @@ -405,6 +369,8 @@ def compute_mut_runner(reference,
if outcounts:
validate_input_path(outcounts)
outcounts = Path(outcounts)
elif save_trinuc:
outcounts = Path(output + '.trinuc.pickle.gz')

print('Processing reference...')
reference_index = SeqIO.index(reference, 'fasta')
Expand Down Expand Up @@ -477,8 +443,18 @@ def compute_mut_runner(reference,
output = Path(output + '.pickle.gz')
validate_output_path(output, overwrite=overwrite_output)

runner(reference_index, vcf_to_process, vcf_columns, outcounts, show_trinuc, save_trinuc,
output, bed, human_sample, skip_common)
runner(
reference_index,
vcf_to_process,
vcf_columns,
outcounts,
show_trinuc,
save_trinuc,
output,
bed,
human_sample,
skip_common
)

if os.path.exists('temp.vcf'):
os.remove('temp.vcf')
Expand Down
18 changes: 10 additions & 8 deletions neat/gen_mut_model/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

import json
import sys
import pickle
import gzip

import numpy as np

Expand Down Expand Up @@ -165,14 +167,16 @@ def cluster_list(list_to_cluster, delta):
def count_trinucleotides(reference_index,
bed,
trinuc_counts,
matching_chroms):
matching_chroms,
save_trinuc_file):
"""
Counts the frequency of the various trinucleotide combinations in the dataset

:param dict reference_index: The index of the fasta, from SeqIO
:param Path bed: Full path to bed file, if using
:param Path trinuc_counts: Full path to existing trinucleotide counts file, if input, or the output path
:param list matching_chroms: List of matching chromosomes
:param bool save_trinuc_file: Whether to save the trinucleotide counts file.
:return dict, int: A dictionary of trinucleotides and counts, and the number of bases spanned
"""
# Data format: TRINUC: COUNT (e.g., "AAA": 12), where COUNT is the number of times trinuc was observed
Expand All @@ -183,11 +187,11 @@ def count_trinucleotides(reference_index,
track_len = 0

trinuc_counts_exists = False
save_trinuc_file = False
if trinuc_counts:
if trinuc_counts.is_file():
_LOG.info("Trinucleotide counts file exists, skipping save to avoid overwriting data.")
trinuc_counts_exists = True
save_trinuc_file = True
save_trinuc_file = False

if bed:
_LOG.info("Counting trinucleotide combinations in bed regions")
Expand All @@ -207,7 +211,6 @@ def count_trinucleotides(reference_index,
trinuc_ref_count = count_trinuc(trinuc, trinuc_ref_count)

# Solution to attribute error (needs to be checked)
# TODO remove ref_name from this dict
elif not trinuc_counts_exists:
_LOG.info("Counting trinucleotide combinations in reference.")
for ref_name in matching_chroms:
Expand All @@ -220,13 +223,12 @@ def count_trinucleotides(reference_index,
with open_output(trinuc_counts, 'w') as countfile:
_LOG.info('Saving trinuc counts to file...')
# Convert all values to writable formats
trinuc_ref_count = {str(x): int(y) for x, y in trinuc_ref_count.items()}
countfile.write(json.dumps(trinuc_ref_count))
pickle.dump(trinuc_ref_count, countfile)

else:
_LOG.info(f'Loading file: {trinuc_counts}.')
with open_input(trinuc_counts) as counts:
trinuc_ref_count = json.load(counts)
with gzip.open(trinuc_counts, 'rb') as counts:
trinuc_ref_count = pickle.load(counts)
if save_trinuc_file:
_LOG.warning('Existing trinucleotide file will not be changed.')

Expand Down
15 changes: 10 additions & 5 deletions neat/model_sequencing_error/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,9 +62,11 @@ def model_seq_err_runner(
_LOG.debug(f"Quality offset: {offset}")

final_quality_scores: list
binned_scores = False
if len(qual_scores) == 1:
final_quality_scores = list(range(1, qual_scores[0] + 1))
else:
binned_scores = True
final_quality_scores = sorted(qual_scores)

_LOG.debug(f'Quality scores: {final_quality_scores}')
Expand Down Expand Up @@ -109,11 +111,14 @@ def model_seq_err_runner(
for file in files:
file_num += 1
_LOG.info(f'Reading file {file_num} of {len(files)}')
parameters_by_position, file_avg_error, read_length = parse_file(file,
final_quality_scores,
num_records_to_process,
offset,
read_length)
parameters_by_position, file_avg_error, read_length = parse_file(
file,
final_quality_scores,
num_records_to_process,
offset,
read_length,
binned_scores
)

read_parameters.append(parameters_by_position)
average_errors.append(file_avg_error)
Expand Down
6 changes: 1 addition & 5 deletions neat/model_sequencing_error/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@

from scipy.stats import mode
from ..common import open_input
from ..models import take_closest

__all__ = [
"parse_file"
Expand Down Expand Up @@ -139,10 +138,7 @@ def parse_file(input_file: str, quality_scores: list, max_reads: int, qual_offse

for j in range(read_length):
# The qualities of each read_position_scores
quality_bin = take_closest(quality_scores, qualities_to_check[j])
bin_index = quality_scores.index(quality_bin)
temp_q_count[j][bin_index] += 1
qual_score_counter[quality_bin] += 1
qual_score_counter[qualities_to_check[j]] += 1

if records_read % quarters == 0:
_LOG.info(f'reading data: {(records_read / max_reads) * 100:.0f}%')
Expand Down
1 change: 0 additions & 1 deletion neat/models/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1 @@
from .models import *
from .utils import *
4 changes: 2 additions & 2 deletions neat/models/default_sequencing_error_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@
[0.2505, 0.2552, 0.4943, 0.0]]
)

# This list may not be the final list
default_quality_scores = np.array([2, 11, 25, 37])
# All numbers from 1-42
default_quality_scores = np.arange(1, 43)

# This puts a high probability toward getting a maximum quality score. The current values
# should be considered temporary. We're working on final values.
Expand Down
35 changes: 22 additions & 13 deletions neat/models/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
from ..common import TRINUC_IND, ALLOWED_NUCL, NUC_IND, DINUC_IND
from .default_mutation_model import *
from .default_sequencing_error_model import *
from .utils import bin_scores, take_closest

__all__ = [
"MutationModel",
Expand Down Expand Up @@ -60,7 +59,9 @@ class InsertionModel(VariantModel):
def __init__(self,
insert_len_model: dict[int: float, ...],
rng: Generator = None):
self.insert_len_model = insert_len_model
# Creating probabilities from the weights
tot = sum(insert_len_model.values())
self.insertion_len_model = {key: val / tot for key, val in insert_len_model.items()}
self.rng = rng

def get_insertion_length(self, size: int = None) -> int | list[int, ...]:
Expand All @@ -71,8 +72,8 @@ def get_insertion_length(self, size: int = None) -> int | list[int, ...]:
Greater than 1 returns a list of ints.
:return: int or list of ints.
"""
return self.rng.choice(a=list(self.insert_len_model),
p=[*self.insert_len_model.values()],
return self.rng.choice(a=list(self.insertion_len_model),
p=[*self.insertion_len_model.values()],
size=size, shuffle=False)


Expand All @@ -91,7 +92,9 @@ class DeletionModel(VariantModel):
def __init__(self,
deletion_len_model: dict[int: float, ...],
rng: Generator = None):
self.deletion_len_model = deletion_len_model
# Creating probabilities from the weights
tot = sum(deletion_len_model.values())
self.deletion_len_model = {key: val/tot for key, val in deletion_len_model.items()}
self.rng = rng

def get_deletion_length(self, size: int = None) -> int | list[int, ...]:
Expand Down Expand Up @@ -281,6 +284,9 @@ def generate_snv(self, trinucleotide: Seq, reference_location: int) -> SingleNuc
transition_matrix = self.trinuc_trans_matrices[DINUC_IND[trinucleotide[0] + "_" + trinucleotide[2]]]
# then determine the trans probs based on the middle nucleotide
transition_probs = transition_matrix[NUC_IND[trinucleotide[1]]]
# Creating probabilities from the weights
transition_sum = sum(transition_probs)
transition_probs = [x/transition_sum for x in transition_probs]
# Now pick a random alternate, weighted by the probabilities
alt = self.rng.choice(ALLOWED_NUCL, p=transition_probs)
temp_snv = SingleNucleotideVariant(reference_location, alt=alt)
Expand Down Expand Up @@ -376,11 +382,8 @@ def __init__(self,
self.insertion_model = insertion_model
self.uniform_quality_score = None
if self.is_uniform:
# bin scores returns a list, so we need the first (only) element of the list
converted_avg_err = bin_scores(self.quality_scores,
[int(-10. * np.log10(self.average_error))])[0]
# Set score to the lowest of the max of the quality scores and the bin closest to the input avg error.
self.uniform_quality_score = min([max(self.quality_scores), converted_avg_err])
# Set score to the lowest of the max of the quality scores and the input avg error.
self.uniform_quality_score = min([max(self.quality_scores), int(-10. * np.log10(self.average_error) + 0.5)])
self.rng = rng

def get_sequencing_errors(self,
Expand Down Expand Up @@ -498,7 +501,13 @@ def get_quality_scores(self,
for i in quality_index_map:
score = self.rng.normal(self.quality_score_probabilities[i][0],
scale=self.quality_score_probabilities[i][1])
score = take_closest(self.quality_scores, score)
# make sure score is in range and an int
score = round(score)
if score > 42:
score = 42
if score < 1:
score = 1

temp_qual_array.append(score)

if self.rescale_qualities:
Expand All @@ -509,9 +518,9 @@ def get_quality_scores(self,
self.quality_score_error_rate[n]) + 0.5)])
for n in temp_qual_array]
# Now rebin the quality scores.
temp_qual_array = np.array(bin_scores(self.quality_scores, rescaled_quals))
temp_qual_array = np.array(rescaled_quals)
else:
temp_qual_array = np.array(bin_scores(self.quality_scores, temp_qual_array))
temp_qual_array = np.array(temp_qual_array)

return temp_qual_array[:input_read_length]

Expand Down
Loading
Loading