From 396cebb2a310e1dee40b35e6a73caae78722d474 Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Wed, 16 Oct 2024 17:06:46 -0400 Subject: [PATCH 01/12] functionality seems fixed, need to fix verbosity and tests --- .../fastq_validator.py | 7 +- .../fastq_validator_logic.py | 203 ++++++++++-------- tests/test_fastq_validator_logic.py | 91 ++++++++ 3 files changed, 208 insertions(+), 93 deletions(-) diff --git a/src/ingest_validation_tests/fastq_validator.py b/src/ingest_validation_tests/fastq_validator.py index b30ca75..d9acee4 100644 --- a/src/ingest_validation_tests/fastq_validator.py +++ b/src/ingest_validation_tests/fastq_validator.py @@ -1,4 +1,3 @@ -from os import cpu_count from typing import List, Optional from fastq_validator_logic import FASTQValidatorLogic, _log @@ -11,10 +10,10 @@ class FASTQValidator(Validator): version = "1.0" def collect_errors(self, **kwargs) -> List[Optional[str]]: - threads = kwargs.get("coreuse", None) or cpu_count() // 4 or 1 - _log(f"Threading at FastQValidator with {threads}") + del kwargs + _log(f"Threading at FastQValidator with {self.thread_count}") validator = FASTQValidatorLogic(verbose=True) - validator.validate_fastq_files_in_path(self.paths, threads) + validator.validate_fastq_files_in_path(self.paths, self.thread_count) if validator.errors: return validator.errors elif validator.files_were_found: diff --git a/src/ingest_validation_tests/fastq_validator_logic.py b/src/ingest_validation_tests/fastq_validator_logic.py index baf8e45..f5c4699 100644 --- a/src/ingest_validation_tests/fastq_validator_logic.py +++ b/src/ingest_validation_tests/fastq_validator_logic.py @@ -2,32 +2,51 @@ import gzip import logging import re -from collections import defaultdict -from multiprocessing import Lock, Manager, Pool +from collections import defaultdict, namedtuple +from multiprocessing import Manager, Pool +from multiprocessing.managers import ListProxy +from os import cpu_count from pathlib import Path -from typing import Callable, List, Optional, TextIO +from typing import Callable, List, Optional, TextIO, Union import fastq_utils +from typing_extensions import Self + +filename_pattern = namedtuple("filename_pattern", ["prefix", "read_type", "set_num"]) def is_valid_filename(filename: str) -> bool: return bool(fastq_utils.FASTQ_PATTERN.fullmatch(filename)) +def get_prefix_read_type_and_set(filename: str) -> Optional[filename_pattern]: + if not bool(fastq_utils.FASTQ_PATTERN.fullmatch(filename)): + return + pattern = re.compile( + r"(?P.*(?=[_](?:(?PR|.read|I)\d)))(?:_(?P=read_type)\d_)(?P\d+)" + ) + groups = pattern.match(filename) + if groups and all(x in groups.groupdict().keys() for x in ["prefix", "read_type", "set"]): + return filename_pattern( + groups.group("prefix"), groups.group("read_type"), groups.group("set") + ) + + def _open_fastq_file(file: Path) -> TextIO: return gzip.open(file, "rt") if file.name.endswith(".gz") else file.open() -def _log(message: str) -> str: - print(message) - return message +def _log(message: str, verbose: bool = True) -> Optional[str]: + if verbose: + print(message) + return message class Engine(object): def __init__(self, validate_object): self.validate_object = validate_object - def __call__(self, fastq_file): + def __call__(self, fastq_file) -> list[Optional[str]]: errors = [] _log(f"Validating matching fastq file {fastq_file}") self.validate_object.validate_fastq_file(fastq_file) @@ -56,8 +75,9 @@ class FASTQValidatorLogic: def __init__(self, verbose=False): self.errors: List[Optional[str]] = [] self.files_were_found = False + self.file_list = Manager().list() self._file_record_counts = Manager().dict() - self._file_prefix_counts = Manager().dict() + self._ungrouped_files = Manager().list() self._filename = "" self._line_number = 0 @@ -73,8 +93,7 @@ def _format_error(self, error: str) -> str: message = f"{location}: {error}" - if self._verbose: - _log(message) + print(message) return message def _validate_fastq_line_1(self, line: str) -> List[str]: @@ -113,6 +132,16 @@ def _validate_fastq_line_4(self, line: str) -> List[str]: ) return errors + def _make_groups(self) -> dict[filename_pattern, list[Path]]: + groups = defaultdict(list) + for file in self.file_list: + potential_match = get_prefix_read_type_and_set(file.name) + if potential_match: + groups[potential_match].append(file) + else: + self._ungrouped_files.append(file) + return groups + _VALIDATE_FASTQ_LINE_METHODS = { 1: _validate_fastq_line_1, 2: _validate_fastq_line_2, @@ -123,9 +152,9 @@ def _validate_fastq_line_4(self, line: str) -> List[str]: def validate_fastq_record(self, line: str, line_number: int) -> List[str]: line_index = line_number % 4 + 1 - validator_method: Callable[[FASTQValidatorLogic, str], List[str]] = ( - self._VALIDATE_FASTQ_LINE_METHODS[line_index] - ) + validator_method: Callable[[Self, str], List[str]] = self._VALIDATE_FASTQ_LINE_METHODS[ + line_index + ] assert validator_method, f"No validator method defined for record index {line_index}" @@ -179,97 +208,89 @@ def validate_fastq_file(self, fastq_file: Path) -> None: self._file_record_counts[str(fastq_file)] = records_read def validate_fastq_files_in_path(self, paths: List[Path], threads: int) -> None: - data_found_one = [] - dirs_and_files = defaultdict(dict) + """ + - Builds a list of filepaths; checks for duplicate filenames in upload. + - [parallel] Opens, validates, and gets line count of each file in list, and then + populates self._file_record_counts as {filepath: record_count}. + - If successful, groups files with matching prefix/read_type/set_num values. + - Compares record_counts across grouped files, logs any that don't match or are ungrouped. + """ for path in paths: - dirs_and_files[path] = fastq_utils.collect_fastq_files_by_directory(path) - _log( - f"FASTQValidatorLogic: Added files from {path} to dirs_and_files: {dirs_and_files}" - ) - file_list = [] + fastq_utils_output = fastq_utils.collect_fastq_files_by_directory(path) + for files in fastq_utils_output.values(): + self.file_list.extend(files) + _log(f"FASTQValidatorLogic: Added files from {path} to file_list: {files}") + self.files_were_found = bool(self.file_list) + self._find_duplicates() + _log(f"File list: {self.file_list}") with Manager() as manager: - # TODO: re-evaluate dicts/for loops lock = manager.Lock() - for path, rel_paths in dirs_and_files.items(): - for rel_path, files in rel_paths.items(): - for file in files: - file_list.append(Path(path / rel_path / file)) - if file_list: - self.files_were_found = True + pool = Pool(threads) try: logging.info( - f"Passing file list for paths {paths} to engine. File list: {file_list}." + f"Passing file list for paths {self._printable_filenames(paths, newlines=False)} to engine. File list:" ) - pool = Pool(threads) + logging.info(self._printable_filenames(self.file_list)) engine = Engine(self) - data_output = pool.imap_unordered(engine, file_list) + data_output = pool.imap_unordered(engine, self.file_list) + [self.errors.extend(output) for output in data_output if output] except Exception as e: - _log(f"Error {e}") pool.close() pool.join() - data_found_one.append(f"Error {e}") + _log(f"Error {e}") + self.errors.append(f"Error {e}") else: pool.close() pool.join() - self._find_duplicates(dirs_and_files) - self._find_shared_prefixes(lock) - [data_found_one.extend(output) for output in data_output if output] - - if len(data_found_one) > 0: - self.errors.extend(data_found_one) - - def _find_duplicates(self, dirs_and_files): - # TODO: re-evaluate dicts/for loops - for data_path, sub_dirs in dirs_and_files.items(): - # Creates a dict of filenames to list of full filepaths for each - # fastq file in a given data_path (dataset dir). - files_per_path = defaultdict(list) - for sub_path, filepaths in sub_dirs.items(): - for filepath in filepaths: - files_per_path[filepath.name].append(data_path / sub_path) - for filename, filepaths in files_per_path.items(): - if len(filepaths) > 1: + groups = self._make_groups() + if self._ungrouped_files: + _log(f"Ungrouped files, counts not checked: {self._ungrouped_files}") + self._find_counts(groups, lock) + + def _find_duplicates(self) -> None: + """ + Transforms data from {path: [filepaths]} to {filepath.name: [paths]} + to ensure that each filename only appears once in an upload + """ + files_per_path = defaultdict(list) + for filepath in self.file_list: + files_per_path[filepath.name].append(filepath.parents[0]) + for filename, filepaths in files_per_path.items(): + if len(filepaths) > 1: + self.errors.append( + _log( + f"{filename} has been found multiple times during this validation. " + f"Locations of duplicates: {str(filepaths)}." + ) + ) + + def _find_counts(self, groups: dict[filename_pattern, list[Path]], lock): + with lock: + for pattern, paths in groups.items(): + comparison = {} + for path in paths: + comparison[path] = self._file_record_counts.get(str(path)) + if not (len(set(comparison.values())) == 1): self.errors.append( - _log( - f"{filename} has been found multiple times during this validation. " - f"Locations of duplicates: {filepaths}." # noqa: E501 - ) + f"Counts do not match among files matching pattern {pattern.prefix}_{pattern.read_type}#_{pattern.set_num}: {comparison}" + ) + else: + _log( + f"PASSED: Record count comparison for files matching pattern {pattern.prefix}_{pattern.read_type}#_{pattern.set_num}: {comparison}" ) - def _find_shared_prefixes(self, lock): - # This pattern seeks out the string that includes the lane number (since - # that is expected to be present to help anchor the prefix) that comes - # before any of _I1, _I2, _R1, or _R2. - fastq_file_prefix_regex = re.compile(r"(.+_L\d+.*)_[IR][12][._]") - for fastq_file, records_read in self._file_record_counts.items(): - match = fastq_file_prefix_regex.match(Path(fastq_file).name) - with lock: - if match: - filename_prefix = match.group(1) - if filename_prefix in self._file_prefix_counts.keys(): - extant_count = self._file_prefix_counts[filename_prefix] - if extant_count != records_read: - # Find a file we've validated already that matches this - # prefix. - extant_files = [ - str(Path(filepath).name) - for filepath, record_count in self._file_record_counts.items() - if record_count == extant_count - and Path(filepath).name.startswith(filename_prefix) - ] - # Based on how the dictionaries are created, there should - # always be at least one matching filename. - assert extant_files - - self.errors.append( - _log( - f"{Path(fastq_file).name} ({records_read} lines) " - f"does not match length of {extant_files[0]} " - f"({extant_count} lines)." - ) - ) - else: - self._file_prefix_counts[filename_prefix] = records_read + def _printable_filenames( + self, files: Union[list, ListProxy, Path, str], newlines: bool = True + ): + if type(files) is list or type(files) is ListProxy: + file_list = [str(file) for file in files] + if newlines: + return "\n".join(file_list) + return file_list + elif type(files) is Path: + return str(files) + elif type(files) is str: + return files def main(): @@ -277,6 +298,7 @@ def main(): parser.add_argument( "filepaths", type=Path, nargs="+", help="Files to validate for FASTQ syntax" ) + parser.add_argument("coreuse", type=int, help="Number of cores to use") args = parser.parse_args() if isinstance(args.filepaths, List): @@ -289,7 +311,10 @@ def main(): raise Exception(f"Validator init received base_paths arg as type {type(args.filepaths)}") validator = FASTQValidatorLogic(True) - validator.validate_fastq_files_in_path(filepaths, Lock()) + if not (threads := args.coreuse): + cpus = cpu_count() + threads = cpus // 4 if cpus else 1 + validator.validate_fastq_files_in_path(filepaths, threads) if __name__ == "__main__": diff --git a/tests/test_fastq_validator_logic.py b/tests/test_fastq_validator_logic.py index 31bd68a..2a3e131 100644 --- a/tests/test_fastq_validator_logic.py +++ b/tests/test_fastq_validator_logic.py @@ -202,3 +202,94 @@ def test_fastq_validator_record_counts_bad(self, fastq_validator, tmp_path): assert "(4 lines)" in fastq_validator.errors[0] assert "does not match" in fastq_validator.errors[0] assert "(8 lines)" in fastq_validator.errors[0] + + def test_fastq_comparison_good(self, fastq_validator, tmp_path): + filenames = [ + "3252_ftL_RNA_T1_S31_L003_R1_001.fastq", + "3252_ftL_RNA_T1_S31_L003_R2_001.fastq", + "3252_ftL_RNA_T1_S31_L003_R1_002.fastq", + "3252_ftL_RNA_T1_S31_L003_R2_002.fastq", + ] + for filename in filenames: + new_file = tmp_path.joinpath(filename) + with _open_output_file(new_file, False) as output: + output.write(_GOOD_RECORDS) + + fastq_validator.validate_fastq_files_in_path([tmp_path], 2) + + assert not fastq_validator.errors + + # def test_fastq_comparison_bad_extra_R(self, fastq_validator, tmp_path): + # filenames = [ + # "3252_ftL_RNA_T1_S31_L003_R1_001.fastq", + # "3252_ftL_RNA_T1_S31_L003_R2_001.fastq", + # "3252_ftL_RNA_T1_S31_L003_R1_002.fastq", + # "3252_ftL_RNA_T1_S31_L003_R2_002.fastq", + # "3252_ftL_RNA_T1_S31_L003_R3_001.fastq", + # ] + # for filename in filenames: + # new_file = tmp_path.joinpath(filename) + # with _open_output_file(new_file, False) as output: + # output.write(_GOOD_RECORDS) + # + # fastq_validator.validate_fastq_files_in_path([tmp_path], 2) + # assert "IndexError: list index out of range" in fastq_validator.errors[0] + + # def test_fastq_comparison_bad_unpaired_R(self, fastq_validator, tmp_path): + # filenames = [ + # "3252_ftL_RNA_T1_S31_L003_R1_001.fastq", + # "3252_ftL_RNA_T1_S31_L003_R2_001.fastq", + # "3252_ftL_RNA_T1_S31_L003_R1_002.fastq", + # ] + # for filename in filenames: + # new_file = tmp_path.joinpath(filename) + # with _open_output_file(new_file, False) as output: + # output.write(_GOOD_RECORDS) + # + # fastq_validator.validate_fastq_files_in_path([tmp_path], 2) + # + # assert fastq_validator.errors + # + # def test_fastq_comparison_bad_mixed_I_and_R(self, fastq_validator, tmp_path): + # filenames = [ + # "3252_ftL_RNA_T1_S31_L003_R1_001.fastq", + # "3252_ftL_RNA_T1_S31_L003_R2_001.fastq", + # "3252_ftL_RNA_T1_S31_L003_I1_002.fastq", + # ] + # for filename in filenames: + # new_file = tmp_path.joinpath(filename) + # with _open_output_file(new_file, False) as output: + # output.write(_GOOD_RECORDS) + # + # fastq_validator.validate_fastq_files_in_path([tmp_path], 2) + # + # assert fastq_validator.errors + # + # def test_fastq_comparison_bad_extra_unmatched_fastq(self, fastq_validator, tmp_path): + # filenames = [ + # "3252_ftL_RNA_T1_S31_L003_R1_001.fastq", + # "3252_ftL_RNA_T1_S31_L003_R2_001.fastq", + # "bad_ftL_RNA_T1_S31_L003_R1_001.fastq", + # ] + # for filename in filenames: + # new_file = tmp_path.joinpath(filename) + # with _open_output_file(new_file, False) as output: + # output.write(_GOOD_RECORDS) + # + # fastq_validator.validate_fastq_files_in_path([tmp_path], 2) + # + # assert fastq_validator.errors + + def test_fastq_comparison_bad_unequal_line_counts(self, fastq_validator, tmp_path): + good_file = "3252_ftL_RNA_T1_S31_L003_R1_001.fastq" + bad_file = "3252_ftL_RNA_T1_S31_L003_R2_001.fastq" + new_good_file = tmp_path.joinpath(good_file) + with _open_output_file(new_good_file, False) as output: + output.write(_GOOD_RECORDS) + new_bad_file = tmp_path.joinpath(bad_file) + with _open_output_file(new_bad_file, False) as output: + output.write("bad") + + fastq_validator.validate_fastq_files_in_path([tmp_path], 2) + + assert fastq_validator.errors From 7c93ab575a3aaad45a4ce677b4f6f374658cf9e3 Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Mon, 21 Oct 2024 17:12:14 -0400 Subject: [PATCH 02/12] added tests --- .../fastq_validator_logic.py | 14 +- tests/test_fastq_validator_logic.py | 175 +++++++++++------- 2 files changed, 119 insertions(+), 70 deletions(-) diff --git a/src/ingest_validation_tests/fastq_validator_logic.py b/src/ingest_validation_tests/fastq_validator_logic.py index f5c4699..bb34106 100644 --- a/src/ingest_validation_tests/fastq_validator_logic.py +++ b/src/ingest_validation_tests/fastq_validator_logic.py @@ -20,10 +20,18 @@ def is_valid_filename(filename: str) -> bool: def get_prefix_read_type_and_set(filename: str) -> Optional[filename_pattern]: + """ + PREFIX (?P.*(?:L\\d+)(?=[_](?:(?P(?:R|read)(?=[123])|I(?=[12]))))) + - (?P | named capture group "prefix" + - .*(?:L\\d+) | match anything before pattern L# (where # represents 1 or more digits) + - (?=[_](?:(?P(?:R|read)(?=[123])|I(?=[12])))) | only capture above match if followed by the sequence _[R1,R2,R3,read1,read2,read3,I1,I2] + + """ if not bool(fastq_utils.FASTQ_PATTERN.fullmatch(filename)): return + # looking for fastq filenames matching pattern __[I1,I2,R1,R2,R3]_ pattern = re.compile( - r"(?P.*(?=[_](?:(?PR|.read|I)\d)))(?:_(?P=read_type)\d_)(?P\d+)" + r"(?P.*(?:L\d+)(?=[_](?:(?P(?:R|read)(?=[123]_)|I(?=[12]_)))))" ) groups = pattern.match(filename) if groups and all(x in groups.groupdict().keys() for x in ["prefix", "read_type", "set"]): @@ -140,6 +148,8 @@ def _make_groups(self) -> dict[filename_pattern, list[Path]]: groups[potential_match].append(file) else: self._ungrouped_files.append(file) + for group in groups.values(): + group.sort() return groups _VALIDATE_FASTQ_LINE_METHODS = { @@ -269,7 +279,7 @@ def _find_counts(self, groups: dict[filename_pattern, list[Path]], lock): for pattern, paths in groups.items(): comparison = {} for path in paths: - comparison[path] = self._file_record_counts.get(str(path)) + comparison[str(path)] = self._file_record_counts.get(str(path)) if not (len(set(comparison.values())) == 1): self.errors.append( f"Counts do not match among files matching pattern {pattern.prefix}_{pattern.read_type}#_{pattern.set_num}: {comparison}" diff --git a/tests/test_fastq_validator_logic.py b/tests/test_fastq_validator_logic.py index 2a3e131..dfdffae 100644 --- a/tests/test_fastq_validator_logic.py +++ b/tests/test_fastq_validator_logic.py @@ -1,10 +1,15 @@ import gzip -from pathlib import Path +from pathlib import Path, PosixPath from typing import TextIO +import fastq_utils import pytest -from src.ingest_validation_tests.fastq_validator_logic import FASTQValidatorLogic +from src.ingest_validation_tests.fastq_validator_logic import ( + FASTQValidatorLogic, + filename_pattern, + get_prefix_read_type_and_set, +) _GOOD_RECORDS = """\ @A12345:123:A12BCDEFG:1:1234:1000:1234 1:N:0:NACTGACTGA+CTGACTGACT @@ -197,11 +202,14 @@ def test_fastq_validator_record_counts_bad(self, fastq_validator, tmp_path): fastq_validator.validate_fastq_files_in_path([tmp_path], 2) - # Order of the files being processed is not guaranteed, however these - # strings ensure that a mismatch was found. - assert "(4 lines)" in fastq_validator.errors[0] - assert "does not match" in fastq_validator.errors[0] - assert "(8 lines)" in fastq_validator.errors[0] + # Non-matching records only stored in errors, need to do ugly string match + assert "Counts do not match" in fastq_validator.errors[0] + assert ( + "SREQ-1_1-ACTGACTGAC-TGACTGACTG_S1_L001_I1_001.fastq': 4" in fastq_validator.errors[0] + ) + assert ( + "SREQ-1_1-ACTGACTGAC-TGACTGACTG_S1_L001_I2_001.fastq': 8" in fastq_validator.errors[0] + ) def test_fastq_comparison_good(self, fastq_validator, tmp_path): filenames = [ @@ -219,67 +227,6 @@ def test_fastq_comparison_good(self, fastq_validator, tmp_path): assert not fastq_validator.errors - # def test_fastq_comparison_bad_extra_R(self, fastq_validator, tmp_path): - # filenames = [ - # "3252_ftL_RNA_T1_S31_L003_R1_001.fastq", - # "3252_ftL_RNA_T1_S31_L003_R2_001.fastq", - # "3252_ftL_RNA_T1_S31_L003_R1_002.fastq", - # "3252_ftL_RNA_T1_S31_L003_R2_002.fastq", - # "3252_ftL_RNA_T1_S31_L003_R3_001.fastq", - # ] - # for filename in filenames: - # new_file = tmp_path.joinpath(filename) - # with _open_output_file(new_file, False) as output: - # output.write(_GOOD_RECORDS) - # - # fastq_validator.validate_fastq_files_in_path([tmp_path], 2) - # assert "IndexError: list index out of range" in fastq_validator.errors[0] - - # def test_fastq_comparison_bad_unpaired_R(self, fastq_validator, tmp_path): - # filenames = [ - # "3252_ftL_RNA_T1_S31_L003_R1_001.fastq", - # "3252_ftL_RNA_T1_S31_L003_R2_001.fastq", - # "3252_ftL_RNA_T1_S31_L003_R1_002.fastq", - # ] - # for filename in filenames: - # new_file = tmp_path.joinpath(filename) - # with _open_output_file(new_file, False) as output: - # output.write(_GOOD_RECORDS) - # - # fastq_validator.validate_fastq_files_in_path([tmp_path], 2) - # - # assert fastq_validator.errors - # - # def test_fastq_comparison_bad_mixed_I_and_R(self, fastq_validator, tmp_path): - # filenames = [ - # "3252_ftL_RNA_T1_S31_L003_R1_001.fastq", - # "3252_ftL_RNA_T1_S31_L003_R2_001.fastq", - # "3252_ftL_RNA_T1_S31_L003_I1_002.fastq", - # ] - # for filename in filenames: - # new_file = tmp_path.joinpath(filename) - # with _open_output_file(new_file, False) as output: - # output.write(_GOOD_RECORDS) - # - # fastq_validator.validate_fastq_files_in_path([tmp_path], 2) - # - # assert fastq_validator.errors - # - # def test_fastq_comparison_bad_extra_unmatched_fastq(self, fastq_validator, tmp_path): - # filenames = [ - # "3252_ftL_RNA_T1_S31_L003_R1_001.fastq", - # "3252_ftL_RNA_T1_S31_L003_R2_001.fastq", - # "bad_ftL_RNA_T1_S31_L003_R1_001.fastq", - # ] - # for filename in filenames: - # new_file = tmp_path.joinpath(filename) - # with _open_output_file(new_file, False) as output: - # output.write(_GOOD_RECORDS) - # - # fastq_validator.validate_fastq_files_in_path([tmp_path], 2) - # - # assert fastq_validator.errors - def test_fastq_comparison_bad_unequal_line_counts(self, fastq_validator, tmp_path): good_file = "3252_ftL_RNA_T1_S31_L003_R1_001.fastq" bad_file = "3252_ftL_RNA_T1_S31_L003_R2_001.fastq" @@ -293,3 +240,95 @@ def test_fastq_comparison_bad_unequal_line_counts(self, fastq_validator, tmp_pat fastq_validator.validate_fastq_files_in_path([tmp_path], 2) assert fastq_validator.errors + + def test_fastq_groups_good(self, fastq_validator, tmp_path): + files = [ + "20147_Healthy_PA_S1_L001_R1_001.fastq", + "20147_Healthy_PA_S1_L001_R2_001.fastq", + "20147_Healthy_PA_S1_L001_R3_001.fastq", + "20147_Healthy_PA_S1_L001_R1_002.fastq", + "20147_Healthy_PA_S1_L001_R2_002.fastq", + ] + for file in files: + with _open_output_file(tmp_path.joinpath(file), False) as output: + output.write(_GOOD_RECORDS) + + fastq_validator.validate_fastq_files_in_path([tmp_path], 2) + assert fastq_validator._make_groups() == { + filename_pattern(prefix="20147_Healthy_PA_S1_L001", read_type="R", set_num="001"): [ + PosixPath(tmp_path.joinpath("20147_Healthy_PA_S1_L001_R1_001.fastq")), + PosixPath(tmp_path.joinpath("20147_Healthy_PA_S1_L001_R2_001.fastq")), + PosixPath(tmp_path.joinpath("20147_Healthy_PA_S1_L001_R3_001.fastq")), + ], + filename_pattern(prefix="20147_Healthy_PA_S1_L001", read_type="R", set_num="002"): [ + PosixPath(tmp_path.joinpath("20147_Healthy_PA_S1_L001_R1_002.fastq")), + PosixPath(tmp_path.joinpath("20147_Healthy_PA_S1_L001_R2_002.fastq")), + ], + } + assert not fastq_validator.errors + + def test_fastq_groups_bad(self, fastq_validator, tmp_path): + good_files = [ + "20147_Healthy_PA_S1_L001_R1_001.fastq", + "20147_Healthy_PA_S1_L001_R2_001.fastq", + "20147_Healthy_PA_S1_L001_R1_002.fastq", + ] + bad_files = [ + "20147_Healthy_PA_S1_L001_R3_001.fastq", + "20147_Healthy_PA_S1_L001_R2_002.fastq", + ] + for file in good_files: + with _open_output_file(tmp_path.joinpath(file), False) as output: + output.write(_GOOD_RECORDS) + for file in bad_files: + with _open_output_file(tmp_path.joinpath(file), False) as output: + output.write("@bad") + + fastq_validator.validate_fastq_files_in_path([tmp_path], 2) + assert "Counts do not match" in fastq_validator.errors[0] + assert "20147_Healthy_PA_S1_L001_R2_002.fastq" in fastq_validator.errors[0] + assert "Counts do not match" in fastq_validator.errors[1] + assert "20147_Healthy_PA_S1_L001_R3_001.fastq" in fastq_validator.errors[1] + + def test_filename_valid_and_fastq_valid_but_not_grouped(self, fastq_validator, tmp_path): + # good_filenames[0:6] are valid but would not be grouped for comparison + good_filenames = [ + "B001A001_1.fastq", # no lane, read_type, or set + "B001A001_R1.fq", # no lane or set + "B001A001_I1.fastq.gz", # no lane or set + "H4L1-4_S64_R1_001.fastq.gz", # no lane + "H4L1-4_S64_L001_001.fastq.gz", # no read_type + "H4L1-4_S64_L001_R1.fastq.gz", # no set + "L001_H4L1-4_S64_R1_001.fastq.gz", # out of order + "H4L1-4_S64_L001_R1_001.fastq.gz", + "H4L1-4_S64_L001_R2_001.fastq.gz", + "H4L1-4_S64_L001_I1_001.fastq.gz", + "Undetermined_S0_L001_R1_001.W105_Small_bowel_ileum.trimmed.fastq.gz", # annotated but otherwise fits pattern + ] + for file in good_filenames: + use_gzip = bool("gz" in file) + with _open_output_file(tmp_path.joinpath(file), use_gzip) as output: + output.write(_GOOD_RECORDS) + + fastq_validator.validate_fastq_files_in_path([tmp_path], 2) + # All files in good_filenames should be in file_list + assert { + PosixPath(tmp_path.joinpath(file)) in fastq_validator.file_list + for file in good_filenames + } == {True} + # No errors should be found in any of those files + assert not fastq_validator.errors + # Only some files from good_filenames will match criteria for grouping + valid_filename_patterns = [ + get_prefix_read_type_and_set(str(file)) + for file in fastq_validator.file_list + if get_prefix_read_type_and_set(str(file)) is not None + ] + assert valid_filename_patterns == [ + filename_pattern(prefix=f"{tmp_path}/H4L1-4_S64_L001", read_type="R", set_num="001"), + filename_pattern(prefix=f"{tmp_path}/H4L1-4_S64_L001", read_type="R", set_num="001"), + filename_pattern(prefix=f"{tmp_path}/H4L1-4_S64_L001", read_type="I", set_num="001"), + filename_pattern( + prefix=f"{tmp_path}/Undetermined_S0_L001", read_type="R", set_num="001" + ), + ] From 4dd234db93320d8b10853e01f2865bf5b5199f9f Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Wed, 23 Oct 2024 14:30:55 -0400 Subject: [PATCH 03/12] fixing set_num regex --- src/ingest_validation_tests/fastq_validator_logic.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/ingest_validation_tests/fastq_validator_logic.py b/src/ingest_validation_tests/fastq_validator_logic.py index bb34106..f021782 100644 --- a/src/ingest_validation_tests/fastq_validator_logic.py +++ b/src/ingest_validation_tests/fastq_validator_logic.py @@ -29,14 +29,14 @@ def get_prefix_read_type_and_set(filename: str) -> Optional[filename_pattern]: """ if not bool(fastq_utils.FASTQ_PATTERN.fullmatch(filename)): return - # looking for fastq filenames matching pattern __[I1,I2,R1,R2,R3]_ + # looking for fastq filenames matching pattern __[I1,I2,R1,R2,R3]_ pattern = re.compile( - r"(?P.*(?:L\d+)(?=[_](?:(?P(?:R|read)(?=[123]_)|I(?=[12]_)))))" + r"(?P.*(?:L\d+)(?=[_](?:(?P(?:R|read)(?=[123])|I(?=[12]))(?:\d?_)(?P\d+))))" ) groups = pattern.match(filename) - if groups and all(x in groups.groupdict().keys() for x in ["prefix", "read_type", "set"]): + if groups and all(x in groups.groupdict().keys() for x in ["prefix", "read_type", "set_num"]): return filename_pattern( - groups.group("prefix"), groups.group("read_type"), groups.group("set") + groups.group("prefix"), groups.group("read_type"), groups.group("set_num") ) From 0e3e9a604d0408dec25efe88244c6a74e4c3b326 Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Wed, 23 Oct 2024 15:06:57 -0400 Subject: [PATCH 04/12] fixes from testing --- .../fastq_validator.py | 4 ++++ .../fastq_validator_logic.py | 21 ++++++++++++++----- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/src/ingest_validation_tests/fastq_validator.py b/src/ingest_validation_tests/fastq_validator.py index d9acee4..1178e83 100644 --- a/src/ingest_validation_tests/fastq_validator.py +++ b/src/ingest_validation_tests/fastq_validator.py @@ -8,9 +8,13 @@ class FASTQValidator(Validator): description = "Check FASTQ files for basic syntax and consistency." cost = 15.0 version = "1.0" + # need to add to parent validator class in IVT, hack for now + thread_count = None def collect_errors(self, **kwargs) -> List[Optional[str]]: del kwargs + if not self.thread_count: + self.thread_count = 1 _log(f"Threading at FastQValidator with {self.thread_count}") validator = FASTQValidatorLogic(verbose=True) validator.validate_fastq_files_in_path(self.paths, self.thread_count) diff --git a/src/ingest_validation_tests/fastq_validator_logic.py b/src/ingest_validation_tests/fastq_validator_logic.py index f021782..43fd43e 100644 --- a/src/ingest_validation_tests/fastq_validator_logic.py +++ b/src/ingest_validation_tests/fastq_validator_logic.py @@ -21,17 +21,22 @@ def is_valid_filename(filename: str) -> bool: def get_prefix_read_type_and_set(filename: str) -> Optional[filename_pattern]: """ - PREFIX (?P.*(?:L\\d+)(?=[_](?:(?P(?:R|read)(?=[123])|I(?=[12]))))) + PREFIX (?P.*(?:L\\d+) - (?P | named capture group "prefix" - .*(?:L\\d+) | match anything before pattern L# (where # represents 1 or more digits) - - (?=[_](?:(?P(?:R|read)(?=[123])|I(?=[12])))) | only capture above match if followed by the sequence _[R1,R2,R3,read1,read2,read3,I1,I2] - + - only capture if read_type found + READ_TYPE + - (?=_ | assert that the character "_" is present at the beginning of this next group + - (?:(?PR|read(?=[123])|I(?=[12]))) | only capture above match if followed by the sequence _[R1,R2,R3,read1,read2,read3,I1,I2]; capture R/I/read as read_type + SET_NUM + - (?:\\d_ | ensure presence of "#_" before group, ignore characters + - (?P\\d+) | capture group set_num of 1 or more digits """ if not bool(fastq_utils.FASTQ_PATTERN.fullmatch(filename)): return # looking for fastq filenames matching pattern __[I1,I2,R1,R2,R3]_ pattern = re.compile( - r"(?P.*(?:L\d+)(?=[_](?:(?P(?:R|read)(?=[123])|I(?=[12]))(?:\d?_)(?P\d+))))" + r"(?P.*(?:L\d+)(?=_(?PR|read(?=[123])|I(?=[12]))(?:\d_(?P\d+))))" ) groups = pattern.match(filename) if groups and all(x in groups.groupdict().keys() for x in ["prefix", "read_type", "set_num"]): @@ -253,9 +258,9 @@ def validate_fastq_files_in_path(self, paths: List[Path], threads: int) -> None: pool.close() pool.join() groups = self._make_groups() + self._find_counts(groups, lock) if self._ungrouped_files: _log(f"Ungrouped files, counts not checked: {self._ungrouped_files}") - self._find_counts(groups, lock) def _find_duplicates(self) -> None: """ @@ -277,6 +282,12 @@ def _find_duplicates(self) -> None: def _find_counts(self, groups: dict[filename_pattern, list[Path]], lock): with lock: for pattern, paths in groups.items(): + if len(paths) == 1: + # This would happen if there was a file that matched the prefix_read_set pattern + # but did not have a counterpart for comparison; this probably should not happen but + # is currently only logged and does not throw an exception + self._ungrouped_files.append(paths[0]) + continue comparison = {} for path in paths: comparison[str(path)] = self._file_record_counts.get(str(path)) From 5067d375376de6a6a7a675311a9813799db8498f Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Wed, 23 Oct 2024 15:14:43 -0400 Subject: [PATCH 05/12] slight tweaks to tests --- tests/test_fastq_validator_logic.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/tests/test_fastq_validator_logic.py b/tests/test_fastq_validator_logic.py index dfdffae..856f421 100644 --- a/tests/test_fastq_validator_logic.py +++ b/tests/test_fastq_validator_logic.py @@ -272,6 +272,7 @@ def test_fastq_groups_bad(self, fastq_validator, tmp_path): "20147_Healthy_PA_S1_L001_R1_001.fastq", "20147_Healthy_PA_S1_L001_R2_001.fastq", "20147_Healthy_PA_S1_L001_R1_002.fastq", + "test_not_grouped_but_valid", ] bad_files = [ "20147_Healthy_PA_S1_L001_R3_001.fastq", @@ -291,7 +292,7 @@ def test_fastq_groups_bad(self, fastq_validator, tmp_path): assert "20147_Healthy_PA_S1_L001_R3_001.fastq" in fastq_validator.errors[1] def test_filename_valid_and_fastq_valid_but_not_grouped(self, fastq_validator, tmp_path): - # good_filenames[0:6] are valid but would not be grouped for comparison + # good_filenames[0:6] are valid in pipeline processing but would not be grouped for comparison good_filenames = [ "B001A001_1.fastq", # no lane, read_type, or set "B001A001_R1.fq", # no lane or set @@ -332,3 +333,15 @@ def test_filename_valid_and_fastq_valid_but_not_grouped(self, fastq_validator, t prefix=f"{tmp_path}/Undetermined_S0_L001", read_type="R", set_num="001" ), ] + assert ( + fastq_validator._ungrouped_files.sort() + == [ + "B001A001_1.fastq", # no lane, read_type, or set + "B001A001_R1.fq", # no lane or set + "B001A001_I1.fastq.gz", # no lane or set + "H4L1-4_S64_R1_001.fastq.gz", # no lane + "H4L1-4_S64_L001_001.fastq.gz", # no read_type + "H4L1-4_S64_L001_R1.fastq.gz", # no set + "L001_H4L1-4_S64_R1_001.fastq.gz", # out of order + ].sort() + ) From c65e7ac4872c5e4e1e3c33c6f987e233333c9d5e Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Wed, 23 Oct 2024 15:25:52 -0400 Subject: [PATCH 06/12] documentation --- .../fastq_validator_logic.py | 26 +++++++++++-------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/src/ingest_validation_tests/fastq_validator_logic.py b/src/ingest_validation_tests/fastq_validator_logic.py index 43fd43e..28ec57c 100644 --- a/src/ingest_validation_tests/fastq_validator_logic.py +++ b/src/ingest_validation_tests/fastq_validator_logic.py @@ -21,20 +21,24 @@ def is_valid_filename(filename: str) -> bool: def get_prefix_read_type_and_set(filename: str) -> Optional[filename_pattern]: """ - PREFIX (?P.*(?:L\\d+) - - (?P | named capture group "prefix" - - .*(?:L\\d+) | match anything before pattern L# (where # represents 1 or more digits) - - only capture if read_type found - READ_TYPE - - (?=_ | assert that the character "_" is present at the beginning of this next group - - (?:(?PR|read(?=[123])|I(?=[12]))) | only capture above match if followed by the sequence _[R1,R2,R3,read1,read2,read3,I1,I2]; capture R/I/read as read_type - SET_NUM - - (?:\\d_ | ensure presence of "#_" before group, ignore characters - - (?P\\d+) | capture group set_num of 1 or more digits + Regex explanation: + PREFIX + - (?P | named capture group "prefix" + - .*(?:L\\d+) | match anything before pattern L# (where # represents 1 or more digits) + - only capture if read_type and set_num found + READ_TYPE + - (?=_ | assert that the character "_" is present at the beginning of this next group + - (?:(?PR|read(?=[123])|I(?=[12]))) | only capture above match if followed by the sequence _[R1,R2,R3,read1,read2,read3,I1,I2]; capture R/I/read as read_type + SET_NUM + - (?:\\d_ | ensure presence of "#_" before group, ignore characters + - (?P\\d+) | capture group set_num of 1 or more digits + Note: if set_num needs to be optional, could change this portion to (?:\\d_?(?P\\d+)|) """ if not bool(fastq_utils.FASTQ_PATTERN.fullmatch(filename)): + # looking for fastq filenames matching pattern + # __#_ + # e.g. arbitrary_string_L001_R1_001.fastq return - # looking for fastq filenames matching pattern __[I1,I2,R1,R2,R3]_ pattern = re.compile( r"(?P.*(?:L\d+)(?=_(?PR|read(?=[123])|I(?=[12]))(?:\d_(?P\d+))))" ) From d621ac27c3593283fa9349944428bc11a092fd68 Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Wed, 23 Oct 2024 15:34:35 -0400 Subject: [PATCH 07/12] maybe satisfying linter --- tests/test_fastq_validator_logic.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_fastq_validator_logic.py b/tests/test_fastq_validator_logic.py index 856f421..823d915 100644 --- a/tests/test_fastq_validator_logic.py +++ b/tests/test_fastq_validator_logic.py @@ -2,7 +2,6 @@ from pathlib import Path, PosixPath from typing import TextIO -import fastq_utils import pytest from src.ingest_validation_tests.fastq_validator_logic import ( From 2dfec6b9f8470ef0eb7951e40cefb625b6b23d90 Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Wed, 23 Oct 2024 17:00:55 -0400 Subject: [PATCH 08/12] works to dedupe errors but ugly --- .../fastq_validator_logic.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/ingest_validation_tests/fastq_validator_logic.py b/src/ingest_validation_tests/fastq_validator_logic.py index 28ec57c..34f7af0 100644 --- a/src/ingest_validation_tests/fastq_validator_logic.py +++ b/src/ingest_validation_tests/fastq_validator_logic.py @@ -240,7 +240,8 @@ def validate_fastq_files_in_path(self, paths: List[Path], threads: int) -> None: self.file_list.extend(files) _log(f"FASTQValidatorLogic: Added files from {path} to file_list: {files}") self.files_were_found = bool(self.file_list) - self._find_duplicates() + if dupes := self._find_duplicates(): + [self.errors.append(dupe) for dupe in dupes if dupe not in self.errors] _log(f"File list: {self.file_list}") with Manager() as manager: lock = manager.Lock() @@ -252,7 +253,6 @@ def validate_fastq_files_in_path(self, paths: List[Path], threads: int) -> None: logging.info(self._printable_filenames(self.file_list)) engine = Engine(self) data_output = pool.imap_unordered(engine, self.file_list) - [self.errors.extend(output) for output in data_output if output] except Exception as e: pool.close() pool.join() @@ -261,27 +261,35 @@ def validate_fastq_files_in_path(self, paths: List[Path], threads: int) -> None: else: pool.close() pool.join() + compiled = set([errors for error_list in data_output for errors in error_list]) + [ + self.errors.append(output) + for output in compiled + if output and output not in self.errors + ] groups = self._make_groups() self._find_counts(groups, lock) if self._ungrouped_files: _log(f"Ungrouped files, counts not checked: {self._ungrouped_files}") - def _find_duplicates(self) -> None: + def _find_duplicates(self) -> list[str]: """ Transforms data from {path: [filepaths]} to {filepath.name: [paths]} to ensure that each filename only appears once in an upload """ files_per_path = defaultdict(list) + errors = [] for filepath in self.file_list: files_per_path[filepath.name].append(filepath.parents[0]) for filename, filepaths in files_per_path.items(): if len(filepaths) > 1: - self.errors.append( + errors.append( _log( f"{filename} has been found multiple times during this validation. " f"Locations of duplicates: {str(filepaths)}." ) ) + return errors def _find_counts(self, groups: dict[filename_pattern, list[Path]], lock): with lock: From 6f96b9be411f83e766ca8af7dcfc3ca4fc1d2b8a Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Thu, 24 Oct 2024 15:45:12 -0400 Subject: [PATCH 09/12] fixing duplicate error issue --- .../fastq_validator_logic.py | 62 +++++++++---------- 1 file changed, 29 insertions(+), 33 deletions(-) diff --git a/src/ingest_validation_tests/fastq_validator_logic.py b/src/ingest_validation_tests/fastq_validator_logic.py index 34f7af0..3c917db 100644 --- a/src/ingest_validation_tests/fastq_validator_logic.py +++ b/src/ingest_validation_tests/fastq_validator_logic.py @@ -21,23 +21,25 @@ def is_valid_filename(filename: str) -> bool: def get_prefix_read_type_and_set(filename: str) -> Optional[filename_pattern]: """ - Regex explanation: + Looking for fastq filenames matching pattern: + __#_ + e.g. arbitrary_string_L001_R1_001.fastq + + Regex documentation: PREFIX - (?P | named capture group "prefix" - - .*(?:L\\d+) | match anything before pattern L# (where # represents 1 or more digits) - - only capture if read_type and set_num found + - .*(?:L\\d+) | match anything before and including pattern L# (where # represents 1 or more digits) + - only capture prefix if read_type and set_num found READ_TYPE - (?=_ | assert that the character "_" is present at the beginning of this next group - - (?:(?PR|read(?=[123])|I(?=[12]))) | only capture above match if followed by the sequence _[R1,R2,R3,read1,read2,read3,I1,I2]; capture R/I/read as read_type + - (?:(?PR|read(?=[123])|I(?=[12]))) | only capture above match if followed by the + sequence _[R1,R2,R3,read1,read2,read3,I1,I2]; capture R/I/read as read_type, discarding digit SET_NUM - (?:\\d_ | ensure presence of "#_" before group, ignore characters - (?P\\d+) | capture group set_num of 1 or more digits Note: if set_num needs to be optional, could change this portion to (?:\\d_?(?P\\d+)|) """ if not bool(fastq_utils.FASTQ_PATTERN.fullmatch(filename)): - # looking for fastq filenames matching pattern - # __#_ - # e.g. arbitrary_string_L001_R1_001.fastq return pattern = re.compile( r"(?P.*(?:L\d+)(?=_(?PR|read(?=[123])|I(?=[12]))(?:\d_(?P\d+))))" @@ -149,18 +151,6 @@ def _validate_fastq_line_4(self, line: str) -> List[str]: ) return errors - def _make_groups(self) -> dict[filename_pattern, list[Path]]: - groups = defaultdict(list) - for file in self.file_list: - potential_match = get_prefix_read_type_and_set(file.name) - if potential_match: - groups[potential_match].append(file) - else: - self._ungrouped_files.append(file) - for group in groups.values(): - group.sort() - return groups - _VALIDATE_FASTQ_LINE_METHODS = { 1: _validate_fastq_line_1, 2: _validate_fastq_line_2, @@ -240,9 +230,7 @@ def validate_fastq_files_in_path(self, paths: List[Path], threads: int) -> None: self.file_list.extend(files) _log(f"FASTQValidatorLogic: Added files from {path} to file_list: {files}") self.files_were_found = bool(self.file_list) - if dupes := self._find_duplicates(): - [self.errors.append(dupe) for dupe in dupes if dupe not in self.errors] - _log(f"File list: {self.file_list}") + data_found_one = [] with Manager() as manager: lock = manager.Lock() pool = Pool(threads) @@ -250,9 +238,10 @@ def validate_fastq_files_in_path(self, paths: List[Path], threads: int) -> None: logging.info( f"Passing file list for paths {self._printable_filenames(paths, newlines=False)} to engine. File list:" ) - logging.info(self._printable_filenames(self.file_list)) + logging.info(self._printable_filenames(self.file_list, newlines=True)) engine = Engine(self) data_output = pool.imap_unordered(engine, self.file_list) + [data_found_one.extend(output) for output in data_output if output] except Exception as e: pool.close() pool.join() @@ -261,35 +250,42 @@ def validate_fastq_files_in_path(self, paths: List[Path], threads: int) -> None: else: pool.close() pool.join() - compiled = set([errors for error_list in data_output for errors in error_list]) - [ - self.errors.append(output) - for output in compiled - if output and output not in self.errors - ] + self._find_duplicates() groups = self._make_groups() self._find_counts(groups, lock) if self._ungrouped_files: _log(f"Ungrouped files, counts not checked: {self._ungrouped_files}") + if len(data_found_one) > 0: + self.errors.extend(data_found_one) - def _find_duplicates(self) -> list[str]: + def _make_groups(self) -> dict[filename_pattern, list[Path]]: + groups = defaultdict(list) + for file in self.file_list: + potential_match = get_prefix_read_type_and_set(file.name) + if potential_match: + groups[potential_match].append(file) + else: + self._ungrouped_files.append(file) + for group in groups.values(): + group.sort() + return groups + + def _find_duplicates(self): """ Transforms data from {path: [filepaths]} to {filepath.name: [paths]} to ensure that each filename only appears once in an upload """ files_per_path = defaultdict(list) - errors = [] for filepath in self.file_list: files_per_path[filepath.name].append(filepath.parents[0]) for filename, filepaths in files_per_path.items(): if len(filepaths) > 1: - errors.append( + self.errors.append( _log( f"{filename} has been found multiple times during this validation. " f"Locations of duplicates: {str(filepaths)}." ) ) - return errors def _find_counts(self, groups: dict[filename_pattern, list[Path]], lock): with lock: From 32f3f369ec20b5069b310fe8eb0b8f3b1bd3f5f6 Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Thu, 24 Oct 2024 16:41:29 -0400 Subject: [PATCH 10/12] testing fix to tests --- tests/test_fastq_validator_logic.py | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/tests/test_fastq_validator_logic.py b/tests/test_fastq_validator_logic.py index 823d915..bc55920 100644 --- a/tests/test_fastq_validator_logic.py +++ b/tests/test_fastq_validator_logic.py @@ -1,4 +1,5 @@ import gzip +from operator import attrgetter from pathlib import Path, PosixPath from typing import TextIO @@ -324,14 +325,25 @@ def test_filename_valid_and_fastq_valid_but_not_grouped(self, fastq_validator, t for file in fastq_validator.file_list if get_prefix_read_type_and_set(str(file)) is not None ] - assert valid_filename_patterns == [ - filename_pattern(prefix=f"{tmp_path}/H4L1-4_S64_L001", read_type="R", set_num="001"), - filename_pattern(prefix=f"{tmp_path}/H4L1-4_S64_L001", read_type="R", set_num="001"), - filename_pattern(prefix=f"{tmp_path}/H4L1-4_S64_L001", read_type="I", set_num="001"), - filename_pattern( - prefix=f"{tmp_path}/Undetermined_S0_L001", read_type="R", set_num="001" - ), - ] + assert sorted( + valid_filename_patterns, key=attrgetter("prefix", "read_type", "set_num") + ) == sorted( + [ + filename_pattern( + prefix=f"{tmp_path}/H4L1-4_S64_L001", read_type="R", set_num="001" + ), + filename_pattern( + prefix=f"{tmp_path}/H4L1-4_S64_L001", read_type="R", set_num="001" + ), + filename_pattern( + prefix=f"{tmp_path}/H4L1-4_S64_L001", read_type="I", set_num="001" + ), + filename_pattern( + prefix=f"{tmp_path}/Undetermined_S0_L001", read_type="R", set_num="001" + ), + ], + key=attrgetter("prefix", "read_type", "set_num"), + ) assert ( fastq_validator._ungrouped_files.sort() == [ From aeaeeae277173e4901112efac15a967c73655585 Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Thu, 24 Oct 2024 16:54:35 -0400 Subject: [PATCH 11/12] oops forgot to revert thread_count thing --- src/ingest_validation_tests/fastq_validator.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/ingest_validation_tests/fastq_validator.py b/src/ingest_validation_tests/fastq_validator.py index 1178e83..072c1d6 100644 --- a/src/ingest_validation_tests/fastq_validator.py +++ b/src/ingest_validation_tests/fastq_validator.py @@ -1,3 +1,4 @@ +from os import cpu_count from typing import List, Optional from fastq_validator_logic import FASTQValidatorLogic, _log @@ -8,14 +9,10 @@ class FASTQValidator(Validator): description = "Check FASTQ files for basic syntax and consistency." cost = 15.0 version = "1.0" - # need to add to parent validator class in IVT, hack for now - thread_count = None def collect_errors(self, **kwargs) -> List[Optional[str]]: - del kwargs - if not self.thread_count: - self.thread_count = 1 - _log(f"Threading at FastQValidator with {self.thread_count}") + threads = kwargs.get("coreuse", None) or cpu_count() // 4 or 1 + _log(f"Threading at FastQValidator with {threads}") validator = FASTQValidatorLogic(verbose=True) validator.validate_fastq_files_in_path(self.paths, self.thread_count) if validator.errors: From 93e76981bb39847381fed0e83b7ec0ff8ab2ed4e Mon Sep 17 00:00:00 2001 From: Gesina Phillips Date: Thu, 24 Oct 2024 16:55:06 -0400 Subject: [PATCH 12/12] oops forgot to revert thread_count thing 2 --- src/ingest_validation_tests/fastq_validator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ingest_validation_tests/fastq_validator.py b/src/ingest_validation_tests/fastq_validator.py index 072c1d6..b30ca75 100644 --- a/src/ingest_validation_tests/fastq_validator.py +++ b/src/ingest_validation_tests/fastq_validator.py @@ -14,7 +14,7 @@ def collect_errors(self, **kwargs) -> List[Optional[str]]: threads = kwargs.get("coreuse", None) or cpu_count() // 4 or 1 _log(f"Threading at FastQValidator with {threads}") validator = FASTQValidatorLogic(verbose=True) - validator.validate_fastq_files_in_path(self.paths, self.thread_count) + validator.validate_fastq_files_in_path(self.paths, threads) if validator.errors: return validator.errors elif validator.files_were_found: