diff --git a/cicd/gitlab/parts/python.gitlab-ci.yml b/cicd/gitlab/parts/python.gitlab-ci.yml index 9b74da395..149312a16 100644 --- a/cicd/gitlab/parts/python.gitlab-ci.yml +++ b/cicd/gitlab/parts/python.gitlab-ci.yml @@ -96,6 +96,8 @@ python:black: python:pytest: extends: .python:test script: + - apt-get update -y + - apt-get install -y sqlite3 - coverage run -m pytest --junitxml=$RUN_DIR/junit_reports.xml artifacts: when: always diff --git a/pyproject.toml b/pyproject.toml index d716eb487..391367e79 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,7 +52,7 @@ dependencies = [ "bcbio-gff == 0.7.1", "biopython >= 1.81", "ensembl-py >= 2.1.2", - "ensembl-utils >= 0.4.4", + "ensembl-utils >= 0.5.0", "jsonschema >= 4.6.0", "intervaltree >= 3.1.0", "mysql-connector-python >= 8.0.29", @@ -176,6 +176,7 @@ disable = [ "too-few-public-methods", "too-many-arguments", "too-many-locals", + "too-many-positional-arguments", "too-many-statements", "unspecified-encoding", "wildcard-import", diff --git a/src/python/ensembl/io/genomio/assembly/status.py b/src/python/ensembl/io/genomio/assembly/status.py index 722ce2ead..92ab98e05 100644 --- a/src/python/ensembl/io/genomio/assembly/status.py +++ b/src/python/ensembl/io/genomio/assembly/status.py @@ -15,31 +15,30 @@ """Obtain and record the assembly status for a set of INSDC accession(s) using NCBI's datasets CLI tool.""" __all__ = [ - "check_parameterization", - "datasets_asm_reports", "extract_assembly_metadata", - "fetch_accessions_from_cores", + "fetch_datasets_reports", + "fetch_accessions_from_core_dbs", "generate_report_tsv", - "resolve_query_type", + "get_assembly_accessions", "singularity_image_setter", ] import csv +from dataclasses import dataclass import json import logging import os -from os import PathLike from pathlib import Path import re -from typing import Dict, List, Tuple, Union from spython.main import Client from sqlalchemy.engine import URL from sqlalchemy import text from 
ensembl.io.genomio.utils.json_utils import print_json -from ensembl.io.genomio.database.dbconnection_lite import DBConnectionLite as dbc +from ensembl.utils import StrPath from ensembl.utils.argparse import ArgumentParser +from ensembl.utils.database import DBConnection from ensembl.utils.logging import init_logging_with_args @@ -52,37 +51,55 @@ class UnsupportedFormatError(Exception): """When a string does not have the expected format.""" -class ReportStructure(dict): - """Dict setter class of key report meta information""" - - def __init__(self) -> None: - dict.__init__(self) - self.update( - { - "Species Name": "", - "Taxon ID": "", - "Isolate/Strain": "", - "Asm name": "", - "Assembly type": "", - "Asm accession": "", - "Paired assembly": "", - "Asm last updated": "", - "Asm status": "", - "Asm notes": "", - } - ) - - -def singularity_image_setter(sif_cache_dir: Path, datasets_version: str) -> Client: - """Parse ENV and User specified variables related to 'datasets' singularity SIF +@dataclass +class ReportStructure: + """Stores key report meta information.""" + + species_name: str = "" + taxon_id: int = 0 + strain: str = "NA" + assembly_name: str = "" + assembly_type: str = "" + accession: str = "" + paired_assembly: str = "NA" + last_updated: str = "" + assembly_status: str = "NA" + assembly_notes: str = "NA" + + def to_dict(self) -> dict[str, str]: + """Returns a dictionary representation of this object.""" + return { + "Species Name": self.species_name, + "Taxon ID": str(self.taxon_id), + "Isolate/Strain": self.strain, + "Asm name": self.assembly_name, + "Assembly type": self.assembly_type, + "Asm accession": self.accession, + "Paired assembly": self.paired_assembly, + "Asm last updated": self.last_updated, + "Asm status": self.assembly_status, + "Asm notes": self.assembly_notes, + } + + def header(self) -> list[str]: + """Returns the dictionary keys matching each of the properties of the report.""" + return list(self.to_dict().keys()) + + def values(self) 
-> list[str]: + """Returns the values of each of the properties of the report.""" + return list(self.to_dict().values()) + + +def singularity_image_setter(sif_cache_dir: Path | None, datasets_version: str | None) -> Client: + """Parse ENV and User specified variables related to `datasets` singularity SIF container and define version and location of container. Args: sif_cache_dir: Path to locate existing, or download new SIF container image. - datasets_version: URL of singularity container (custom 'datasets' version if desired) + datasets_version: URL of singularity container (custom `datasets` version if desired). Returns: - `spython.main.client` instance of singularity container image housing 'datasets'. + `spython.main.client` instance of singularity container image housing `datasets`. """ # Set singularity cache dir from user defined path or use environment @@ -117,117 +134,90 @@ def singularity_image_setter(sif_cache_dir: Path, datasets_version: str) -> Clie return datasets_image -def check_parameterization(input_cores: Path, input_accessions: Path, db_host: str, db_port: int) -> Path: - """Detect the kind of user input (cores/accessions) and determine any missing or - incorrect parameterization. +def get_assembly_accessions(src_file: StrPath) -> list[str]: + """Returns the list of assembly accessions found in the provided file. Args: - input_cores: Input core(s) list file name. - input_accessions: Input accession (s) list file name. - db_host: Host server name - db_port: Host server port + src_file: Path to file with one line per INSDC assembly accession. - Returns: - User input file used in assembly status querying + Raises: + UnsupportedFormatError: If an accession does not match the INSDC assembly accession format. 
""" - # Input core names centered run - if input_cores: - logging.info(f"Performing assembly status report using core db list file: {input_cores}") - if db_host is None or db_port is None: - raise RuntimeError("Core database names require both arguments '--host' and '--port'") - return input_cores - # Accession centered run - logging.info(f"Performing assembly status report using INSDC accession list file: {input_accessions}") - return input_accessions - - -def resolve_query_type( - query_list: list, partial_url: URL, input_cores: str, input_accessions: str -) -> Union[Tuple[Dict, str]]: - """Function to identify the kind of queries being passed by user, - then extract the queries (core names or accessions) and store each with appropriate identifier. - - Args: - query_list: List of user defined queries either core names, or accessions - partial_url: A partial MYSQL connection URL (host:port) - input_cores: Arg parse param '--input_cores' - input_accessions: Arg parse param '--input_accessions' - - Returns: - User queries stored as identifier[(core db name | UniqueID#)] : accession - """ - - query_accessions: Dict = {} - query_type: str = "" - - if input_cores and input_accessions is None: - query_accessions = fetch_accessions_from_cores(query_list, partial_url) - query_type = "CoreDB" - elif input_cores is None and input_accessions: - query_type = "Accession" - for accession in query_list: - match = re.match(r"(GC[AF])_([0-9]{3})([0-9]{3})([0-9]{3})\.?([0-9]+)", accession) + query_accessions: list[str] = [] + with Path(src_file).open(mode="r") as fin: + for line in fin.readlines(): + line = line.strip() + match = re.match(r"^GC[AF]_[0-9]{9}\.[1-9][0-9]*$", line) if not match: - raise UnsupportedFormatError(f"Could not recognize GCA accession format: {accession}") - query_accessions[accession] = accession + raise UnsupportedFormatError(f"Could not recognize GCA/GCF accession format: {line}") + query_accessions.append(line) + return query_accessions - return 
query_accessions, query_type +def fetch_accessions_from_core_dbs(src_file: StrPath, server_url: URL) -> dict[str, str]: + """Obtain the associated INSDC accession given a set of core database names and a database server URL. -def fetch_accessions_from_cores(database_names: List, connection_url: URL) -> Dict: - """Obtain the associated INSDC accession [meta.assembly.accession] given a set of core(s) names - and a MYSQL server host. + The accession information is obtained from the `meta` table's meta key `assembly.accession`. Args: - database_names: Set of names for one or more core databases - connection_url: Partial MYSQL host name : port + src_file: File path with list of core database names. + server_url: Database server URL. Returns: - Dict of core name(s) (key) and its INSDC assembly.accession (value) + Dict of core database names (key) and their corresponding INSDC assembly accession (value). """ core_accn_meta = {} - core_list_count = len(database_names) + database_count = 0 count_accn_found = 0 - for core in database_names: - db_connection_url = connection_url.set(database=core) - db_connection = dbc(f"{db_connection_url}") - with db_connection.connect() as conn: - query_result = conn.execute( - text('SELECT meta_value FROM meta WHERE meta_key = "assembly.accession";') - ).fetchall() - - if query_result is None: - logging.warning(f"No accessions found in core: {core}") - elif len(query_result) == 1: - count_accn_found += 1 - asm_accession = query_result.pop()[0] - logging.info(f"{core} -> assembly.accession[{asm_accession}]") - core_accn_meta[core] = asm_accession - else: - logging.warning(f"Core {core} has {len(query_result)} assembly.accessions") + with Path(src_file).open("r") as fin: + for line in fin.readlines(): + core_db = line.strip() + database_count += 1 + db_connection_url = server_url.set(database=core_db) + db_connection = DBConnection(db_connection_url) + with db_connection.begin() as conn: + query_result = conn.execute( + text('SELECT 
meta_value FROM meta WHERE meta_key = "assembly.accession";') + ).fetchall() + + if not query_result: + logging.warning(f"No accessions found in core: {core_db}") + elif len(query_result) == 1: + count_accn_found += 1 + asm_accession = query_result.pop()[0] + logging.info(f"{core_db} -> assembly.accession[{asm_accession}]") + core_accn_meta[core_db] = asm_accession + else: + logging.warning(f"Core {core_db} has {len(query_result)} assembly.accessions") - logging.info(f"From initial input cores ({core_list_count}), obtained ({count_accn_found}) accessions") + logging.info( + f"From initial input core databases ({database_count}), obtained ({count_accn_found}) accessions" + ) return core_accn_meta -def datasets_asm_reports( - sif_image: str, assembly_accessions: dict, download_directory: PathLike, batch_size: int -) -> Dict: - """Obtain assembly report(s) JSONs for one or more queries made to datasets CLI. +def fetch_datasets_reports( + sif_image: Client, assembly_accessions: dict[str, str], download_directory: StrPath, batch_size: int +) -> dict[str, dict]: + """Obtain assembly reports in JSON format for each assembly accession via `datasets` CLI. Args: - sif_image: Instance of Client.loaded singularity image. - assembly_accessions: Dict of core accessions. - download_directory: Dir path to store assembly report JSON files. - batch_size: Number of assembly accessions to batch submit to 'datasets'. + sif_image: Instance of `Client.loaded()` singularity image. + assembly_accessions: Dictionary of accession source <> assembly accessions pairs. + download_directory: Directory path to store assembly report JSON files. + batch_size: Number of assembly accessions to batch submit to `datasets`. Returns: - Dictionary of core name and its associated assembly report - """ + Dictionary of accession source and its associated assembly report. + + Raises: + ValueError: If result returned by `datasets` is not a string. + RuntimeError: If there was an error raised by `datasets`. 
+ """ master_accn_list = list(assembly_accessions.values()) combined_asm_reports = {} @@ -235,70 +225,66 @@ def datasets_asm_reports( list_split = list(range(0, len(master_accn_list), batch_size)) accn_subsample = [master_accn_list[ind : ind + batch_size] for ind in list_split] + datasets_command = ["datasets", "summary", "genome", "accession"] for accessions in accn_subsample: - datasets_command = ["datasets", "summary", "genome", "accession"] + accessions - - # Make call to singularity datasets providing a multi-accession query: + # Make call to singularity datasets providing a multi-accession query client_return = Client.execute( - image=sif_image, command=datasets_command, return_result=True, quiet=True + image=sif_image, command=datasets_command + accessions, return_result=True, quiet=True ) - raw_result = client_return["message"] ## Test what result we have obtained following execution of sif image and accession value + # Returned a list, i.e. datasets returned a result to client.execute # Returned a str, i.e. no datasets result obtained exited with fatal error if isinstance(raw_result, list): result = raw_result[0] else: result = raw_result - if not isinstance(result, str): - raise ValueError("Result obtained from datasets is not the expected format 'string'") + raise ValueError("Result obtained from datasets is not a string") if re.search("^FATAL", result): raise RuntimeError(f"Singularity image execution failed! -> '{result.strip()}'") - # Returned a list, i.e. 
datasets returned a result to client.execute tmp_asm_dict = json.loads(result) - if tmp_asm_dict["total_count"] >= 1: - logging.info(f"Assembly report obtained for accession(s) {accessions}") - - batch_reports_json = tmp_asm_dict["reports"] - for assembly_report in batch_reports_json: - accession = assembly_report["accession"] - asm_json_outfile = Path(download_directory, f"{accession}.asm_report.json") - print_json(asm_json_outfile, assembly_report) - # Save assembly report into master core<>report dict - for core, accession_core in assembly_accessions.items(): - if accession == accession_core: - combined_asm_reports[core] = assembly_report - else: - logging.warning(f"No assembly report found for accession(s) {accessions}. Exiting !") + if not tmp_asm_dict["total_count"]: + logging.warning(f"No assembly report found for accession(s) {accessions}") + continue + + logging.info(f"Assembly report obtained for accession(s) {accessions}") + batch_reports_json = tmp_asm_dict["reports"] + for assembly_report in batch_reports_json: + accession = assembly_report["accession"] + asm_json_outfile = Path(download_directory, f"{accession}.asm_report.json") + print_json(asm_json_outfile, assembly_report) + # Save assembly report into source key<>report dict + for src_key, accession_core in assembly_accessions.items(): + if accession == accession_core: + combined_asm_reports[src_key] = assembly_report return combined_asm_reports -def extract_assembly_metadata(assembly_reports: Dict[str, dict]) -> Dict[str, ReportStructure]: - """Function to parse assembly reports and extract specific key information on - status and related fields. +def extract_assembly_metadata(assembly_reports: dict[str, dict]) -> dict[str, ReportStructure]: + """Parse assembly reports and extract specific key information on status and related fields. Args: - assembly_reports: Key value pair of core_name : assembly report. + assembly_reports: Key value pair of source name <> assembly report. 
Returns: - Parsed assembly report meta (core, meta). + Parsed assembly report meta (source, meta). """ parsed_meta = {} - for core, asm_report in assembly_reports.items(): + for source, asm_report in assembly_reports.items(): asm_meta_info = ReportStructure() # Mandatory meta key parsing: - asm_meta_info["Asm accession"] = asm_report["accession"] - asm_meta_info["Asm name"] = asm_report["assembly_info"]["assembly_name"] - asm_meta_info["Assembly type"] = asm_report["assembly_info"]["assembly_type"] - asm_meta_info["Asm status"] = asm_report["assembly_info"]["assembly_status"] - asm_meta_info["Species Name"] = asm_report["organism"]["organism_name"] - asm_meta_info["Taxon ID"] = asm_report["organism"]["tax_id"] + asm_meta_info.accession = asm_report["accession"] + asm_meta_info.assembly_name = asm_report["assembly_info"]["assembly_name"] + asm_meta_info.assembly_type = asm_report["assembly_info"]["assembly_type"] + asm_meta_info.assembly_status = asm_report["assembly_info"]["assembly_status"] + asm_meta_info.species_name = asm_report["organism"]["organism_name"] + asm_meta_info.taxon_id = int(asm_report["organism"]["tax_id"]) ## Non-mandatory meta key parsing: assembly_meta_keys = asm_report["assembly_info"].keys() @@ -307,164 +293,123 @@ def extract_assembly_metadata(assembly_reports: Dict[str, dict]) -> Dict[str, Re # check for genome_notes: if "genome_notes" in assembly_meta_keys: complete_notes = ", ".join(asm_report["assembly_info"]["genome_notes"]) - asm_meta_info["Asm notes"] = complete_notes - else: - asm_meta_info["Asm notes"] = "NA" + asm_meta_info.assembly_notes = complete_notes # check for biosample: if "biosample" in assembly_meta_keys: - asm_meta_info["Asm last updated"] = asm_report["assembly_info"]["biosample"]["last_updated"] - else: - asm_meta_info["Asm last updated"] = "NA" + asm_meta_info.last_updated = asm_report["assembly_info"]["biosample"]["last_updated"] # check for paired assembly: if "paired_assembly" in assembly_meta_keys: - 
asm_meta_info["Paired assembly"] = asm_report["assembly_info"]["paired_assembly"]["accession"] - else: - asm_meta_info["Paired assembly"] = "NA" + asm_meta_info.paired_assembly = asm_report["assembly_info"]["paired_assembly"]["accession"] # check for isolate/strain type: if "infraspecific_names" in organism_keys: organism_type_keys = asm_report["organism"]["infraspecific_names"].keys() if "isolate" in organism_type_keys: - asm_meta_info["Isolate/Strain"] = asm_report["organism"]["infraspecific_names"]["isolate"] + asm_meta_info.strain = asm_report["organism"]["infraspecific_names"]["isolate"] elif "strain" in organism_type_keys: - asm_meta_info["Isolate/Strain"] = asm_report["organism"]["infraspecific_names"]["strain"] - else: - asm_meta_info["Isolate/Strain"] = "NA" - else: - asm_meta_info["Isolate/Strain"] = "NA" + asm_meta_info.strain = asm_report["organism"]["infraspecific_names"]["strain"] - parsed_meta[core] = asm_meta_info + parsed_meta[source] = asm_meta_info return parsed_meta def generate_report_tsv( - parsed_asm_reports: Dict, - outfile_prefix: str, + parsed_asm_reports: dict[str, ReportStructure], query_type: str, - output_directory: PathLike = Path(), + output_directory: StrPath = Path(), + outfile_name: str = "AssemblyStatusReport", ) -> None: """Generate and write the assembly report to a TSV file. Args: - parsed_asm_reports: Parsed assembly report meta - output_directory: Path to directory where output TSV is stored. - query_type: Type of query core_db or accession - output_directory: Directory to store report TSV + parsed_asm_reports: Parsed assembly report meta. + query_type: Type of query (either core databases or accessions). + output_directory: Directory to store report TSV file. + outfile_name: Name to give to the output TSV file. 
""" + tsv_outfile = Path(output_directory, f"{outfile_name}.tsv") - if not parsed_asm_reports: - return - - tsv_outfile = Path(output_directory, f"{outfile_prefix}.tsv") - - header_list = list(ReportStructure().keys()) - header_list = [query_type] + header_list + header_list = next(iter(parsed_asm_reports.values())).header() + header_list = [query_type.capitalize().replace("_", " ")] + header_list with open(tsv_outfile, "w+") as tsv_out: writer = csv.writer(tsv_out, delimiter="\t", lineterminator="\n") writer.writerow(header_list) - for core, report_meta in parsed_asm_reports.items(): - final_asm_report = [core] + list(report_meta.values()) + final_asm_report = [core] + report_meta.values() writer.writerow(final_asm_report) - tsv_out.close() def main() -> None: """Module's entry-point.""" parser = ArgumentParser(description=__doc__) - input_group = parser.add_mutually_exclusive_group(required=True) - input_group.add_argument( - "--input_cores", - type=Path, - required=False, - default=None, - help="List of ensembl core database(s) names to retrieve query accessions", - ) - input_group.add_argument( - "--input_accessions", - type=Path, - required=False, - default=None, - help="List of assembly INSDC query accessions", - ) - parser.add_argument_dst_path( - "--download_dir", - default="Assembly_report_jsons", - help="Folder where the assembly report JSON file(s) are stored", - ) - parser.add_argument_dst_path( - "--assembly_report_prefix", - default="AssemblyStatusReport", - help="Prefix used in assembly report TSV output file.", - ) - parser.add_argument( - "--host", - type=str, - required=False, - help="Server hostname (fmt: mysql-ens-XXXXX-YY); required with '--input_cores'", - ) - parser.add_argument( - "--port", type=str, required=False, help="Server port (fmt: 1234); required with '--input_cores'" - ) - parser.add_argument( - "--datasets_version_url", - type=str, - required=False, - metavar="URL", - help="datasets version: E.g. 
docker://ensemblorg/datasets-cli:latest", - ) - parser.add_argument( - "--cache_dir", - type=Path, - required=False, - default="$NXF_SINGULARITY_CACHEDIR", - metavar="SINGULARITY_CACHE", - help="Custom path to user generated singularity container housing ncbi tool 'datasets'", + subparsers = parser.add_subparsers(title="report assembly status from", required=True, dest="src") + # Specific arguments required when using Ensembl core database names as source + core_db_parser = subparsers.add_parser("core_db", help="list of Ensembl core databases") + core_db_parser.add_argument_src_path( + "--input", + required=True, + help="file path with list of Ensembl core database(s) to retrieve query accessions from", ) - parser.add_argument( - "--datasets_batch_size", - type=int, - required=False, - default=100, - metavar="BATCH_SIZE", - help="Number of accessions requested in one query to datasets", + core_db_parser.add_server_arguments() + # Specific arguments required when using assembly accessions as source + accessions_parser = subparsers.add_parser("accession", help="list of INSDC accessions") + accessions_parser.add_argument_src_path( + "--input", required=True, help="file path with list of assembly INSDC query accessions" ) + # Add common arguments to both subparsers to avoid forcing users to add common arguments before + # the sub-command, e.g. `status.py --cache_dir accession --input ` + for subparser in [core_db_parser, accessions_parser]: + subparser.add_argument_dst_path( + "--output_file", + default=Path("assembly_report_jsons"), + help="path to folder where the assembly report JSON files is stored", + ) + subparser.add_argument( + "--assembly_report_name", + metavar="NAME", + default="AssemblyStatusReport", + help="file name used for the assembly report TSV output file", + ) + subparser.add_argument( + "--datasets_version_url", + type=str, + metavar="URL", + help="datasets version, e.g. 
docker://ensemblorg/datasets-cli:latest", + ) + subparser.add_argument_src_path( + "--cache_dir", + default=Path(os.environ.get("NXF_SINGULARITY_CACHEDIR", "")), + metavar="SINGULARITY_CACHE", + help="folder path to user generated singularity container housing NCBI tool 'datasets'", + ) + subparser.add_numeric_argument( + "--datasets_batch_size", + type=int, + min_value=1, + default=100, + metavar="BATCH_SIZE", + help="number of accessions requested in one query to datasets", + ) + subparser.add_log_arguments(add_log_file=True) - parser.add_log_arguments(add_log_file=True) args = parser.parse_args() - init_logging_with_args(args) - # Set and create dir for download files - args.download_dir.mkdir(parents=True, exist_ok=True) - - # Set input file and determine if proper parameterization options are defined. - user_query_file = check_parameterization(args.input_cores, args.input_accessions, args.host, args.port) - - ## Parse and store cores/accessions from user input query file - with user_query_file.open(mode="r") as f: - query_list = f.read().splitlines() + # Get accessions on cores list or use user accession list directly + if args.src == "core_db": + query_accessions = fetch_accessions_from_core_dbs(args.input, args.url) + else: + query_accessions = {x: x for x in get_assembly_accessions(args.input)} - ## Parse singularity setting and define the SIF image for 'datasets' + # Parse singularity setting and define the SIF image for 'datasets' datasets_image = singularity_image_setter(args.cache_dir, args.datasets_version_url) - ## Get accessions on cores list or use user accession list directly - connection_url = URL.create( - "mysql", - host=args.host, - port=args.port, - username="ensro", - ) - query_accessions, query_type = resolve_query_type( - query_list, connection_url, args.input_cores, args.input_accessions - ) - # Datasets query implementation for one or more batched accessions - assembly_reports = datasets_asm_reports( + assembly_reports = 
fetch_datasets_reports( datasets_image, query_accessions, args.download_dir, args.datasets_batch_size ) @@ -472,4 +417,4 @@ def main() -> None: key_assembly_report_meta = extract_assembly_metadata(assembly_reports) # Produce the finalized assembly status report TSV from set of parsed 'datasets summary report' - generate_report_tsv(key_assembly_report_meta, args.assembly_report_prefix, query_type, args.download_dir) + generate_report_tsv(key_assembly_report_meta, args.src, args.download_dir, args.assembly_report_name) diff --git a/src/python/tests/assembly/test_download.py b/src/python/tests/assembly/test_download.py index 686224292..34b1f5f8a 100644 --- a/src/python/tests/assembly/test_download.py +++ b/src/python/tests/assembly/test_download.py @@ -12,15 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Unit testing of `ensembl.io.genomio.assembly.download` module. - -The unit testing is divided into one test class per submodule/class found in this module, and one test method -per public function/class method. - -Typical usage example:: - $ pytest ./assembly/test_download.py - -""" +"""Unit testing of `ensembl.io.genomio.assembly.download` module.""" from contextlib import nullcontext as does_not_raise import filecmp diff --git a/src/python/tests/assembly/test_status.py b/src/python/tests/assembly/test_status.py new file mode 100644 index 000000000..5f421616c --- /dev/null +++ b/src/python/tests/assembly/test_status.py @@ -0,0 +1,367 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Unit testing of `ensembl.io.genomio.assembly.status` module.""" + +from __future__ import annotations + +from contextlib import nullcontext as does_not_raise +import json +import os +from pathlib import Path +from typing import Any, Callable, ContextManager +from unittest.mock import Mock, patch + +from deepdiff import DeepDiff +import pytest +from pytest import FixtureRequest, param, raises +from sqlalchemy import Column, Index, String, text +from sqlalchemy.dialects.mysql import INTEGER +from sqlalchemy.engine import make_url +from sqlalchemy.orm import declarative_base + +from ensembl.io.genomio.assembly.status import ( + DATASETS_SINGULARITY, + UnsupportedFormatError, + ReportStructure, + extract_assembly_metadata, + fetch_datasets_reports, + fetch_accessions_from_core_dbs, + generate_report_tsv, + get_assembly_accessions, + singularity_image_setter, +) +from ensembl.io.genomio.utils.json_utils import get_json +from ensembl.utils import StrPath +from ensembl.utils.database import UnitTestDB + + +MINIMUM_METADATA = { + "my_core": ReportStructure( + species_name="Octopus bimaculoides", + taxon_id=37653, + assembly_name="ASM119413v2", + assembly_type="haploid", + accession="GCF_001194135.2", + assembly_status="current", + ) +} + +STRAIN_METADATA = { + "my_core": ReportStructure( + species_name="Octopus bimaculoides", + taxon_id=37653, + strain="UCB-OBI-ISO-001", + assembly_name="ASM119413v2", + assembly_type="haploid", + accession="GCF_001194135.2", + assembly_status="current", + ) +} + +COMPLETE_METADATA = { + "my_core": ReportStructure( + 
species_name="Octopus bimaculoides", + taxon_id=37653, + strain="UCB-OBI-ISO-001", + assembly_name="ASM119413v2", + assembly_type="haploid", + accession="GCF_001194135.2", + paired_assembly="GCA_001194135.2", + last_updated="2015-06-29T09:51:41.073", + assembly_status="current", + assembly_notes="RefSeq", + ) +} + + +Base: Any = declarative_base() # only possible type hint compatible with SQLAlchemy 1.4 and 2.0+ + + +class Meta(Base): + """Meta class mirroring the Ensembl core database meta table without any foreign keys""" + + __tablename__ = "meta" + __table_args__ = ( + Index("species_value_idx", "species_id", "meta_value"), + Index("species_key_value_idx", "species_id", "meta_key", "meta_value", unique=True), + ) + + meta_id: Column = Column(INTEGER(11), primary_key=True) + species_id: Column = Column(INTEGER(10), nullable=False, index=True, server_default=text("'1'")) + meta_key: Column = Column(String(40), nullable=False) + meta_value: Column = Column(String(255), nullable=False) + + +@pytest.mark.dependency() +def test_report_structure() -> None: + """Tests the `ReportStructure` class.""" + assert ReportStructure() + + +@pytest.mark.dependency(depends=["test_report_structure"]) +def test_report_structure_to_dict() -> None: + """Tests the `ReportStructure.to_dict()` method.""" + assert not DeepDiff( + COMPLETE_METADATA["my_core"].to_dict(), + { + "Species Name": "Octopus bimaculoides", + "Taxon ID": "37653", + "Isolate/Strain": "UCB-OBI-ISO-001", + "Asm name": "ASM119413v2", + "Assembly type": "haploid", + "Asm accession": "GCF_001194135.2", + "Paired assembly": "GCA_001194135.2", + "Asm last updated": "2015-06-29T09:51:41.073", + "Asm status": "current", + "Asm notes": "RefSeq", + }, + ) + + +@pytest.mark.dependency(depends=["test_report_structure"]) +def test_report_structure_header() -> None: + """Tests the `ReportStructure.header()` method.""" + expected_header = [ + "Species Name", + "Taxon ID", + "Isolate/Strain", + "Asm name", + "Assembly type", + "Asm 
accession", + "Paired assembly", + "Asm last updated", + "Asm status", + "Asm notes", + ] + assert COMPLETE_METADATA["my_core"].header() == expected_header + + +@pytest.mark.dependency(depends=["test_report_structure"]) +def test_report_structure_values() -> None: + """Tests the `ReportStructure.values()` method.""" + expected_values = [ + "Octopus bimaculoides", + "37653", + "UCB-OBI-ISO-001", + "ASM119413v2", + "haploid", + "GCF_001194135.2", + "GCA_001194135.2", + "2015-06-29T09:51:41.073", + "current", + "RefSeq", + ] + assert COMPLETE_METADATA["my_core"].values() == expected_values + + +@pytest.mark.parametrize( + "sif_cache_dir, datasets_version, nextflow_cachedir, singularity_cachedir", + [ + param("sif_cache", None, None, None, id="Personal SIF cache, default datasets"), + param(None, None, "nxf_cache", None, id="Nextflow SIF cache, default datasets"), + param(None, None, None, "singularity_cache", id="Singularity SIF cache, default datasets"), + param(None, "my_datasets", None, None, id="No SIF cache, user datasets"), + ], +) +@patch("ensembl.io.genomio.assembly.status.Client") +def test_singularity_image_setter( + mock_client: Mock, + tmp_path: Path, + sif_cache_dir: Path | None, + datasets_version: str | None, + nextflow_cachedir: str | None, + singularity_cachedir: str | None, +) -> None: + """Tests the `singularity_image_setter()` function. + + Fixtures: tmp_path + + Args: + sif_cache_dir: Path to locate existing, or download new SIF container image. + datasets_version: URL of singularity container (custom `datasets` version if desired). + nextflow_cachedir: Value to assign to environment variable NXF_SINGULARITY_CACHEDIR. + singularity_cachedir: Value to assign to environment variable SINGULARITY_CACHEDIR. 
@pytest.mark.parametrize(
    "sif_cache_dir, datasets_version, nextflow_cachedir, singularity_cachedir",
    [
        param("sif_cache", None, None, None, id="Personal SIF cache, default datasets"),
        param(None, None, "nxf_cache", None, id="Nextflow SIF cache, default datasets"),
        param(None, None, None, "singularity_cache", id="Singularity SIF cache, default datasets"),
        param(None, "my_datasets", None, None, id="No SIF cache, user datasets"),
    ],
)
@patch("ensembl.io.genomio.assembly.status.Client")
def test_singularity_image_setter(
    mock_client: Mock,
    tmp_path: Path,
    sif_cache_dir: Path | None,
    datasets_version: str | None,
    nextflow_cachedir: str | None,
    singularity_cachedir: str | None,
) -> None:
    """Tests the `singularity_image_setter()` function.

    Fixtures: tmp_path

    Args:
        sif_cache_dir: Path to locate existing, or download new SIF container image.
        datasets_version: URL of singularity container (custom `datasets` version if desired).
        nextflow_cachedir: Value to assign to environment variable NXF_SINGULARITY_CACHEDIR.
        singularity_cachedir: Value to assign to environment variable SINGULARITY_CACHEDIR.
    """
    mock_client.pull.return_value = True
    # Resolve the folder the image is expected to be pulled into: an explicit SIF
    # cache wins, then the Nextflow cache, then the Singularity cache, else CWD
    sif_cache_path = None
    if sif_cache_dir:
        sif_cache_path = tmp_path / sif_cache_dir
        sif_cache_path.mkdir()
        pull_folder = sif_cache_path
    elif nextflow_cachedir:
        pull_folder = Path(nextflow_cachedir)
    elif singularity_cachedir:
        pull_folder = Path(singularity_cachedir)
    else:
        pull_folder = Path()
    # A user-supplied datasets version overrides the default container URL
    container_url = datasets_version or DATASETS_SINGULARITY["datasets_version_url"]
    # Patch the environment with only the cache variables that are set
    env_patch = {
        name: value
        for name, value in (
            ("NXF_SINGULARITY_CACHEDIR", nextflow_cachedir),
            ("SINGULARITY_CACHEDIR", singularity_cachedir),
        )
        if value
    }
    with patch.dict(os.environ, env_patch, clear=True):
        assert singularity_image_setter(sif_cache_path, datasets_version)
        # Check that the spython pull method was called with the right arguments
        mock_client.pull.assert_called_with(
            container_url, stream=False, pull_folder=pull_folder, quiet=True
        )
@pytest.mark.parametrize(
    "file_name, expected_output, expectation",
    [
        param("assemblies.txt", ["GCA_900524668.2", "GCF_123456789.1"], does_not_raise(), id="Valid file"),
        param("meta_report.tsv", [], raises(UnsupportedFormatError), id="Wrong accession format"),
    ],
)
def test_get_assembly_accessions(
    data_dir: Path, file_name: str, expected_output: list[str], expectation: ContextManager
) -> None:
    """Tests the `get_assembly_accessions()` function.

    Fixtures: data_dir

    Args:
        file_name: File with one line per INSDC assembly accession.
        expected_output: Expected assembly accessions returned.
        expectation: Context manager of expected raise exception.
    """
    with expectation:
        assert get_assembly_accessions(data_dir / file_name) == expected_output


@pytest.mark.parametrize(
    "test_dbs",
    [
        [
            {"src": "one_core_db", "metadata": Base.metadata},
            {"src": "another_core_db", "metadata": Base.metadata},
            {"src": "yet_another_core_db", "metadata": Base.metadata},
        ],
    ],
    indirect=True,
)
def test_fetch_accessions_from_core_dbs(
    request: FixtureRequest, tmp_path: Path, test_dbs: dict[str, UnitTestDB]
) -> None:
    """Tests the `fetch_accessions_from_core_dbs()` function.

    Fixtures: request, tmp_path, test_dbs
    """
    # List every test database name, one per line, in a temporary input file
    db_list_file = tmp_path / "core_dbs_list.txt"
    db_list_file.write_text("".join(f"{db.dbc.db_name}\n" for db in test_dbs.values()))
    server_url = make_url(request.config.getoption("server"))
    accessions = fetch_accessions_from_core_dbs(db_list_file, server_url)
    # Only the database with a GCF/GCA accession in its meta table is reported
    expected = {test_dbs["one_core_db"].dbc.db_name: "GCF_001194135.2"}
    assert not DeepDiff(accessions, expected)
@patch("ensembl.io.genomio.assembly.status.Client")
def test_fetch_datasets_reports(
    mock_client: Mock, tmp_path: Path, data_dir: Path, assert_files: Callable[[StrPath, StrPath], None]
) -> None:
    """Tests the `fetch_datasets_reports()` function.

    Fixtures:
        tmp_path, data_dir, assert_files
    """

    def fake_execute(
        command: list[str], **kwargs: Any  # pylint: disable=unused-argument
    ) -> dict[str, str]:
        # Mimic the datasets CLI: a JSON payload for known accessions, an
        # empty result otherwise (the accession is the last CLI argument)
        report_path = data_dir / f"{command[-1]}.asm_report.json"
        if not report_path.exists():
            return {"message": '{"total_count": 0}'}
        payload = {"reports": [get_json(report_path)], "total_count": 1}
        return {"message": json.dumps(payload)}

    mock_client.execute.side_effect = fake_execute
    accessions = {"my_core": "GCF_001194135.2", "your_core": "GCA_001122334.1"}
    report = fetch_datasets_reports(mock_client, accessions, tmp_path, 1)
    # No report file should be written for the unknown accession
    assert not (tmp_path / "GCA_001122334.1.asm_report.json").exists()
    expected_report_file = data_dir / "GCF_001194135.2.asm_report.json"
    assert_files(tmp_path / "GCF_001194135.2.asm_report.json", expected_report_file)
    assert not DeepDiff(report, {"my_core": get_json(expected_report_file)})


@patch("ensembl.io.genomio.assembly.status.Client")
def test_fetch_datasets_reports_value_error(mock_client: Mock) -> None:
    """Tests the `fetch_datasets_reports()` function when `ValueError` is raised."""
    # A nested list in "message" is not a valid datasets CLI response
    mock_client.execute.return_value = {"message": [["unexpected nested list"]]}
    with raises(ValueError):
        fetch_datasets_reports(mock_client, {"my_core": "GCF_001194135.2"}, Path(), 1)


@patch("ensembl.io.genomio.assembly.status.Client")
def test_fetch_datasets_reports_runtime_error(mock_client: Mock) -> None:
    """Tests the `fetch_datasets_reports()` function when `RuntimeError` is raised."""
    mock_client.execute.return_value = {"message": "FATAL error message"}
    with raises(RuntimeError, match=r"Singularity image execution failed! -> '.*'"):
        fetch_datasets_reports(mock_client, {"my_core": "GCF_001194135.2"}, Path(), 1)
@pytest.mark.dependency(depends=["test_report_structure"])
@pytest.mark.parametrize(
    "file_name, expected_metadata",
    [
        param("simple.asm_report.json", MINIMUM_METADATA, id="Minimum report"),
        param("strain.asm_report.json", STRAIN_METADATA, id="Strain report"),
        param("cultivar.asm_report.json", MINIMUM_METADATA, id="Unexpected infraspecific name (cultivar)"),
        param("GCF_001194135.2.asm_report.json", COMPLETE_METADATA, id="Detailed report"),
    ],
)
def test_extract_assembly_metadata(
    data_dir: Path, file_name: str, expected_metadata: dict[str, ReportStructure]
) -> None:
    """Tests the `extract_assembly_metadata()` function.

    Fixtures: data_dir

    Args:
        file_name: Test data file to extract the assembly metadata from.
        expected_metadata: Expected key value pairs of source name <> assembly report.
    """
    reports = {"my_core": get_json(data_dir / file_name)}
    assert not DeepDiff(extract_assembly_metadata(reports), expected_metadata)


@pytest.mark.dependency(depends=["test_report_structure"])
def test_generate_report_tsv(
    tmp_path: Path, data_dir: Path, assert_files: Callable[[StrPath, StrPath], None]
) -> None:
    """Tests the `generate_report_tsv()` function.

    Fixtures: tmp_path, data_dir, assert_files
    """
    generate_report_tsv(COMPLETE_METADATA, "core_db", tmp_path, "test_output")
    output_file = tmp_path / "test_output.tsv"
    assert_files(output_file, data_dir / "meta_report.tsv")
+ + Fixtures: tmp_path, data_dir, assert_files + """ + generate_report_tsv(COMPLETE_METADATA, "core_db", tmp_path, "test_output") + assert_files(tmp_path / "test_output.tsv", data_dir / "meta_report.tsv") diff --git a/src/python/tests/assembly/test_status/GCF_001194135.2.asm_report.json b/src/python/tests/assembly/test_status/GCF_001194135.2.asm_report.json new file mode 100644 index 000000000..b027f0252 --- /dev/null +++ b/src/python/tests/assembly/test_status/GCF_001194135.2.asm_report.json @@ -0,0 +1,187 @@ +{ + "accession": "GCF_001194135.2", + "annotation_info": { + "busco": { + "busco_lineage": "mollusca_odb10", + "busco_ver": "4.1.4", + "complete": 0.90764874, + "duplicated": 0.006421152, + "fragmented": 0.026440037, + "missing": 0.06591123, + "single_copy": 0.9012276, + "total_count": "5295" + }, + "method": "Best-placed RefSeq; Gnomon; cmsearch; tRNAscan-SE", + "name": "GCF_001194135.2-RS_2023_01", + "pipeline": "NCBI eukaryotic genome annotation pipeline", + "provider": "NCBI RefSeq", + "release_date": "2023-01-05", + "report_url": "https://www.ncbi.nlm.nih.gov/genome/annotation_euk/Octopus_bimaculoides/GCF_001194135.2-RS_2023_01", + "software_version": "10.1", + "stats": { + "gene_counts": { + "non_coding": 3956, + "protein_coding": 15684, + "pseudogene": 504, + "total": 20144 + } + }, + "status": "Full annotation" + }, + "assembly_info": { + "assembly_level": "Chromosome", + "assembly_method": "HiRise v. 
2019", + "assembly_name": "ASM119413v2", + "assembly_status": "current", + "assembly_type": "haploid", + "bioproject_accession": "PRJNA270931", + "bioproject_lineage": [ + { + "bioprojects": [ + { + "accession": "PRJNA270931", + "title": "Octopus bimaculoides isolate:UCB-OBI-ISO-001 Genome sequencing and assembly" + } + ] + } + ], + "biosample": { + "accession": "SAMN03271701", + "attributes": [ + { + "name": "isolate", + "value": "UCB-OBI-ISO-001" + }, + { + "name": "isolation_source", + "value": "wild caught off the coast of California" + }, + { + "name": "collection_date", + "value": "not applicable" + }, + { + "name": "geo_loc_name", + "value": "Pacific Ocean" + }, + { + "name": "tissue", + "value": "gonad" + }, + { + "name": "biomaterial_provider", + "value": "Daniel S. Rokhsar, 454 LSA #3200, UC Berkeley,Berkeley CA 94720-3200" + }, + { + "name": "collected_by", + "value": "Crissy Huffard, Roy Caldwell" + }, + { + "name": "sex", + "value": "male" + } + ], + "biomaterial_provider": "Daniel S. 
Rokhsar, 454 LSA #3200, UC Berkeley,Berkeley CA 94720-3200", + "bioprojects": [ + { + "accession": "PRJNA270931" + } + ], + "collected_by": "Crissy Huffard, Roy Caldwell", + "collection_date": "not applicable", + "description": { + "comment": "Gonad tissue from an adult male Octopus bimaculoides for whole genome shotgun sequencing", + "organism": { + "organism_name": "Octopus bimaculoides", + "tax_id": 37653 + }, + "title": "Whole Genome Shotgun sample from Octopus bimaculoides" + }, + "geo_loc_name": "Pacific Ocean", + "isolate": "UCB-OBI-ISO-001", + "isolation_source": "wild caught off the coast of California", + "last_updated": "2015-06-29T09:51:41.073", + "models": [ + "Invertebrate" + ], + "owner": { + "contacts": [ + {} + ], + "name": "UC Berkeley" + }, + "package": "Invertebrate.1.0", + "publication_date": "2015-03-27T14:38:12.907", + "sample_ids": [ + { + "label": "Sample name", + "value": "ObimacWGS" + }, + { + "db": "SRA", + "value": "SRS920820" + } + ], + "sex": "male", + "status": { + "status": "live", + "when": "2015-03-27T14:38:12.907" + }, + "submission_date": "2014-12-20T13:31:33.810", + "tissue": "gonad" + }, + "blast_url": "https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastSearch&PROG_DEF=blastn&BLAST_SPEC=GDH_GCF_001194135.2", + "genome_notes": [ + "RefSeq" + ], + "paired_assembly": { + "accession": "GCA_001194135.2", + "only_refseq": "chromosome MT", + "status": "current" + }, + "refseq_category": "representative genome", + "release_date": "2022-10-06", + "sequencing_tech": "HiC", + "submitter": "UC Berkeley" + }, + "assembly_stats": { + "contig_l50": 98071, + "contig_n50": 5523, + "gc_count": "715821249", + "gc_percent": 36, + "genome_coverage": "10.0x", + "number_of_component_sequences": 145326, + "number_of_contigs": 713915, + "number_of_organelles": 1, + "number_of_scaffolds": 145326, + "scaffold_l50": 9, + "scaffold_n50": 96881196, + "total_number_of_chromosomes": 30, + "total_sequence_length": "2342518435", + "total_ungapped_length": 
"1986264087" + }, + "current_accession": "GCF_001194135.2", + "organelle_info": [ + { + "description": "Mitochondrion", + "submitter": "UC Berkeley", + "total_seq_length": "15733" + } + ], + "organism": { + "common_name": "California two-spot octopus", + "infraspecific_names": { + "isolate": "UCB-OBI-ISO-001", + "sex": "male" + }, + "organism_name": "Octopus bimaculoides", + "tax_id": 37653 + }, + "paired_accession": "GCA_001194135.2", + "source_database": "SOURCE_DATABASE_REFSEQ", + "wgs_info": { + "master_wgs_url": "https://www.ncbi.nlm.nih.gov/nuccore/LGKD00000000.2", + "wgs_contigs_url": "https://www.ncbi.nlm.nih.gov/Traces/wgs/LGKD02", + "wgs_project_accession": "LGKD02" + } +} \ No newline at end of file diff --git a/src/python/tests/assembly/test_status/another_core_db/meta.txt b/src/python/tests/assembly/test_status/another_core_db/meta.txt new file mode 100644 index 000000000..f511019f6 --- /dev/null +++ b/src/python/tests/assembly/test_status/another_core_db/meta.txt @@ -0,0 +1,2 @@ +1 1 assembly.accession GCF_001194135.2 +2 1 assembly.accession GCA_001194135.2 \ No newline at end of file diff --git a/src/python/tests/assembly/test_status/assemblies.txt b/src/python/tests/assembly/test_status/assemblies.txt new file mode 100644 index 000000000..f2387e41a --- /dev/null +++ b/src/python/tests/assembly/test_status/assemblies.txt @@ -0,0 +1,2 @@ +GCA_900524668.2 +GCF_123456789.1 diff --git a/src/python/tests/assembly/test_status/cultivar.asm_report.json b/src/python/tests/assembly/test_status/cultivar.asm_report.json new file mode 100644 index 000000000..0190fb9e6 --- /dev/null +++ b/src/python/tests/assembly/test_status/cultivar.asm_report.json @@ -0,0 +1,16 @@ +{ + "accession": "GCF_001194135.2", + "assembly_info": { + "assembly_level": "Chromosome", + "assembly_name": "ASM119413v2", + "assembly_status": "current", + "assembly_type": "haploid" + }, + "organism": { + "infraspecific_names": { + "cultivar": "UCB-OBI-ISO-001" + }, + "organism_name": "Octopus 
bimaculoides", + "tax_id": 37653 + } +} \ No newline at end of file diff --git a/src/python/tests/assembly/test_status/meta_report.tsv b/src/python/tests/assembly/test_status/meta_report.tsv new file mode 100644 index 000000000..f955b841d --- /dev/null +++ b/src/python/tests/assembly/test_status/meta_report.tsv @@ -0,0 +1,2 @@ +Core db Species Name Taxon ID Isolate/Strain Asm name Assembly type Asm accession Paired assembly Asm last updated Asm status Asm notes +my_core Octopus bimaculoides 37653 UCB-OBI-ISO-001 ASM119413v2 haploid GCF_001194135.2 GCA_001194135.2 2015-06-29T09:51:41.073 current RefSeq diff --git a/src/python/tests/assembly/test_status/one_core_db/meta.txt b/src/python/tests/assembly/test_status/one_core_db/meta.txt new file mode 100644 index 000000000..8c0d99035 --- /dev/null +++ b/src/python/tests/assembly/test_status/one_core_db/meta.txt @@ -0,0 +1 @@ +1 1 assembly.accession GCF_001194135.2 \ No newline at end of file diff --git a/src/python/tests/assembly/test_status/simple.asm_report.json b/src/python/tests/assembly/test_status/simple.asm_report.json new file mode 100644 index 000000000..033f1aa66 --- /dev/null +++ b/src/python/tests/assembly/test_status/simple.asm_report.json @@ -0,0 +1,13 @@ +{ + "accession": "GCF_001194135.2", + "assembly_info": { + "assembly_level": "Chromosome", + "assembly_name": "ASM119413v2", + "assembly_status": "current", + "assembly_type": "haploid" + }, + "organism": { + "organism_name": "Octopus bimaculoides", + "tax_id": 37653 + } +} \ No newline at end of file diff --git a/src/python/tests/assembly/test_status/strain.asm_report.json b/src/python/tests/assembly/test_status/strain.asm_report.json new file mode 100644 index 000000000..5fbef216b --- /dev/null +++ b/src/python/tests/assembly/test_status/strain.asm_report.json @@ -0,0 +1,16 @@ +{ + "accession": "GCF_001194135.2", + "assembly_info": { + "assembly_level": "Chromosome", + "assembly_name": "ASM119413v2", + "assembly_status": "current", + "assembly_type": 
"haploid" + }, + "organism": { + "infraspecific_names": { + "strain": "UCB-OBI-ISO-001" + }, + "organism_name": "Octopus bimaculoides", + "tax_id": 37653 + } +} \ No newline at end of file