diff --git a/cicd/gitlab/parts/python.gitlab-ci.yml b/cicd/gitlab/parts/python.gitlab-ci.yml index a7e9560ce..9b74da395 100644 --- a/cicd/gitlab/parts/python.gitlab-ci.yml +++ b/cicd/gitlab/parts/python.gitlab-ci.yml @@ -83,13 +83,11 @@ python:mypy:src: python:mypy:tests: extends: .python:mypy - allow_failure: true script: - $MYPY_CMD src/python/tests python:black: extends: .python:test - allow_failure: true script: - black --config pyproject.toml --check . diff --git a/docs/BRC4_genome_compare_conf.md b/docs/BRC4_genome_compare_conf.md index 33ee8be30..a84d0cd20 100644 --- a/docs/BRC4_genome_compare_conf.md +++ b/docs/BRC4_genome_compare_conf.md @@ -3,10 +3,10 @@ ## **Overview** ----- -This pipeline is used for a sequence-level comparison of an assembly with INSDC and provides a detailed report on the discrepencies. The following steps are performed: +This pipeline is used for a sequence-level comparison of an assembly with INSDC and provides a detailed report on the discrepancies. The following steps are performed: 1. Download the files for the corresponding assembly from INSDC - 2. Retreive metadata seq.json and fasta files from the database + 2. Retrieve metadata seq.json and fasta files from the database 3. 
Compare the fasta files - compare the sequence ids - compare the sequence @@ -35,8 +35,8 @@ init_pipeline.pl Bio::EnsEMBL::Pipeline::PipeConfig::BRC4_genome_compare_conf \ | `--pipeline_name` | str | brc4_genome_compare | optional| name of the hive pipeline | | `--hive_force_init` | int | | yes | drop and create the hive pipeline from scratch | | `--output_dir` | dir | ./output | optional| directory to store the result | -| `--tmp_dir` | dir | ./tmp | optional| temp directory for dowloaded files | -| `--species` | str | | yes| species (one or muliple) to process (production name) | +| `--tmp_dir` | dir | ./tmp | optional| temp directory for downloaded files | +| `--species` | str | | yes| species (one or multiple) to process (production name) | | `--run_all` | int | 0 | yes| process all the species in the registry | | `--email` | str | $USER.ebi.ac.uk | optional| a summary is emailed when the pipeline is complete | diff --git a/docs/nextflow.md b/docs/nextflow.md index cdc0f024d..79b22683f 100644 --- a/docs/nextflow.md +++ b/docs/nextflow.md @@ -3,7 +3,7 @@ ## Installation If you don't have an installed environment or you don't have nextflow itself, here's one of the ways to install it. -Define [`NXF_HOME` env variable](https://www.nextflow.io/docs/latest/config.html#environment-variables) to use a nextlow home location instead of the default one (`$HOME/.nextflow`). +Define [`NXF_HOME` env variable](https://www.nextflow.io/docs/latest/config.html#environment-variables) to use a nextflow home location instead of the default one (`$HOME/.nextflow`). Everything else is unchanged from the default Nextflow installation instructions on [https://www.nextflow.io/index.html#GetStarted](https://www.nextflow.io/index.html#GetStarted). 
``` @@ -76,7 +76,7 @@ Instead pipeline dies with Caused by: Cannot load from object array because "this.keys" is null ``` and when printing this object (`dbs` in this case, with `println "db: ${db}"`), -we see it dict surronded by the curly brackets like this +we see it is a dict surrounded by curly brackets like this ``` {..., "db_name":"some_db_name", ...} ``` diff --git a/src/python/ensembl/io/genomio/annotation/load.py b/src/python/ensembl/io/genomio/annotation/load.py index 9bf09643f..bce636f0d 100644 --- a/src/python/ensembl/io/genomio/annotation/load.py +++ b/src/python/ensembl/io/genomio/annotation/load.py @@ -14,6 +14,11 @@ # limitations under the License. """Loads functional annotation from a file into a core database.""" +__all__ = [ + "get_core_data", + "load_descriptions", +] + import logging from pathlib import Path from typing import Any, Dict, List, Optional, Tuple @@ -42,7 +47,7 @@ def get_core_data(session: Session, table: str) -> Dict[str, FeatStruct]: Args: session: Session open on a core database. - table: "gene" or "trancript" table from the core database. + table: "gene" or "transcript" table from the core database. """ if table == "gene": diff --git a/src/python/ensembl/io/genomio/genbank/extract_data.py b/src/python/ensembl/io/genomio/genbank/extract_data.py index e34f92e2c..f97b6cbde 100644 --- a/src/python/ensembl/io/genomio/genbank/extract_data.py +++ b/src/python/ensembl/io/genomio/genbank/extract_data.py @@ -20,7 +20,7 @@ - genome metadata json Raises: - GFFPArseError: If the structure of the gb file can't be parsed. + GBParseError: If the structure of the gb file cannot be parsed. UnsupportedData: If some data is not as expected. Returns: @@ -255,7 +255,7 @@ def _parse_record(self, record: SeqRecord) -> Tuple[SeqRecord, List[str], List[S feats = {**feats, **rna_feats} all_ids += rna_ids - # Any other case? Fail here and check if we shoud support it, or add it to unsupported list + # Any other case? 
Fail here and check if we should support it, or add it to unsupported list else: raise GBParseError(f"No ID for allowed feature: {feat}") @@ -463,9 +463,9 @@ def _get_codon_table(self, seq: SeqRecord) -> Optional[int]: """ for feat in seq.features: if feat.type == "CDS": - quals = feat.qualifiers - if "transl_table" in quals: - return quals["transl_table"][0] + qualifiers = feat.qualifiers + if "transl_table" in qualifiers: + return qualifiers["transl_table"][0] return None return None diff --git a/src/python/ensembl/io/genomio/gff3/extract_annotation.py b/src/python/ensembl/io/genomio/gff3/extract_annotation.py index 036680c8d..d65e8dcf6 100644 --- a/src/python/ensembl/io/genomio/gff3/extract_annotation.py +++ b/src/python/ensembl/io/genomio/gff3/extract_annotation.py @@ -132,7 +132,7 @@ def add_feature( parent_id: Optional[str] = None, all_parent_ids: Optional[List[str]] = None, ) -> None: - """Add annotation for a feature of a given type. If a parent_id is provided, record the relatioship. + """Add annotation for a feature of a given type. If a parent_id is provided, record the relationship. Args: feature: The feature to create an annotation. diff --git a/src/python/ensembl/io/genomio/manifest/check_integrity.py b/src/python/ensembl/io/genomio/manifest/check_integrity.py index 785b2214a..ae40c197f 100644 --- a/src/python/ensembl/io/genomio/manifest/check_integrity.py +++ b/src/python/ensembl/io/genomio/manifest/check_integrity.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Compare the genomic data in a DNA fasta file, seq_region json, gene models GFF3 and peptide fasta +"""Compare the genomic data in a DNA fasta file, seq_region JSON, gene models GFF3 and peptide fasta to ensure their contents are in sync. 
""" @@ -483,7 +483,7 @@ def check_integrity(self): # Check fasta_pep.fa integrity # The sequence length and id retrieved from the fasta_pep file # and compared to the translated CDS id and length in the gff - # We don't compare the peptide lengths because of seqedits + # We do not compare the peptide lengths because of sequence edits if pep: tr_errors = self.check_lengths( pep, gff_translations, "Fasta translations vs gff", special_diff=True @@ -527,7 +527,7 @@ def check_integrity(self): ) ) - # Check the seq.json intregrity + # Check the seq.json integrity # Compare the length and id retrieved from seq.json to the gff if seq_regions: self.check_seq_region_lengths( @@ -627,7 +627,7 @@ def check_lengths(self, list1, list2, name, allowed_len_diff=None, special_diff= Error if there is a difference in length or ids between the lists. """ - # check list diffferences, checks if abs(values diff) < allowed_len_diff + # check list differences, checks if abs(values diff) < allowed_len_diff set1 = frozenset(list1) set2 = frozenset(list2) diff --git a/src/python/ensembl/io/genomio/manifest/compute_stats.py b/src/python/ensembl/io/genomio/manifest/compute_stats.py index 394912125..9c1684395 100644 --- a/src/python/ensembl/io/genomio/manifest/compute_stats.py +++ b/src/python/ensembl/io/genomio/manifest/compute_stats.py @@ -14,6 +14,12 @@ # limitations under the License. 
"""Compute stats from the current genome files associated with the manifest.""" +__all__ = [ + "BiotypeCounter", + "manifest_stats", + "StatsError", +] + import json from os import PathLike from pathlib import Path @@ -374,10 +380,10 @@ def compare_ncbi_counts(self, biotypes: Dict[str, BiotypeCounter], ncbi: Dict) - for count_map in maps: ncbi_name, prep_name = count_map ncbi_count = ncbi.get(ncbi_name, 0) - preped: Optional[BiotypeCounter] = biotypes.get(prep_name) + prepped: Optional[BiotypeCounter] = biotypes.get(prep_name) prep_count = 0 - if preped is not None: - prep_count = preped.count + if prepped is not None: + prep_count = prepped.count if prep_count != ncbi_count: diff = prep_count - ncbi_count diff --git a/src/python/ensembl/io/genomio/seq_region/__init__.py b/src/python/ensembl/io/genomio/seq_region/__init__.py index 45797eb75..4c7bc9162 100644 --- a/src/python/ensembl/io/genomio/seq_region/__init__.py +++ b/src/python/ensembl/io/genomio/seq_region/__init__.py @@ -16,3 +16,4 @@ from .dump import * from .prepare import * +from .rename import * diff --git a/src/python/ensembl/io/genomio/seq_region/dump.py b/src/python/ensembl/io/genomio/seq_region/dump.py index f0cd18a24..7786fcf7c 100644 --- a/src/python/ensembl/io/genomio/seq_region/dump.py +++ b/src/python/ensembl/io/genomio/seq_region/dump.py @@ -21,7 +21,6 @@ "get_seq_regions", "add_attribs", "get_synonyms", - "get_attribs", "get_karyotype", ] @@ -201,7 +200,7 @@ def get_synonyms(seq_region: SeqRegion, external_db_map: dict) -> List: return syns -def get_attribs(seq_region: SeqRegion) -> List: +def _get_attribs(seq_region: SeqRegion) -> List: """Given a seq_region, extract the attribs as value-source items. Args: @@ -229,7 +228,7 @@ def get_attribs_dict(seq_region: SeqRegion) -> Dict[str, Any]: Pairs of source and value for each attribute. 
""" - attribs = get_attribs(seq_region) + attribs = _get_attribs(seq_region) attrib_dict = {attrib["source"]: attrib["value"] for attrib in attribs} return attrib_dict diff --git a/src/python/ensembl/io/genomio/seq_region/rename.py b/src/python/ensembl/io/genomio/seq_region/rename.py index 94647916c..bef2355be 100644 --- a/src/python/ensembl/io/genomio/seq_region/rename.py +++ b/src/python/ensembl/io/genomio/seq_region/rename.py @@ -14,6 +14,15 @@ # limitations under the License. """Rename seq_region BRC names in a given core database.""" +__all__ = [ + "Operation", + "SeqRegionReplacement", + "get_rename_map", + "get_seq_regions_to_replace", + "rename_seq_regions", + "update_seq_region_name", +] + from dataclasses import dataclass import logging from enum import Enum, auto @@ -108,7 +117,7 @@ def get_seq_regions_to_replace( continue seqr.seq_region_id = db_seqr.seq_region_id - attribs = get_attribs(db_seqr) + attribs = _get_attribs(db_seqr) db_brc_name = attribs.get("BRC4_seq_region_name", "") seqr.old_brc_name = db_brc_name if not db_brc_name: @@ -128,7 +137,7 @@ def get_seq_regions_to_replace( return seq_regions -def get_attribs(seq_region: SeqRegion) -> Dict[str, str]: +def _get_attribs(seq_region: SeqRegion) -> Dict[str, str]: """Given a seq_region, extract the attribs as value-source items. 
Args: diff --git a/src/python/tests/assembly/test_download.py b/src/python/tests/assembly/test_download.py index e7683e0c1..21b4296ce 100644 --- a/src/python/tests/assembly/test_download.py +++ b/src/python/tests/assembly/test_download.py @@ -131,7 +131,7 @@ def test_checksums( pytest.param("wrong_md5_checksums.txt", None, False, id="Incorrect md5 checksum"), pytest.param(None, None, True, id="No md5file specified, resort to default"), pytest.param(None, Path("*"), False, id="Incompatible os path '*'"), - pytest.param("missingfile_md5.txt", None, False, id="md5 checksum with ref of missing file"), + pytest.param("missing_file_md5.txt", None, False, id="md5 checksum with ref of missing file"), ], ) def test_md5_files(data_dir: Path, md5_file: str, md5_path: Optional[Path], checksum_bool: bool) -> None: @@ -404,7 +404,7 @@ def test_get_files_selection( @patch("ensembl.io.genomio.assembly.download.md5_files") def test_retrieve_assembly_data( mock_retrieve: Mock, - mock_download_singlefile: Mock, + mock_download_single_file: Mock, mock_download_files: Mock, mock_file_select: Mock, mock_ftp: Mock, @@ -441,5 +441,5 @@ def side_eff_conn(url: str): with exception: retrieve_assembly_data(accession, download_dir, 2) assert mock_download_files.download_files.called_once() - assert mock_download_singlefile._download_file.called_once() # pylint: disable=protected-access + assert mock_download_single_file._download_file.called_once() # pylint: disable=protected-access assert mock_file_select.get_files_selection.called_with(files_downloaded) diff --git a/src/python/tests/database/test_dbconnection_lite.py b/src/python/tests/database/test_dbconnection_lite.py index 45c76dfac..5bf19ea61 100644 --- a/src/python/tests/database/test_dbconnection_lite.py +++ b/src/python/tests/database/test_dbconnection_lite.py @@ -51,7 +51,6 @@ def fixture_meta_test_db(db_factory) -> UnitTestDB: def test_get_metadata(meta_test_db: UnitTestDB) -> None: """Tests the method get_metadata()""" - # Check 
the new connection lite dblite = DBConnectionLite(meta_test_db.dbc.url) assert dblite.get_metadata() == _METADATA_CONTENT diff --git a/src/python/tests/genbank/test_extract_data.py b/src/python/tests/genbank/test_extract_data.py index ffb45da1d..5e6677aaf 100644 --- a/src/python/tests/genbank/test_extract_data.py +++ b/src/python/tests/genbank/test_extract_data.py @@ -36,7 +36,7 @@ class TestWriteFormattedFiles: - """Test if all the expected output files are generated and formated correctly""" + """Test if all the expected output files are generated and formatted correctly""" prod_name = "TEST_prod" gb_file = "input_file.gb" @@ -238,7 +238,7 @@ def test_write_pep_fasta( tmp_path: Path, formatted_files_generator: FormattedFilesGenerator, ) -> None: - """Test if peptides FATA file is generated when peptides are identified""" + """Test if peptides FASTA file is generated when peptides are identified""" record = SeqRecord(Seq("MFLRTQARFFHATTKKM"), id="cds-record") CDS_feature = SeqFeature( FeatureLocation(10, 20), type="CDS", qualifiers={"gene": ["GlyrA"], "transl_table": "2"} diff --git a/src/python/tests/gff3/test_extract_annotation.py b/src/python/tests/gff3/test_extract_annotation.py index 2765bbb67..65112a8d0 100644 --- a/src/python/tests/gff3/test_extract_annotation.py +++ b/src/python/tests/gff3/test_extract_annotation.py @@ -83,7 +83,7 @@ def test_product_is_informative(description: str, feature_id: Optional[List[str] ) @pytest.mark.dependency(name="add_feature") def test_add_feature(seq_feat_type: str, feat_type: str, expected: ContextManager) -> None: - """Tests the `FunctionaAnnotation.add_feature()` method with only one feature. + """Tests the `FunctionalAnnotations.add_feature()` method with only one feature. Args: seq_feat_type: Type for the sequence feature to add. 
@@ -106,7 +106,7 @@ def test_add_feature(seq_feat_type: str, feat_type: str, expected: ContextManage ], ) def test_add_feature_name(feat_id: str, feat_name: str, expected_synonyms: List[str]) -> None: - """Tests the `FunctionaAnnotations.add_feature()` method with a feature name.""" + """Tests the `FunctionalAnnotations.add_feature()` method with a feature name.""" annot = FunctionalAnnotations() seq_feat_type = "gene" @@ -128,7 +128,7 @@ def test_add_feature_name(feat_id: str, feat_name: str, expected_synonyms: List[ ) @pytest.mark.dependency(name="add_parent_link", depends=["add_feature"]) def test_add_parent_link(parent_type: str, parent_id: str, child_id: str, expected: ContextManager) -> None: - """Tests the `FunctionaAnnotation.add_parent_link()` method. + """Tests the `FunctionalAnnotations.add_parent_link()` method. Add a parent feature, and then add a parent link. @@ -164,7 +164,7 @@ def test_get_parent( out_child_id: str, expected: ContextManager, ) -> None: - """Tests the `FunctionaAnnotation.get_parent()` method. + """Tests the `FunctionalAnnotations.get_parent()` method. Args: in_parent_type: Type for the parent sequence feature. @@ -202,7 +202,7 @@ def test_get_parent( def test_add_feature_fail( child_type: str, child_id: str, out_parent_id: Optional[str], expected: ContextManager ) -> None: - """Tests the `FunctionaAnnotation.add_feature()` method failures. + """Tests the `FunctionalAnnotations.add_feature()` method failures. Test the addition of a child feature after a parent has already been added. 
@@ -270,7 +270,7 @@ def test_add_feature_fail( def test_get_xrefs( in_id: str, in_xrefs: Optional[List[str]], provider_name: str, expected_xrefs: List[Dict[str, str]] ) -> None: - """Tests the `FunctionaAnnotation.get_xrefs()` method.""" + """Tests the `FunctionalAnnotations.get_xrefs()` method.""" annot = FunctionalAnnotations(provider_name=provider_name) one_gene = GFFSeqFeature(type="gene", id=in_id) if in_xrefs is not None: @@ -291,7 +291,7 @@ def test_get_xrefs( ) @pytest.mark.dependency(name="get_features", depends=["add_feature_fail"]) def test_get_features(feat_type: str, expected_number: int, expected: ContextManager) -> None: - """Tests the `FunctionaAnnotation.get_features()` method. + """Tests the `FunctionalAnnotations.get_features()` method. Load 2 features, then test the fetching of those features. @@ -325,7 +325,7 @@ def test_get_features(feat_type: str, expected_number: int, expected: ContextMan None, "Foobar", "Foobar, transcript variant X1", - id="transcr with variant", + id="transcript with variant", ), param(None, "Foobar", "Lorem", "Foobar", "Foobar", id="Transfer from transc, transl also set"), param("Hypothetical gene", "Predicted function", "Foobar", "Foobar", "Foobar", id="Non informative"), @@ -340,7 +340,7 @@ def test_transfer_descriptions( out_gene_desc: Optional[str], out_transc_desc: Optional[str], ) -> None: - """Tests the `FunctionaAnnotation.transfer_descriptions()` method. + """Tests the `FunctionalAnnotations.transfer_descriptions()` method. Load 3 features (gene, transcript, translation) with or without a description for each one. @@ -348,8 +348,8 @@ def test_transfer_descriptions( gene_desc: Description for the gene. transc_desc: Description for the transcript. transl_desc: Description for the translation. - out_gene_desc: Excpected description for the gene after transfer. - out_transc_desc: Excpected description for the transcript after transfer. + out_gene_desc: Expected description for the gene after transfer. 
+ out_transc_desc: Expected description for the transcript after transfer. """ annot = FunctionalAnnotations() @@ -370,9 +370,9 @@ def test_transfer_descriptions( annot.transfer_descriptions() genes = annot.get_features("gene") - transcs = annot.get_features("transcript") + transcripts = annot.get_features("transcript") assert genes[gene_name].get("description") == out_gene_desc - assert transcs[transcript_name].get("description") == out_transc_desc + assert transcripts[transcript_name].get("description") == out_transc_desc @pytest.mark.dependency(depends=["add_feature"]) diff --git a/src/python/tests/gff3/test_restructure.py b/src/python/tests/gff3/test_restructure.py index 2cc5814fc..89355f621 100644 --- a/src/python/tests/gff3/test_restructure.py +++ b/src/python/tests/gff3/test_restructure.py @@ -292,9 +292,9 @@ def test_remove_extra_exons( if has_id: exon_num = 1 - for subfeat in gene.sub_features: - if subfeat.type == "exon": - subfeat.id = f"id-{exon_num}" + for subfeature in gene.sub_features: + if subfeature.type == "exon": + subfeature.id = f"id-{exon_num}" exon_num += 1 if exon_num > has_id: break