Merge pull request #405 from Ensembl/jalvarez/code_review

Minor codebase review looking for typos and other missing elements
Ensembl · Jul 18, 2024 · 3549b83 · 3549b83
2 parents f905263 + 267b1cd
commit 3549b83
Show file tree

Hide file tree

Showing 16 changed files with 66 additions and 49 deletions.
diff --git a/cicd/gitlab/parts/python.gitlab-ci.yml b/cicd/gitlab/parts/python.gitlab-ci.yml
@@ -83,13 +83,11 @@ python:mypy:src:
 
 python:mypy:tests:
   extends: .python:mypy
-  allow_failure: true
   script:
     - $MYPY_CMD src/python/tests
 
 python:black:
   extends: .python:test
-  allow_failure: true
   script:
     - black --config pyproject.toml --check .
 

diff --git a/docs/BRC4_genome_compare_conf.md b/docs/BRC4_genome_compare_conf.md
@@ -3,10 +3,10 @@
 
 ## **Overview**
 -----
-This pipeline is used for a sequence-level comparison of an assembly with INSDC and provides a detailed report on the discrepencies. The following steps are performed:
+This pipeline is used for a sequence-level comparison of an assembly with INSDC and provides a detailed report on the discrepancies. The following steps are performed:
 
   1. Download the files for the corresponding assembly from INSDC
-  2. Retreive metadata seq.json and fasta files from the database
+  2. Retrieve metadata seq.json and fasta files from the database
   3. Compare the fasta files
        - compare the sequence ids
        - compare the sequence 
@@ -35,8 +35,8 @@ init_pipeline.pl Bio::EnsEMBL::Pipeline::PipeConfig::BRC4_genome_compare_conf \
 | `--pipeline_name` | str | brc4_genome_compare |   optional| name of the hive pipeline |
 | `--hive_force_init` | int |  | yes | drop and create the hive pipeline from scratch | 
 | `--output_dir`      | dir |   ./output                     | optional| directory to store the result | 
-| `--tmp_dir`         | dir |   ./tmp                     | optional| temp directory for dowloaded files |
-| `--species`         | str |                        | yes| species (one or muliple) to process (production name) |
+| `--tmp_dir`         | dir |   ./tmp                     | optional| temp directory for downloaded files |
+| `--species`         | str |                        | yes| species (one or multiple) to process (production name) |
 | `--run_all`         | int |     0                   | yes| process all the species in the registry | 
 | `--email`           | str |  $USER.ebi.ac.uk                     | optional| a summary is emailed when the pipeline is complete | 
 

diff --git a/docs/nextflow.md b/docs/nextflow.md
@@ -3,7 +3,7 @@
 ## Installation
 If you don't have an installed environment or you don't have nextflow itself, here's one of the ways to install it.
 
-Define [`NXF_HOME` env variable](https://www.nextflow.io/docs/latest/config.html#environment-variables) to use a nextlow home location instead of the default one (`$HOME/.nextflow`).
+Define [`NXF_HOME` env variable](https://www.nextflow.io/docs/latest/config.html#environment-variables) to use a nextflow home location instead of the default one (`$HOME/.nextflow`).
 Everything else is unchanged from the default Nextflow installation instructions on [https://www.nextflow.io/index.html#GetStarted](https://www.nextflow.io/index.html#GetStarted).
 
 ```
@@ -76,7 +76,7 @@ Instead pipeline dies with
 Caused by: Cannot load from object array because "this.keys" is null
 ```
 and when printing this object (`dbs` in this case, with `println "db: ${db}"`),
-we see it dict surronded by the curly brackets like this
+we see it as a dict surrounded by curly brackets, like this
 ```
 {..., "db_name":"some_db_name", ...}
 ```

diff --git a/src/python/ensembl/io/genomio/annotation/load.py b/src/python/ensembl/io/genomio/annotation/load.py
@@ -14,6 +14,11 @@
 # limitations under the License.
 """Loads functional annotation from a file into a core database."""
 
+__all__ = [
+    "get_core_data",
+    "load_descriptions",
+]
+
 import logging
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple
@@ -42,7 +47,7 @@ def get_core_data(session: Session, table: str) -> Dict[str, FeatStruct]:
 
     Args:
         session: Session open on a core database.
-        table: "gene" or "trancript" table from the core database.
+        table: "gene" or "transcript" table from the core database.
     """
 
     if table == "gene":

diff --git a/src/python/ensembl/io/genomio/genbank/extract_data.py b/src/python/ensembl/io/genomio/genbank/extract_data.py
@@ -20,7 +20,7 @@
 - genome metadata json
 
 Raises:
-    GFFPArseError: If the structure of the gb file can't be parsed.
+    GBParseError: If the structure of the gb file cannot be parsed.
     UnsupportedData: If some data is not as expected.
 
 Returns:
@@ -255,7 +255,7 @@ def _parse_record(self, record: SeqRecord) -> Tuple[SeqRecord, List[str], List[S
                 feats = {**feats, **rna_feats}
                 all_ids += rna_ids
 
-            # Any other case? Fail here and check if we shoud support it, or add it to unsupported list
+            # Any other case? Fail here and check if we should support it, or add it to unsupported list
             else:
                 raise GBParseError(f"No ID for allowed feature: {feat}")
 
@@ -463,9 +463,9 @@ def _get_codon_table(self, seq: SeqRecord) -> Optional[int]:
         """
         for feat in seq.features:
             if feat.type == "CDS":
-                quals = feat.qualifiers
-                if "transl_table" in quals:
-                    return quals["transl_table"][0]
+                qualifiers = feat.qualifiers
+                if "transl_table" in qualifiers:
+                    return qualifiers["transl_table"][0]
                 return None
         return None
 

diff --git a/src/python/ensembl/io/genomio/gff3/extract_annotation.py b/src/python/ensembl/io/genomio/gff3/extract_annotation.py
@@ -132,7 +132,7 @@ def add_feature(
         parent_id: Optional[str] = None,
         all_parent_ids: Optional[List[str]] = None,
     ) -> None:
-        """Add annotation for a feature of a given type. If a parent_id is provided, record the relatioship.
+        """Add annotation for a feature of a given type. If a parent_id is provided, record the relationship.
 
         Args:
             feature: The feature to create an annotation.

diff --git a/src/python/ensembl/io/genomio/manifest/check_integrity.py b/src/python/ensembl/io/genomio/manifest/check_integrity.py
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Compare the genomic data in a DNA fasta file, seq_region json, gene models GFF3 and peptide fasta
+"""Compare the genomic data in a DNA fasta file, seq_region JSON, gene models GFF3 and peptide fasta
 to ensure their contents are in sync.
 """
 
@@ -483,7 +483,7 @@ def check_integrity(self):
             # Check fasta_pep.fa integrity
             # The sequence length and id retrieved from the fasta_pep file
             # and compared to the translated CDS id and length in the gff
-            # We don't compare the peptide lengths because of seqedits
+            # We do not compare the peptide lengths because of sequence edits
             if pep:
                 tr_errors = self.check_lengths(
                     pep, gff_translations, "Fasta translations vs gff", special_diff=True
@@ -527,7 +527,7 @@ def check_integrity(self):
                     )
                 )
 
-            # Check the seq.json intregrity
+            # Check the seq.json integrity
             # Compare the length and id retrieved from seq.json to the gff
             if seq_regions:
                 self.check_seq_region_lengths(
@@ -627,7 +627,7 @@ def check_lengths(self, list1, list2, name, allowed_len_diff=None, special_diff=
             Error if there is a difference in length or ids between the lists.
         """
 
-        # check list diffferences, checks if abs(values diff) < allowed_len_diff
+        # check list differences, checks if abs(values diff) < allowed_len_diff
 
         set1 = frozenset(list1)
         set2 = frozenset(list2)

diff --git a/src/python/ensembl/io/genomio/manifest/compute_stats.py b/src/python/ensembl/io/genomio/manifest/compute_stats.py
@@ -14,6 +14,12 @@
 # limitations under the License.
 """Compute stats from the current genome files associated with the manifest."""
 
+__all__ = [
+    "BiotypeCounter",
+    "manifest_stats",
+    "StatsError",
+]
+
 import json
 from os import PathLike
 from pathlib import Path
@@ -374,10 +380,10 @@ def compare_ncbi_counts(self, biotypes: Dict[str, BiotypeCounter], ncbi: Dict) -
         for count_map in maps:
             ncbi_name, prep_name = count_map
             ncbi_count = ncbi.get(ncbi_name, 0)
-            preped: Optional[BiotypeCounter] = biotypes.get(prep_name)
+            prepped: Optional[BiotypeCounter] = biotypes.get(prep_name)
             prep_count = 0
-            if preped is not None:
-                prep_count = preped.count
+            if prepped is not None:
+                prep_count = prepped.count
 
             if prep_count != ncbi_count:
                 diff = prep_count - ncbi_count

diff --git a/src/python/ensembl/io/genomio/seq_region/__init__.py b/src/python/ensembl/io/genomio/seq_region/__init__.py
@@ -16,3 +16,4 @@
 
 from .dump import *
 from .prepare import *
+from .rename import *
diff --git a/src/python/ensembl/io/genomio/seq_region/dump.py b/src/python/ensembl/io/genomio/seq_region/dump.py
@@ -21,7 +21,6 @@
     "get_seq_regions",
     "add_attribs",
     "get_synonyms",
-    "get_attribs",
     "get_karyotype",
 ]
 
@@ -201,7 +200,7 @@ def get_synonyms(seq_region: SeqRegion, external_db_map: dict) -> List:
     return syns
 
 
-def get_attribs(seq_region: SeqRegion) -> List:
+def _get_attribs(seq_region: SeqRegion) -> List:
     """Given a seq_region, extract the attribs as value-source items.
 
     Args:
@@ -229,7 +228,7 @@ def get_attribs_dict(seq_region: SeqRegion) -> Dict[str, Any]:
         Pairs of source and value for each attribute.
     """
 
-    attribs = get_attribs(seq_region)
+    attribs = _get_attribs(seq_region)
     attrib_dict = {attrib["source"]: attrib["value"] for attrib in attribs}
     return attrib_dict
 

diff --git a/src/python/ensembl/io/genomio/seq_region/rename.py b/src/python/ensembl/io/genomio/seq_region/rename.py
@@ -14,6 +14,15 @@
 # limitations under the License.
 """Rename seq_region BRC names in a given core database."""
 
+__all__ = [
+    "Operation",
+    "SeqRegionReplacement",
+    "get_rename_map",
+    "get_seq_regions_to_replace",
+    "rename_seq_regions",
+    "update_seq_region_name",
+]
+
 from dataclasses import dataclass
 import logging
 from enum import Enum, auto
@@ -108,7 +117,7 @@ def get_seq_regions_to_replace(
                 continue
             seqr.seq_region_id = db_seqr.seq_region_id
 
-            attribs = get_attribs(db_seqr)
+            attribs = _get_attribs(db_seqr)
             db_brc_name = attribs.get("BRC4_seq_region_name", "")
             seqr.old_brc_name = db_brc_name
             if not db_brc_name:
@@ -128,7 +137,7 @@ def get_seq_regions_to_replace(
     return seq_regions
 
 
-def get_attribs(seq_region: SeqRegion) -> Dict[str, str]:
+def _get_attribs(seq_region: SeqRegion) -> Dict[str, str]:
     """Given a seq_region, extract the attribs as value-source items.
 
     Args:

diff --git a/src/python/tests/assembly/test_download.py b/src/python/tests/assembly/test_download.py
@@ -131,7 +131,7 @@ def test_checksums(
         pytest.param("wrong_md5_checksums.txt", None, False, id="Incorrect md5 checksum"),
         pytest.param(None, None, True, id="No md5file specified, resort to default"),
         pytest.param(None, Path("*"), False, id="Incompatible os path '*'"),
-        pytest.param("missingfile_md5.txt", None, False, id="md5 checksum with ref of missing file"),
+        pytest.param("missing_file_md5.txt", None, False, id="md5 checksum with ref of missing file"),
     ],
 )
 def test_md5_files(data_dir: Path, md5_file: str, md5_path: Optional[Path], checksum_bool: bool) -> None:
@@ -404,7 +404,7 @@ def test_get_files_selection(
 @patch("ensembl.io.genomio.assembly.download.md5_files")
 def test_retrieve_assembly_data(
     mock_retrieve: Mock,
-    mock_download_singlefile: Mock,
+    mock_download_single_file: Mock,
     mock_download_files: Mock,
     mock_file_select: Mock,
     mock_ftp: Mock,
@@ -441,5 +441,5 @@ def side_eff_conn(url: str):
     with exception:
         retrieve_assembly_data(accession, download_dir, 2)
         assert mock_download_files.download_files.called_once()
-        assert mock_download_singlefile._download_file.called_once()  # pylint: disable=protected-access
+        assert mock_download_single_file._download_file.called_once()  # pylint: disable=protected-access
         assert mock_file_select.get_files_selection.called_with(files_downloaded)
diff --git a/src/python/tests/database/test_dbconnection_lite.py b/src/python/tests/database/test_dbconnection_lite.py
@@ -51,7 +51,6 @@ def fixture_meta_test_db(db_factory) -> UnitTestDB:
 def test_get_metadata(meta_test_db: UnitTestDB) -> None:
     """Tests the method get_metadata()"""
 
-
     # Check the new connection lite
     dblite = DBConnectionLite(meta_test_db.dbc.url)
     assert dblite.get_metadata() == _METADATA_CONTENT

diff --git a/src/python/tests/genbank/test_extract_data.py b/src/python/tests/genbank/test_extract_data.py
@@ -36,7 +36,7 @@
 
 
 class TestWriteFormattedFiles:
-    """Test if all the expected output files are generated and formated correctly"""
+    """Test if all the expected output files are generated and formatted correctly"""
 
     prod_name = "TEST_prod"
     gb_file = "input_file.gb"
@@ -238,7 +238,7 @@ def test_write_pep_fasta(
         tmp_path: Path,
         formatted_files_generator: FormattedFilesGenerator,
     ) -> None:
-        """Test if peptides FATA file is generated when peptides are identified"""
+        """Test if peptides FASTA file is generated when peptides are identified"""
         record = SeqRecord(Seq("MFLRTQARFFHATTKKM"), id="cds-record")
         CDS_feature = SeqFeature(
             FeatureLocation(10, 20), type="CDS", qualifiers={"gene": ["GlyrA"], "transl_table": "2"}

diff --git a/src/python/tests/gff3/test_extract_annotation.py b/src/python/tests/gff3/test_extract_annotation.py
@@ -83,7 +83,7 @@ def test_product_is_informative(description: str, feature_id: Optional[List[str]
 )
 @pytest.mark.dependency(name="add_feature")
 def test_add_feature(seq_feat_type: str, feat_type: str, expected: ContextManager) -> None:
-    """Tests the `FunctionaAnnotation.add_feature()` method with only one feature.
+    """Tests the `FunctionalAnnotations.add_feature()` method with only one feature.
 
     Args:
         seq_feat_type: Type for the sequence feature to add.
@@ -106,7 +106,7 @@ def test_add_feature(seq_feat_type: str, feat_type: str, expected: ContextManage
     ],
 )
 def test_add_feature_name(feat_id: str, feat_name: str, expected_synonyms: List[str]) -> None:
-    """Tests the `FunctionaAnnotations.add_feature()` method with a feature name."""
+    """Tests the `FunctionalAnnotations.add_feature()` method with a feature name."""
     annot = FunctionalAnnotations()
 
     seq_feat_type = "gene"
@@ -128,7 +128,7 @@ def test_add_feature_name(feat_id: str, feat_name: str, expected_synonyms: List[
 )
 @pytest.mark.dependency(name="add_parent_link", depends=["add_feature"])
 def test_add_parent_link(parent_type: str, parent_id: str, child_id: str, expected: ContextManager) -> None:
-    """Tests the `FunctionaAnnotation.add_parent_link()` method.
+    """Tests the `FunctionalAnnotations.add_parent_link()` method.
 
     Add a parent feature, and then add a parent link.
 
@@ -164,7 +164,7 @@ def test_get_parent(
     out_child_id: str,
     expected: ContextManager,
 ) -> None:
-    """Tests the `FunctionaAnnotation.get_parent()` method.
+    """Tests the `FunctionalAnnotations.get_parent()` method.
 
     Args:
         in_parent_type: Type for the parent sequence feature.
@@ -202,7 +202,7 @@ def test_get_parent(
 def test_add_feature_fail(
     child_type: str, child_id: str, out_parent_id: Optional[str], expected: ContextManager
 ) -> None:
-    """Tests the `FunctionaAnnotation.add_feature()` method failures.
+    """Tests the `FunctionalAnnotations.add_feature()` method failures.
 
     Test the addition of a child feature after a parent has already been added.
 
@@ -270,7 +270,7 @@ def test_add_feature_fail(
 def test_get_xrefs(
     in_id: str, in_xrefs: Optional[List[str]], provider_name: str, expected_xrefs: List[Dict[str, str]]
 ) -> None:
-    """Tests the `FunctionaAnnotation.get_xrefs()` method."""
+    """Tests the `FunctionalAnnotations.get_xrefs()` method."""
     annot = FunctionalAnnotations(provider_name=provider_name)
     one_gene = GFFSeqFeature(type="gene", id=in_id)
     if in_xrefs is not None:
@@ -291,7 +291,7 @@ def test_get_xrefs(
 )
 @pytest.mark.dependency(name="get_features", depends=["add_feature_fail"])
 def test_get_features(feat_type: str, expected_number: int, expected: ContextManager) -> None:
-    """Tests the `FunctionaAnnotation.get_features()` method.
+    """Tests the `FunctionalAnnotations.get_features()` method.
 
     Load 2 features, then test the fetching of those features.
 
@@ -325,7 +325,7 @@ def test_get_features(feat_type: str, expected_number: int, expected: ContextMan
             None,
             "Foobar",
             "Foobar, transcript variant X1",
-            id="transcr with variant",
+            id="transcript with variant",
         ),
         param(None, "Foobar", "Lorem", "Foobar", "Foobar", id="Transfer from transc, transl also set"),
         param("Hypothetical gene", "Predicted function", "Foobar", "Foobar", "Foobar", id="Non informative"),
@@ -340,16 +340,16 @@ def test_transfer_descriptions(
     out_gene_desc: Optional[str],
     out_transc_desc: Optional[str],
 ) -> None:
-    """Tests the `FunctionaAnnotation.transfer_descriptions()` method.
+    """Tests the `FunctionalAnnotations.transfer_descriptions()` method.
 
     Load 3 features (gene, transcript, translation) with or without a description for each one.
 
     Args:
         gene_desc: Description for the gene.
         transc_desc: Description for the transcript.
         transl_desc: Description for the translation.
-        out_gene_desc: Excpected description for the gene after transfer.
-        out_transc_desc: Excpected description for the transcript after transfer.
+        out_gene_desc: Expected description for the gene after transfer.
+        out_transc_desc: Expected description for the transcript after transfer.
 
     """
     annot = FunctionalAnnotations()
@@ -370,9 +370,9 @@ def test_transfer_descriptions(
 
     annot.transfer_descriptions()
     genes = annot.get_features("gene")
-    transcs = annot.get_features("transcript")
+    transcripts = annot.get_features("transcript")
     assert genes[gene_name].get("description") == out_gene_desc
-    assert transcs[transcript_name].get("description") == out_transc_desc
+    assert transcripts[transcript_name].get("description") == out_transc_desc
 
 
 @pytest.mark.dependency(depends=["add_feature"])