Skip to content

Commit

Permalink
Merge pull request #405 from Ensembl/jalvarez/code_review
Browse files Browse the repository at this point in the history
Minor codebase review looking for typos and other missing elements
  • Loading branch information
JAlvarezJarreta authored Jul 18, 2024
2 parents f905263 + 267b1cd commit 3549b83
Show file tree
Hide file tree
Showing 16 changed files with 66 additions and 49 deletions.
2 changes: 0 additions & 2 deletions cicd/gitlab/parts/python.gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -83,13 +83,11 @@ python:mypy:src:

python:mypy:tests:
extends: .python:mypy
allow_failure: true
script:
- $MYPY_CMD src/python/tests

python:black:
extends: .python:test
allow_failure: true
script:
- black --config pyproject.toml --check .

Expand Down
8 changes: 4 additions & 4 deletions docs/BRC4_genome_compare_conf.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@

## **Overview**
-----
This pipeline is used for a sequence-level comparison of an assembly with INSDC and provides a detailed report on the discrepencies. The following steps are performed:
This pipeline is used for a sequence-level comparison of an assembly with INSDC and provides a detailed report on the discrepancies. The following steps are performed:

1. Download the files for the corresponding assembly from INSDC
2. Retreive metadata seq.json and fasta files from the database
2. Retrieve metadata seq.json and fasta files from the database
3. Compare the fasta files
- compare the sequence ids
- compare the sequence
Expand Down Expand Up @@ -35,8 +35,8 @@ init_pipeline.pl Bio::EnsEMBL::Pipeline::PipeConfig::BRC4_genome_compare_conf \
| `--pipeline_name` | str | brc4_genome_compare | optional| name of the hive pipeline |
| `--hive_force_init` | int | | yes | drop and create the hive pipeline from scratch |
| `--output_dir` | dir | ./output | optional| directory to store the result |
| `--tmp_dir` | dir | ./tmp | optional| temp directory for dowloaded files |
| `--species` | str | | yes| species (one or muliple) to process (production name) |
| `--tmp_dir` | dir | ./tmp | optional| temp directory for downloaded files |
| `--species` | str | | yes| species (one or multiple) to process (production name) |
| `--run_all` | int | 0 | yes| process all the species in the registry |
| `--email` | str | $USER.ebi.ac.uk | optional| a summary is emailed when the pipeline is complete |

Expand Down
4 changes: 2 additions & 2 deletions docs/nextflow.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
## Installation
If you don't have an installed environment or you don't have nextflow itself, here's one of the ways to install it.

Define [`NXF_HOME` env variable](https://www.nextflow.io/docs/latest/config.html#environment-variables) to use a nextlow home location instead of the default one (`$HOME/.nextflow`).
Define [`NXF_HOME` env variable](https://www.nextflow.io/docs/latest/config.html#environment-variables) to use a nextflow home location instead of the default one (`$HOME/.nextflow`).
Everything else is unchanged from the default Nextflow installation instructions on [https://www.nextflow.io/index.html#GetStarted](https://www.nextflow.io/index.html#GetStarted).

```
Expand Down Expand Up @@ -76,7 +76,7 @@ Instead pipeline dies with
Caused by: Cannot load from object array because "this.keys" is null
```
and when printing this object (`dbs` in this case, with `println "db: ${db}"`),
we see it dict surronded by the curly brackets like this
we see it dict surrounded by the curly brackets like this
```
{..., "db_name":"some_db_name", ...}
```
Expand Down
7 changes: 6 additions & 1 deletion src/python/ensembl/io/genomio/annotation/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,11 @@
# limitations under the License.
"""Loads functional annotation from a file into a core database."""

__all__ = [
"get_core_data",
"load_descriptions",
]

import logging
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
Expand Down Expand Up @@ -42,7 +47,7 @@ def get_core_data(session: Session, table: str) -> Dict[str, FeatStruct]:
Args:
session: Session open on a core database.
table: "gene" or "trancript" table from the core database.
table: "gene" or "transcript" table from the core database.
"""

if table == "gene":
Expand Down
10 changes: 5 additions & 5 deletions src/python/ensembl/io/genomio/genbank/extract_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
- genome metadata json
Raises:
GFFPArseError: If the structure of the gb file can't be parsed.
GBParseError: If the structure of the gb file cannot be parsed.
UnsupportedData: If some data is not as expected.
Returns:
Expand Down Expand Up @@ -255,7 +255,7 @@ def _parse_record(self, record: SeqRecord) -> Tuple[SeqRecord, List[str], List[S
feats = {**feats, **rna_feats}
all_ids += rna_ids

# Any other case? Fail here and check if we shoud support it, or add it to unsupported list
# Any other case? Fail here and check if we should support it, or add it to unsupported list
else:
raise GBParseError(f"No ID for allowed feature: {feat}")

Expand Down Expand Up @@ -463,9 +463,9 @@ def _get_codon_table(self, seq: SeqRecord) -> Optional[int]:
"""
for feat in seq.features:
if feat.type == "CDS":
quals = feat.qualifiers
if "transl_table" in quals:
return quals["transl_table"][0]
qualifiers = feat.qualifiers
if "transl_table" in qualifiers:
return qualifiers["transl_table"][0]
return None
return None

Expand Down
2 changes: 1 addition & 1 deletion src/python/ensembl/io/genomio/gff3/extract_annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ def add_feature(
parent_id: Optional[str] = None,
all_parent_ids: Optional[List[str]] = None,
) -> None:
"""Add annotation for a feature of a given type. If a parent_id is provided, record the relatioship.
"""Add annotation for a feature of a given type. If a parent_id is provided, record the relationship.
Args:
feature: The feature to create an annotation.
Expand Down
8 changes: 4 additions & 4 deletions src/python/ensembl/io/genomio/manifest/check_integrity.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Compare the genomic data in a DNA fasta file, seq_region json, gene models GFF3 and peptide fasta
"""Compare the genomic data in a DNA fasta file, seq_region JSON, gene models GFF3 and peptide fasta
to ensure their contents are in sync.
"""

Expand Down Expand Up @@ -483,7 +483,7 @@ def check_integrity(self):
# Check fasta_pep.fa integrity
# The sequence length and id retrieved from the fasta_pep file
# and compared to the translated CDS id and length in the gff
# We don't compare the peptide lengths because of seqedits
# We do not compare the peptide lengths because of sequence edits
if pep:
tr_errors = self.check_lengths(
pep, gff_translations, "Fasta translations vs gff", special_diff=True
Expand Down Expand Up @@ -527,7 +527,7 @@ def check_integrity(self):
)
)

# Check the seq.json intregrity
# Check the seq.json integrity
# Compare the length and id retrieved from seq.json to the gff
if seq_regions:
self.check_seq_region_lengths(
Expand Down Expand Up @@ -627,7 +627,7 @@ def check_lengths(self, list1, list2, name, allowed_len_diff=None, special_diff=
Error if there is a difference in length or ids between the lists.
"""

# check list diffferences, checks if abs(values diff) < allowed_len_diff
# check list differences, checks if abs(values diff) < allowed_len_diff

set1 = frozenset(list1)
set2 = frozenset(list2)
Expand Down
12 changes: 9 additions & 3 deletions src/python/ensembl/io/genomio/manifest/compute_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,12 @@
# limitations under the License.
"""Compute stats from the current genome files associated with the manifest."""

__all__ = [
"BiotypeCounter",
"manifest_stats",
"StatsError",
]

import json
from os import PathLike
from pathlib import Path
Expand Down Expand Up @@ -374,10 +380,10 @@ def compare_ncbi_counts(self, biotypes: Dict[str, BiotypeCounter], ncbi: Dict) -
for count_map in maps:
ncbi_name, prep_name = count_map
ncbi_count = ncbi.get(ncbi_name, 0)
preped: Optional[BiotypeCounter] = biotypes.get(prep_name)
prepped: Optional[BiotypeCounter] = biotypes.get(prep_name)
prep_count = 0
if preped is not None:
prep_count = preped.count
if prepped is not None:
prep_count = prepped.count

if prep_count != ncbi_count:
diff = prep_count - ncbi_count
Expand Down
1 change: 1 addition & 0 deletions src/python/ensembl/io/genomio/seq_region/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,4 @@

from .dump import *
from .prepare import *
from .rename import *
5 changes: 2 additions & 3 deletions src/python/ensembl/io/genomio/seq_region/dump.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
"get_seq_regions",
"add_attribs",
"get_synonyms",
"get_attribs",
"get_karyotype",
]

Expand Down Expand Up @@ -201,7 +200,7 @@ def get_synonyms(seq_region: SeqRegion, external_db_map: dict) -> List:
return syns


def get_attribs(seq_region: SeqRegion) -> List:
def _get_attribs(seq_region: SeqRegion) -> List:
"""Given a seq_region, extract the attribs as value-source items.
Args:
Expand Down Expand Up @@ -229,7 +228,7 @@ def get_attribs_dict(seq_region: SeqRegion) -> Dict[str, Any]:
Pairs of source and value for each attribute.
"""

attribs = get_attribs(seq_region)
attribs = _get_attribs(seq_region)
attrib_dict = {attrib["source"]: attrib["value"] for attrib in attribs}
return attrib_dict

Expand Down
13 changes: 11 additions & 2 deletions src/python/ensembl/io/genomio/seq_region/rename.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,15 @@
# limitations under the License.
"""Rename seq_region BRC names in a given core database."""

__all__ = [
"Operation",
"SeqRegionReplacement",
"get_rename_map",
"get_seq_regions_to_replace",
"rename_seq_regions",
"update_seq_region_name",
]

from dataclasses import dataclass
import logging
from enum import Enum, auto
Expand Down Expand Up @@ -108,7 +117,7 @@ def get_seq_regions_to_replace(
continue
seqr.seq_region_id = db_seqr.seq_region_id

attribs = get_attribs(db_seqr)
attribs = _get_attribs(db_seqr)
db_brc_name = attribs.get("BRC4_seq_region_name", "")
seqr.old_brc_name = db_brc_name
if not db_brc_name:
Expand All @@ -128,7 +137,7 @@ def get_seq_regions_to_replace(
return seq_regions


def get_attribs(seq_region: SeqRegion) -> Dict[str, str]:
def _get_attribs(seq_region: SeqRegion) -> Dict[str, str]:
"""Given a seq_region, extract the attribs as value-source items.
Args:
Expand Down
6 changes: 3 additions & 3 deletions src/python/tests/assembly/test_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ def test_checksums(
pytest.param("wrong_md5_checksums.txt", None, False, id="Incorrect md5 checksum"),
pytest.param(None, None, True, id="No md5file specified, resort to default"),
pytest.param(None, Path("*"), False, id="Incompatible os path '*'"),
pytest.param("missingfile_md5.txt", None, False, id="md5 checksum with ref of missing file"),
pytest.param("missing_file_md5.txt", None, False, id="md5 checksum with ref of missing file"),
],
)
def test_md5_files(data_dir: Path, md5_file: str, md5_path: Optional[Path], checksum_bool: bool) -> None:
Expand Down Expand Up @@ -404,7 +404,7 @@ def test_get_files_selection(
@patch("ensembl.io.genomio.assembly.download.md5_files")
def test_retrieve_assembly_data(
mock_retrieve: Mock,
mock_download_singlefile: Mock,
mock_download_single_file: Mock,
mock_download_files: Mock,
mock_file_select: Mock,
mock_ftp: Mock,
Expand Down Expand Up @@ -441,5 +441,5 @@ def side_eff_conn(url: str):
with exception:
retrieve_assembly_data(accession, download_dir, 2)
assert mock_download_files.download_files.called_once()
assert mock_download_singlefile._download_file.called_once() # pylint: disable=protected-access
assert mock_download_single_file._download_file.called_once() # pylint: disable=protected-access
assert mock_file_select.get_files_selection.called_with(files_downloaded)
1 change: 0 additions & 1 deletion src/python/tests/database/test_dbconnection_lite.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,6 @@ def fixture_meta_test_db(db_factory) -> UnitTestDB:
def test_get_metadata(meta_test_db: UnitTestDB) -> None:
"""Tests the method get_metadata()"""


# Check the new connection lite
dblite = DBConnectionLite(meta_test_db.dbc.url)
assert dblite.get_metadata() == _METADATA_CONTENT
Expand Down
4 changes: 2 additions & 2 deletions src/python/tests/genbank/test_extract_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@


class TestWriteFormattedFiles:
"""Test if all the expected output files are generated and formated correctly"""
"""Test if all the expected output files are generated and formatted correctly"""

prod_name = "TEST_prod"
gb_file = "input_file.gb"
Expand Down Expand Up @@ -238,7 +238,7 @@ def test_write_pep_fasta(
tmp_path: Path,
formatted_files_generator: FormattedFilesGenerator,
) -> None:
"""Test if peptides FATA file is generated when peptides are identified"""
"""Test if peptides FASTA file is generated when peptides are identified"""
record = SeqRecord(Seq("MFLRTQARFFHATTKKM"), id="cds-record")
CDS_feature = SeqFeature(
FeatureLocation(10, 20), type="CDS", qualifiers={"gene": ["GlyrA"], "transl_table": "2"}
Expand Down
26 changes: 13 additions & 13 deletions src/python/tests/gff3/test_extract_annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ def test_product_is_informative(description: str, feature_id: Optional[List[str]
)
@pytest.mark.dependency(name="add_feature")
def test_add_feature(seq_feat_type: str, feat_type: str, expected: ContextManager) -> None:
"""Tests the `FunctionaAnnotation.add_feature()` method with only one feature.
"""Tests the `FunctionalAnnotation.add_feature()` method with only one feature.
Args:
seq_feat_type: Type for the sequence feature to add.
Expand All @@ -106,7 +106,7 @@ def test_add_feature(seq_feat_type: str, feat_type: str, expected: ContextManage
],
)
def test_add_feature_name(feat_id: str, feat_name: str, expected_synonyms: List[str]) -> None:
"""Tests the `FunctionaAnnotations.add_feature()` method with a feature name."""
"""Tests the `FunctionalAnnotations.add_feature()` method with a feature name."""
annot = FunctionalAnnotations()

seq_feat_type = "gene"
Expand All @@ -128,7 +128,7 @@ def test_add_feature_name(feat_id: str, feat_name: str, expected_synonyms: List[
)
@pytest.mark.dependency(name="add_parent_link", depends=["add_feature"])
def test_add_parent_link(parent_type: str, parent_id: str, child_id: str, expected: ContextManager) -> None:
"""Tests the `FunctionaAnnotation.add_parent_link()` method.
"""Tests the `FunctionalAnnotation.add_parent_link()` method.
Add a parent feature, and then add a parent link.
Expand Down Expand Up @@ -164,7 +164,7 @@ def test_get_parent(
out_child_id: str,
expected: ContextManager,
) -> None:
"""Tests the `FunctionaAnnotation.get_parent()` method.
"""Tests the `FunctionalAnnotation.get_parent()` method.
Args:
in_parent_type: Type for the parent sequence feature.
Expand Down Expand Up @@ -202,7 +202,7 @@ def test_get_parent(
def test_add_feature_fail(
child_type: str, child_id: str, out_parent_id: Optional[str], expected: ContextManager
) -> None:
"""Tests the `FunctionaAnnotation.add_feature()` method failures.
"""Tests the `FunctionalAnnotation.add_feature()` method failures.
Test the addition of a child feature after a parent has already been added.
Expand Down Expand Up @@ -270,7 +270,7 @@ def test_add_feature_fail(
def test_get_xrefs(
in_id: str, in_xrefs: Optional[List[str]], provider_name: str, expected_xrefs: List[Dict[str, str]]
) -> None:
"""Tests the `FunctionaAnnotation.get_xrefs()` method."""
"""Tests the `FunctionalAnnotation.get_xrefs()` method."""
annot = FunctionalAnnotations(provider_name=provider_name)
one_gene = GFFSeqFeature(type="gene", id=in_id)
if in_xrefs is not None:
Expand All @@ -291,7 +291,7 @@ def test_get_xrefs(
)
@pytest.mark.dependency(name="get_features", depends=["add_feature_fail"])
def test_get_features(feat_type: str, expected_number: int, expected: ContextManager) -> None:
"""Tests the `FunctionaAnnotation.get_features()` method.
"""Tests the `FunctionalAnnotation.get_features()` method.
Load 2 features, then test the fetching of those features.
Expand Down Expand Up @@ -325,7 +325,7 @@ def test_get_features(feat_type: str, expected_number: int, expected: ContextMan
None,
"Foobar",
"Foobar, transcript variant X1",
id="transcr with variant",
id="transcript with variant",
),
param(None, "Foobar", "Lorem", "Foobar", "Foobar", id="Transfer from transc, transl also set"),
param("Hypothetical gene", "Predicted function", "Foobar", "Foobar", "Foobar", id="Non informative"),
Expand All @@ -340,16 +340,16 @@ def test_transfer_descriptions(
out_gene_desc: Optional[str],
out_transc_desc: Optional[str],
) -> None:
"""Tests the `FunctionaAnnotation.transfer_descriptions()` method.
"""Tests the `FunctionalAnnotation.transfer_descriptions()` method.
Load 3 features (gene, transcript, translation) with or without a description for each one.
Args:
gene_desc: Description for the gene.
transc_desc: Description for the transcript.
transl_desc: Description for the translation.
out_gene_desc: Excpected description for the gene after transfer.
out_transc_desc: Excpected description for the transcript after transfer.
out_gene_desc: Expected description for the gene after transfer.
out_transc_desc: Expected description for the transcript after transfer.
"""
annot = FunctionalAnnotations()
Expand All @@ -370,9 +370,9 @@ def test_transfer_descriptions(

annot.transfer_descriptions()
genes = annot.get_features("gene")
transcs = annot.get_features("transcript")
transcripts = annot.get_features("transcript")
assert genes[gene_name].get("description") == out_gene_desc
assert transcs[transcript_name].get("description") == out_transc_desc
assert transcripts[transcript_name].get("description") == out_transc_desc


@pytest.mark.dependency(depends=["add_feature"])
Expand Down
Loading

0 comments on commit 3549b83

Please sign in to comment.