diff --git a/tests/conftest.py b/tests/conftest.py index 50a2da80..d2ab6472 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -148,7 +148,7 @@ def vhl_reference_agree(): @pytest.fixture(scope="session") def protein_insertion(): - """Create test fixture for NP protein insertion.""" + """Create test fixture for NP protein insertion (CA645561585).""" params = { "id": "ga4gh:VA.AOCCh_BU5wKkdgoDNqkORF_x4GQwWh1T", "location": { diff --git a/tests/test_gnomad_vcf_to_protein.py b/tests/test_gnomad_vcf_to_protein.py new file mode 100644 index 00000000..559e47cc --- /dev/null +++ b/tests/test_gnomad_vcf_to_protein.py @@ -0,0 +1,413 @@ +"""Module for testing gnomad_vcf_to_protein works correctly""" +import pytest +from ga4gh.vrs import models + +from tests.conftest import assertion_checks + + +@pytest.fixture(scope="module") +def test_handler(test_query_handler): + """Create test fixture for gnomad vcf to protein handler""" + return test_query_handler.gnomad_vcf_to_protein_handler + + +@pytest.fixture(scope="module") +def mmel1_l30m(): + """Create test fixture for MMEL1 L30M""" + params = { + "id": "ga4gh:VA.OqqETz467CITELOZsYDukkab7JaOWiZf", + "location": { + "id": "ga4gh:SL.Q7kfcqUWpIyEOgxcgPK1sRfgWPDv7zKA", + "end": 30, + "start": 29, + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.iQ8F_pnsiQOLohiV2qh3OWRZiftUt8jZ", + }, + "type": "SequenceLocation", + }, + "state": {"sequence": "M", "type": "LiteralSequenceExpression"}, + "type": "Allele", + } + return models.Allele(**params) + + +@pytest.fixture(scope="module") +def cdk11a_e314del(): + """Create test fixture for CDK11A Glu314del""" + params = { + "id": "ga4gh:VA._CVnGazN6KosqrFnDx7kny-rb6yAZWtB", + "location": { + "id": "ga4gh:SL.VqI6HuIFmm4XP3ocOTaobGxwqg4m6Ooi", + "end": 321, + "start": 308, + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.N728VSRRMHJ1SrhJgKqJOCaa3l5Z4sqm", + }, + "type": "SequenceLocation", + }, + "state": { + "length": 12, + "repeatSubunitLength": 1, + "sequence": "EEEEEEEEEEEE", + "type": "ReferenceLengthExpression", + }, + "type": "Allele", + } + return models.Allele(**params) + + +@pytest.fixture(scope="module") +def protein_insertion2(): + """Create test fixture for LRP8 p.Gln25_Leu26insArg (CA860540)""" + params = { + "id": "ga4gh:VA.5KWhsli69ac5zyoGf40Owu4CVNKy27So", + "location": { + "id": "ga4gh:SL.I4c4NL0g3vBajHe44faZFQtrcqrbA14d", + "end": 25, + "start": 25, + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.qgIh8--4F6IpxRwX_lVtD2BhepH5B5Ef", + }, + "type": "SequenceLocation", + }, + "state": {"sequence": "R", "type": "LiteralSequenceExpression"}, + "type": "Allele", + } + return models.Allele(**params) + + +@pytest.fixture(scope="module") +def atad3a_loc(): + """Create test fixture for ATAD3A location""" + return { + "id": "ga4gh:SL.xiP3uciIfJy_f44wNKCBvtsb35BC330Q", + "end": 7, + "start": 6, + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.MHPOY_7fv8V9SktyvaTxulVFSK6XCxM8", + }, + "type": "SequenceLocation", + } + + +@pytest.fixture(scope="module") +def atad3a_i7v(atad3a_loc): + """Create test fixture for ATAD3A Ile7Val""" + params = { + "id": "ga4gh:VA.i_L_bjPfI4XLMIKmVklV6eDLKEl1f7PD", + "location": atad3a_loc, + "state": {"sequence": "V", "type": "LiteralSequenceExpression"}, + "type": "Allele", + } + return models.Allele(**params) + + +@pytest.fixture(scope="module") +def atad3a_i7t(atad3a_loc): + """Create test fixture for ATAD3A Ile7Thr""" + params = { + "id": "ga4gh:VA.C8QO-YAfG66yj7cEwjEhkEfSd-oCSKfc", + "location": atad3a_loc, + "state": {"sequence": "T", "type": "LiteralSequenceExpression"}, + "type": "Allele", + } + return models.Allele(**params) + + +@pytest.fixture(scope="module") +def atad3a_i7m(atad3a_loc): + """Create test fixture for ATAD3A Ile7Met""" + params = { + "id": "ga4gh:VA.Fhmv3GK3bcIJRXOkigS9QNMzAWGW3WGa", + "location": atad3a_loc, + "state": {"sequence": "M", "type": "LiteralSequenceExpression"}, + "type": "Allele", + } + return models.Allele(**params) + + +@pytest.fixture(scope="session") +def braf_v600l(braf_600loc): + """Create test fixture for BRAF Val600Leu.""" + params = { + "id": "ga4gh:VA.c6f1MPfquVRPZO46wVzCaGaU8QnXoHNN", + "location": braf_600loc, + "state": {"sequence": "L", "type": "LiteralSequenceExpression"}, + "type": "Allele", + } + return models.Allele(**params) + + +@pytest.fixture(scope="session") +def braf_600_reference_agree(braf_600loc): + """Create test fixture for BRAF Val600=.""" + params = { + "id": "ga4gh:VA.wS6kJNbPkRJDIWg8F4CjOMQ5mcJzD_X4", + "location": braf_600loc, + "state": {"sequence": "V", "type": "LiteralSequenceExpression"}, + "type": "Allele", + } + return models.Allele(**params) + + +@pytest.fixture(scope="module") +def kras_g12d(): + """Fixture for KRAS G12D""" + params = { + "id": "ga4gh:VA.CB571ja_KfZM_Hjn9zjjgV1an3tDWRcl", + "type": "Allele", + "location": { + "id": "ga4gh:SL.OndkjmujtyUEZSjjCv0C-gpwnVbRgfj8", + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.fytWhQSNGnA-86vDiQCxTSzgkk_WfQRS", + }, + "start": 11, + "end": 12, + }, + "state": {"type": "LiteralSequenceExpression", "sequence": "D"}, + } + return models.Allele(**params) + + +@pytest.fixture(scope="module") +def multi_nuc_sub_pos(): + """Create test fixture for the protein consequence of a multinucleotide substitution on the + positive strand (CA16042245) + """ + params = { + "id": "ga4gh:VA.q_fhdDFpLT38y6TPU1EMtc2StwRGRVx0", + "type": "Allele", + "location": { + "id": "ga4gh:SL.xcqWKkKU83UBmLq5Q1yrH6IyLvWkMcWk", + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.HtNf7YrmmFih3cwRwYMlylPFMAs7-l9B", + }, + "start": 242, + "end": 244, + }, + "state": {"type": "LiteralSequenceExpression", "sequence": "PS"}, + } + return models.Allele(**params) + + +@pytest.fixture(scope="module") +def multi_nuc_sub_neg(): + """Create test fixture for the protein consequence of a multinucleotide substitution on the + negative strand (CA1139661942) + """ + params = { + "id": "ga4gh:VA.6K950oAyNXfIkPTmSHzX8f8wpNUroBGK", + "type": "Allele", + "location": { + "id": "ga4gh:SL.RiO0HagK846MBCJZK7NVcDtlYPkdaXC4", + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.bg8P_l39rOUQVLwsW3Dme-946Od8-3rB", + }, + "start": 235, + "end": 236, + }, + "state": {"type": "LiteralSequenceExpression", "sequence": "G"}, + } + return models.Allele(**params) + + +@pytest.fixture(scope="module") +def delins_pos(): + """Create test fixture for the protein consequence of a delins on positive strand (CA645561524)""" + params = { + "id": "ga4gh:VA.vBQ2TCfRHiG3ud_vqE88BNZEK7Qw28kg", + "type": "Allele", + "location": { + "id": "ga4gh:SL.1btwhKRj0zZQwo-_CalR-WavTB019t-V", + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.vyo55F6mA6n2LgN4cagcdRzOuh38V4mE", + }, + "start": 746, + "end": 752, + }, + "state": {"type": "LiteralSequenceExpression", "sequence": "Q"}, + } + return models.Allele(**params) + + +@pytest.fixture(scope="module") +def delins_neg(): + """Create test fixture for the protein consequence of a delins on negative strand (ClinVar ID 1217291)""" + params = { + "id": "ga4gh:VA.iSDLORgPGz21BetTQM5grpXyB3tIfZwl", + "type": "Allele", + "location": { + "id": "ga4gh:SL.cmPqZ9vkQlmV_eWEW3MFKFeVtT_n1-yc", + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.aDZLb9cs0cYskMKDIK-AXhaevHRA86JS", + }, + "start": 239, + "end": 259, + }, + "state": {"type": "LiteralSequenceExpression", "sequence": "TLTA"}, + } + return models.Allele(**params) + + +@pytest.mark.asyncio +async def test_substitution( + test_handler, + braf_v600e, + braf_v600l, + braf_600_reference_agree, + mmel1_l30m, + atad3a_i7v, + atad3a_i7t, + atad3a_i7m, + kras_g12d, + multi_nuc_sub_pos, + multi_nuc_sub_neg, +): + """Test that substitution queries return correct response""" + # Reading Frame 1, Negative Strand + resp = await test_handler.gnomad_vcf_to_protein("7-140753337-C-A") + assertion_checks(resp, braf_v600l) + assert resp.warnings == [] + + # Reading Frame 2, Negative Strand + resp = await test_handler.gnomad_vcf_to_protein("7-140753336-A-T") + assertion_checks(resp, braf_v600e) + assert resp.gene_context + assert resp.vrs_ref_allele_seq == "V" + assert resp.warnings == [] + + # Reading Frame 3, Negative Strand + resp = await test_handler.gnomad_vcf_to_protein("7-140753335-C-A") + assertion_checks(resp, braf_600_reference_agree) + assert resp.warnings == [] + + # Reading Frame 3, Negative Strand + resp = await test_handler.gnomad_vcf_to_protein("1-2629397-G-T") + assertion_checks(resp, mmel1_l30m) + assert resp.warnings == [] + + # Reading Frame 1, Positive Strand + resp = await test_handler.gnomad_vcf_to_protein("1-1512287-A-G") + assertion_checks(resp, atad3a_i7v) + assert resp.warnings == [] + + # Reading Frame 2, Positive Strand + resp = await test_handler.gnomad_vcf_to_protein("1-1512288-T-C") + assertion_checks(resp, atad3a_i7t) + assert resp.warnings == [] + + # Reading Frame 3, Positive Strand + resp = await test_handler.gnomad_vcf_to_protein("1-1512289-T-G") + assertion_checks(resp, atad3a_i7m) + assert resp.warnings == [] + + resp = await test_handler.gnomad_vcf_to_protein("12-25245350-C-T") + assertion_checks(resp, kras_g12d) + + # multi nucleotide ref/alt on positive strand (CA16042245) + resp = await test_handler.gnomad_vcf_to_protein("2-74530927-TGC-CAT") + assertion_checks(resp, multi_nuc_sub_pos) + + # multi nucleotide ref/alt on negative strand (CA1139661942) + resp = await test_handler.gnomad_vcf_to_protein("11-47348490-TG-CA") + assertion_checks(resp, multi_nuc_sub_neg) + + +@pytest.mark.asyncio +async def test_reference_agree(test_handler, vhl_reference_agree): + """Test that reference agree queries return correct response""" + # https://www.ncbi.nlm.nih.gov/clinvar/variation/379039/?new_evidence=true + resp = await test_handler.gnomad_vcf_to_protein("3-10142030-C-T") + assertion_checks(resp, vhl_reference_agree) + assert resp.vrs_ref_allele_seq == "P" + assert resp.gene_context + assert resp.warnings == [] + + +@pytest.mark.asyncio +async def test_insertion(test_handler, protein_insertion, protein_insertion2): + """Test that insertion queries return correct response""" + # positive strand (CA645561585) + resp = await test_handler.gnomad_vcf_to_protein("7-55181319-C-CGGGTTA") + assertion_checks(resp, protein_insertion) + assert resp.vrs_ref_allele_seq is None + assert resp.gene_context + assert resp.warnings == [] + + # negative strand (CA860540) + resp = await test_handler.gnomad_vcf_to_protein("1-53327836-A-AGCC") + assertion_checks(resp, protein_insertion2) + assert resp.vrs_ref_allele_seq is None + assert resp.gene_context + assert resp.warnings == [] + + +@pytest.mark.asyncio +async def test_deletion(test_handler, protein_deletion_np_range, cdk11a_e314del): + """Test that deletion queries return correct response""" + resp = await test_handler.gnomad_vcf_to_protein("17-39723966-TTGAGGGAAAACACAT-T") + assertion_checks(resp, protein_deletion_np_range) + assert resp.vrs_ref_allele_seq == "LRENT" + assert resp.gene_context + assert resp.warnings == [] + + resp = await test_handler.gnomad_vcf_to_protein("1-1708855-TTCC-T") + assertion_checks(resp, cdk11a_e314del) + assert resp.gene_context + assert resp.warnings == [] + + +@pytest.mark.asyncio +async def test_delins(test_handler, delins_pos, delins_neg): + """Test that delins queries return correct response""" + # CA645561524, Positive Strand + resp = await test_handler.gnomad_vcf_to_protein("7-55174776-TTAAGAGAAGCAACATCT-CAA") + assertion_checks(resp, delins_pos) + assert resp.vrs_ref_allele_seq == "LREATS" + assert resp.gene_context + + # ClinVar ID 1217291, Negative Strand + resp = await test_handler.gnomad_vcf_to_protein( + "X-153870419-GCTGCCCCTGCAAGGCCACCAGGTGGCTGCTGGAGTTGGTGGGGAAGAGCAGGCGCGG-CTGTCAATGT" + ) + assertion_checks(resp, delins_neg) + assert resp.vrs_ref_allele_seq == "PRLLFPTNSSSHLVALQGQP" + assert resp.gene_context + + +@pytest.mark.asyncio +async def test_invalid(test_handler): + """Test that invalid queries return correct response""" + resp = await test_handler.gnomad_vcf_to_protein("BRAF V600E") + assert resp.variation is None + assert resp.vrs_ref_allele_seq is None + assert resp.gene_context is None + assert resp.warnings == [ + "BRAF V600E is not a gnomAD VCF-like query (`chr-pos-ref-alt`)" + ] + + resp = await test_handler.gnomad_vcf_to_protein("7-140753336-T-G") + assert resp.variation is None + assert resp.vrs_ref_allele_seq is None + assert resp.gene_context is None + assert set(resp.warnings) == {"Unable to get cDNA and protein representation"} + + resp = await test_handler.gnomad_vcf_to_protein("20-2-TC-TG") + assert resp.variation is None + assert resp.vrs_ref_allele_seq is None + assert resp.gene_context is None + assert resp.warnings == ["20-2-TC-TG is not a valid gnomad vcf query"] diff --git a/variation/gnomad_vcf_to_protein_variation.py b/variation/gnomad_vcf_to_protein_variation.py new file mode 100644 index 00000000..e71dea54 --- /dev/null +++ b/variation/gnomad_vcf_to_protein_variation.py @@ -0,0 +1,587 @@ +"""Module for translating VCF-like to protein VRS Allele representation""" +from datetime import datetime +from typing import List, Optional, Tuple + +from cool_seq_tool.handlers import SeqRepoAccess +from cool_seq_tool.mappers import ManeTranscript +from cool_seq_tool.schemas import ResidueMode, Strand +from ga4gh.core import core_models, ga4gh_identify +from ga4gh.vrs import models, normalize +from gene.query import QueryHandler as GeneQueryHandler +from gene.schemas import MatchType as GeneMatchType + +from variation.classify import Classify +from variation.schemas.classification_response_schema import Nomenclature +from variation.schemas.gnomad_vcf_to_protein_schema import GnomadVcfToProteinService +from variation.schemas.normalize_response_schema import ServiceMeta +from variation.schemas.token_response_schema import AltType +from variation.schemas.validation_response_schema import ValidationResult +from variation.tokenize import Tokenize +from variation.translate import Translate +from variation.validate import Validate +from variation.version import __version__ + + +class GnomadVcfToProteinError(Exception): + """Custom exception for Gnomad VCF To Protein""" + + +def _get_char_match_count( + min_length: int, ref: str, alt: str, trim_prefix: bool = True +) -> int: + """Get the count of matched sequential prefixes + + :param min_length: Length of the shortest sequence (using ``ref`` or ``alt``) + :param ref: Reference sequence + :param alt: Alternate sequence + :param trim_prefix: ``True`` if trimming prefixes. ``False`` if trimming suffixes + :return: The number of sequential characters that were the same in ``ref`` and + ``alt`` + """ + matched = 0 + num_seq = range(min_length) if trim_prefix else reversed(range(min_length)) + for i in num_seq: + if alt[i] == ref[i]: + matched += 1 + else: + break + return matched + + +def _trim_prefix_or_suffix( + aa_ref: str, aa_alt: str, aa_start_pos: int = 0, trim_prefix: bool = True +) -> Tuple[str, str, int]: + """Trim prefix or suffix matches + + :param aa_ref: Amino acid reference sequence + :param aa_alt: Amino acid alternate sequence + :param aa_start_pos: Amino acid start position. Only required when ``trim_prefix`` + is ``True`` + :param trim_prefix: ``True`` if trimming prefixes. ``False`` if trimming suffixes + :return: Tuple containing trimmed ``aa_ref``, trimmed `aa_alt``, and + ``aa_start_pos`` after trimming + """ + if (aa_ref and aa_alt) and (aa_ref != aa_alt): + aa_match = 0 + len_aa_ref = len(aa_ref) + len_aa_alt = len(aa_alt) + + # Trim prefixes + range_len = len_aa_ref if len_aa_ref < len_aa_alt else len_aa_alt + aa_match = _get_char_match_count( + range_len, aa_ref, aa_alt, trim_prefix=trim_prefix + ) + if aa_match: + aa_start_pos += aa_match + aa_alt = aa_alt[aa_match:] if trim_prefix else aa_alt[:-aa_match] + aa_ref = aa_ref[aa_match:] if trim_prefix else aa_ref[:-aa_ref] + + return aa_ref, aa_alt, aa_start_pos + + +RNA_CODON_TO_1AA = { + "AUA": "I", + "AUC": "I", + "AUU": "I", + "AUG": "M", + "ACA": "T", + "ACC": "T", + "ACG": "T", + "ACU": "T", + "AAC": "N", + "AAU": "N", + "AAA": "K", + "AAG": "K", + "AGC": "S", + "AGU": "S", + "AGA": "R", + "AGG": "R", + "CUA": "L", + "CUC": "L", + "CUG": "L", + "CUU": "L", + "CCA": "P", + "CCC": "P", + "CCG": "P", + "CCU": "P", + "CAC": "H", + "CAU": "H", + "CAA": "Q", + "CAG": "Q", + "CGA": "R", + "CGC": "R", + "CGG": "R", + "CGU": "R", + "GUA": "V", + "GUC": "V", + "GUG": "V", + "GUU": "V", + "GCA": "A", + "GCC": "A", + "GCG": "A", + "GCU": "A", + "GAC": "D", + "GAU": "D", + "GAA": "E", + "GAG": "E", + "GGA": "G", + "GGC": "G", + "GGG": "G", + "GGU": "G", + "UCA": "S", + "UCC": "S", + "UCG": "S", + "UCU": "S", + "UUC": "F", + "UUU": "F", + "UUA": "L", + "UUG": "L", + "UAC": "Y", + "UAU": "Y", + "UAA": "*", + "UAG": "*", + "UGC": "C", + "UGU": "C", + "UGA": "*", + "UGG": "W", +} + + +class GnomadVcfToProteinVariation: + """Class for translating gnomAD VCF-like representation to VRS Allele protein + representation + """ + + def __init__( + self, + seqrepo_access: SeqRepoAccess, + tokenizer: Tokenize, + classifier: Classify, + validator: Validate, + translator: Translate, + mane_transcript: ManeTranscript, + gene_normalizer: GeneQueryHandler, + ) -> None: + """Initialize the GnomadVcfToProteinVariation class + + :param seqrepo_access: Access to SeqRepo + :param tokenizer: Tokenizer class for tokenizing + :param classifier: Classifier class for classifying tokens + :param validator: Validator class for validating valid inputs + :param translator: Translating valid inputs + :param mane_transcript: Access MANE Transcript information + :param gene_normalizer: Client for normalizing gene concepts + """ + self.seqrepo_access = seqrepo_access + self.tokenizer = tokenizer + self.classifier = classifier + self.validator = validator + self.translator = translator + self.mane_transcript = mane_transcript + self.gene_normalizer = gene_normalizer + + async def _get_valid_result( + self, vcf_query: str, warnings: List + ) -> List[ValidationResult]: + """Get gnomad vcf validation summary + + :param vcf_query: gnomad vcf input query + :param warnings: List of warnings + :raises GnomadVcfToProteinError: If no tokens, classifications, or valid results + are found. Also if ``vcf_query`` is not a gnomAD VCF-like query. + :return: List of valid results for a gnomad VCF query + """ + tokens = self.tokenizer.perform(vcf_query, warnings) + if not tokens: + raise GnomadVcfToProteinError("No tokens found") + + classification = self.classifier.perform(tokens) + if not classification: + raise GnomadVcfToProteinError("No classification found") + + if classification.nomenclature != Nomenclature.GNOMAD_VCF: + raise GnomadVcfToProteinError( + f"{vcf_query} is not a gnomAD VCF-like query (`chr-pos-ref-alt`)" + ) + + validation_summary = await self.validator.perform(classification) + valid_results = validation_summary.valid_results + if valid_results: + # Temporary work around until issue-490 complete + valid_results.sort( + key=lambda v: int(v.accession.split(".")[-1]), + reverse=True, + ) + return valid_results[0] + else: + raise GnomadVcfToProteinError( + f"{vcf_query} is not a valid gnomad vcf query" + ) + + @staticmethod + def _get_alt_type_and_prefix_match( + len_g_ref: int, len_g_alt: int, g_ref: str, g_alt: str + ) -> Tuple[AltType, int]: + """Get genomic alteration type and number of prefixes match + + :param len_g_ref: Length of genomic reference sequence + :param len_g_alt: Length of genomic alternate sequence + :param g_ref: Genomic reference sequence + :param g_alt: Genomic alternate sequence + :return: Tuple containing genomic alteration type and the number of prefixes matched + """ + if len_g_ref == len_g_alt: + num_prefix_matched = _get_char_match_count( + len_g_ref, g_ref, g_alt, trim_prefix=True + ) + alt_type = AltType.SUBSTITUTION + elif len_g_ref > len_g_alt: + num_prefix_matched = _get_char_match_count( + len_g_alt, g_ref, g_alt, trim_prefix=True + ) + alt_type = ( + AltType.DELETION if num_prefix_matched == len_g_alt else AltType.DELINS + ) + else: + num_prefix_matched = _get_char_match_count( + len_g_ref, g_ref, g_alt, trim_prefix=True + ) + alt_type = ( + AltType.INSERTION if num_prefix_matched == len_g_ref else AltType.DELINS + ) + + return alt_type, num_prefix_matched + + def _get_genomic_pos_range( + self, + c_start_pos: int, + c_end_pos: int, + strand: Strand, + g_start_pos: int, + g_end_pos: int, + ) -> Tuple[int, int, int]: + """Get genomic positions to cover the range of codons + + :param c_start_pos: cDNA start position + :param c_end_pos: cDNA end position + :param strand: Strand + :param g_start_pos: Original genomic start position for change + :param g_end_pos: Original genomic end position for change + :return: Tuple containing genomic start and end positions and the start index + for the original position change + """ + # Get cDNA reading frame + start_reading_frame = self.mane_transcript._get_reading_frame(c_start_pos + 1) + end_reading_frame = self.mane_transcript._get_reading_frame(c_end_pos) + + # Get genomic position range change + # This ensures that there 3 nucleotides needed for codon + strand = strand + start_ix = start_reading_frame - 1 + if strand == Strand.NEGATIVE: + new_g_end_pos = g_end_pos + (start_reading_frame - 1) + new_g_start_pos = g_start_pos - (3 - end_reading_frame) + else: + new_g_start_pos = g_start_pos - (start_reading_frame - 1) + new_g_end_pos = g_end_pos + (3 - end_reading_frame) + return new_g_start_pos, new_g_end_pos, start_ix + + def _get_genomic_alt( + self, + g_ac: str, + g_input_alt: str, + g_end_pos: int, + alt_type: AltType, + genomic_start_ix: int, + strand: Strand, + ref: str, + ) -> str: + """Get entire genomic altered sequence + + :param g_ac: Genomic accession + :param g_input_alt: Original alteration provided by VCF-like query + :param g_end_pos: Genomic end position for codon + :param alt_type: The type of alteration + :param genomic_start_ix: The start index for the original genomic start position + :param strand: Strand + :param ref: The genomic reference sequence + :return: The updated genomic alteration + """ + if alt_type == AltType.DELETION: + alt = "" + else: + alt = ref[:genomic_start_ix] + + if strand == Strand.POSITIVE: + alt += g_input_alt + else: + alt += g_input_alt[::-1] + + if alt_type == AltType.SUBSTITUTION: + alt += ref[len(alt) :] + else: + alt += ref[len(ref) - genomic_start_ix :] + + # We need to get the entire inserted sequence. It needs to be a factor of 3 + # since DNA (3 nuc) -> RNA (3 nuc) -> Protein (1 aa). The reason why we + # DO NOT do this for insertions, is because we only want the provided + # insertion sequence + if alt_type != AltType.INSERTION: + len_alt = len(alt) + rem_alt = len_alt % 3 + if rem_alt != 0: + tmp_g_end_pos = g_end_pos + (3 - rem_alt) + tmp_ref, _ = self.seqrepo_access.get_reference_sequence( + g_ac, g_end_pos, tmp_g_end_pos + ) + alt += tmp_ref + return alt + + @staticmethod + def _dna_to_aa(dna_seq: str, strand: Strand) -> str: + """Get amino acid(s) from DNA sequence + + :param dna_seq: DNA sequence + :raises ValueError: If DNA character is not supported + :return: Amino acid(s) + """ + # DNA -> RNA + rna_seq = "" + if strand == strand.NEGATIVE: + # Since it's on the negative strand, we need to get the nucleic acid complement + for char in dna_seq: + if char == "A": + rna_seq += "U" + elif char == "T": + rna_seq += "A" + elif char == "G": + rna_seq += "C" + elif char == "C": + rna_seq += "G" + else: + raise ValueError(f"{char} is not a supported nucleotide") + else: + # We only need to replace T/U for DNA->RNA + rna_seq = dna_seq.replace("T", "U") + + # RNA -> 1 letter Amino Acid codes + aa = "" + for i in range(int(len(rna_seq) / 3)): + aa += RNA_CODON_TO_1AA[rna_seq[3 * i : (3 * i) + 3]] + return aa + + def _get_protein_representation( + self, ga4gh_seq_id: str, aa_start_pos: int, aa_end_pos: int, aa_alt: str + ) -> models.Allele: + """Create VRS Allele for protein representation + + :param ga4gh_seq_id: GA4GH identifier for protein accession + :param aa_start_pos: Protein start position (inter-residue coordinates) + :param aa_end_pos: Protein end position (inter-residue coordinates) + :param aa_alt: Protein alternate sequence + :raises GnomadVcfToProteinError: If VRS-Python is unable to perform fully + justified allele normalization + :return: Normalized VRS Allele on the protein sequence + """ + variation = models.Allele( + location=models.SequenceLocation( + sequenceReference=models.SequenceReference( + refgetAccession=ga4gh_seq_id[0].split("ga4gh:")[-1] + ), + start=aa_start_pos, + end=aa_end_pos, + ), + state=models.LiteralSequenceExpression(sequence=aa_alt), + ) + + # Perform fully justified allele normalization + try: + variation = normalize(variation, self.seqrepo_access) + except (KeyError, AttributeError) as e: + raise GnomadVcfToProteinError(f"VRS-Python unable to normalize allele: {e}") + + # Add VRS digests for VRS Allele and VRS Sequence Location + variation.id = ga4gh_identify(variation) + variation.location.id = ga4gh_identify(variation.location) + return variation + + def _get_gene_context(self, gene: str) -> Optional[core_models.Gene]: + """Get additional gene information from gene-normalizer + + :param gene: Gene symbol + :return: Gene data from gene-normalizer if match found + """ + gene_norm_resp = self.gene_normalizer.normalize(gene) + return ( + gene_norm_resp.gene + if gene_norm_resp.match_type != GeneMatchType.NO_MATCH + else None + ) + + def _get_vrs_ref_allele_seq( + self, location: models.SequenceLocation, p_ac: str + ) -> Optional[str]: + """Return reference sequence given a VRS location. + + :param location: VRS Location object + :param identifier: Identifier for allele + :return: VRS ref seq allele + """ + start = location.start + end = location.end + if isinstance(start, int) and isinstance(end, int) and (start != end): + ref, _ = self.seqrepo_access.get_reference_sequence( + p_ac, start, end, residue_mode=ResidueMode.INTER_RESIDUE + ) + else: + ref = None + return ref + + async def gnomad_vcf_to_protein(self, vcf_query: str) -> GnomadVcfToProteinService: + """Get protein consequence for gnomAD-VCF like expression + Assumes input query uses GRCh38 representation + + :param vcf_query: gnomAD VCF-like expression (``chr-pos-ref-alt``) on the GRCh38 + assembly + :return: GnomadVcfToProteinService containing protein VRS Allele, if translation + was successful + """ + variation = None + warnings = [] + + # First we need to validate the input query + try: + valid_result = await self._get_valid_result(vcf_query, warnings) + except GnomadVcfToProteinError as e: + warnings.append(str(e)) + return GnomadVcfToProteinService( + variation_query=vcf_query, + variation=variation, + warnings=warnings, + service_meta_=ServiceMeta( + version=__version__, response_datetime=datetime.now() + ), + ) + + # Get relevant information from query + token = valid_result.classification.matching_tokens[0] + g_ac = valid_result.accession + g_ref = token.ref + g_alt = token.alt + + len_g_ref = len(g_ref) + len_g_alt = len(g_alt) + + # Determine the type of alteration and the number of nucleotide prefixes matched + alt_type, num_prefix_matched = self._get_alt_type_and_prefix_match( + len_g_ref, len_g_alt, g_ref, g_alt + ) + + # Get genomic position change + g_start_pos = token.pos + if alt_type == AltType.SUBSTITUTION: + g_end_pos = g_start_pos + (len_g_ref - (num_prefix_matched + 1)) + else: + g_end_pos = g_start_pos + (len_g_ref - num_prefix_matched) + + if num_prefix_matched == 0: + g_end_pos -= 1 + + # Given genomic data, get associated cDNA and protein representation + p_c_data = await self.mane_transcript.grch38_to_mane_c_p( + g_ac, g_start_pos, g_end_pos, try_longest_compatible=True + ) + if not p_c_data: + warnings.append("Unable to get cDNA and protein representation") + return GnomadVcfToProteinService( + variation_query=vcf_query, + variation=variation, + warnings=warnings, + service_meta_=ServiceMeta( + version=__version__, response_datetime=datetime.now() + ), + ) + + p_data = p_c_data.protein + c_data = p_c_data.cdna + + # Get GA4GH identifier for protein accession. This is used later, but we want + # to fail fast + p_ac = p_data.refseq or p_data.ensembl + p_ga4gh_seq_id, w = self.seqrepo_access.translate_identifier(p_ac, "ga4gh") + if w: + warnings.append(w) + return GnomadVcfToProteinService( + variation_query=vcf_query, + variation=variation, + warnings=warnings, + service_meta_=ServiceMeta( + version=__version__, response_datetime=datetime.now() + ), + ) + + # Get genomic position range change + # This ensures that there 3 nucleotides needed for codon + strand = c_data.strand + new_g_start_pos, new_g_end_pos, genomic_start_ix = self._get_genomic_pos_range( + c_data.pos[0], c_data.pos[1], strand, g_start_pos, g_end_pos + ) + + # Get genomic reference sequence + ref, w = self.seqrepo_access.get_reference_sequence( + g_ac, new_g_start_pos, new_g_end_pos + ) + if w: + warnings.append(w) + return GnomadVcfToProteinService( + variation_query=vcf_query, + variation=variation, + warnings=warnings, + service_meta_=ServiceMeta( + version=__version__, response_datetime=datetime.now() + ), + ) + + if strand == Strand.NEGATIVE: + ref = ref[::-1] + + # Get genomic altered sequence + alt = self._get_genomic_alt( + g_ac, g_alt, new_g_end_pos, alt_type, genomic_start_ix, strand, ref + ) + + # DNA -> RNA -> Protein (1 AA) + aa_ref = self._dna_to_aa(ref, strand) + aa_alt = self._dna_to_aa(alt, strand) + + # Trim AA prefixes / suffixes and update the protein start position accordingly + aa_start_pos = p_data.pos[0] + aa_ref, aa_alt, aa_start_pos = _trim_prefix_or_suffix( + aa_ref, aa_alt, aa_start_pos=aa_start_pos, trim_prefix=True + ) + aa_ref, aa_alt, _ = _trim_prefix_or_suffix(aa_ref, aa_alt, trim_prefix=False) + + # Get protein end position + if alt_type == AltType.DELETION: + aa_end_pos = aa_start_pos + (len(aa_ref) - 1) + else: + aa_end_pos = p_data.pos[1] + + # Create the protein VRS Allele + try: + variation = self._get_protein_representation( + p_ga4gh_seq_id, aa_start_pos, aa_end_pos, aa_alt + ) + except GnomadVcfToProteinError as e: + warnings.append(str(e)) + + return GnomadVcfToProteinService( + variation_query=vcf_query, + variation=variation, + vrs_ref_allele_seq=self._get_vrs_ref_allele_seq(variation.location, p_ac), + gene_context=self._get_gene_context(p_data.gene), + warnings=warnings, + service_meta_=ServiceMeta( + version=__version__, response_datetime=datetime.now() + ), + ) diff --git a/variation/main.py b/variation/main.py index 1f442520..d83b87a3 100644 --- a/variation/main.py +++ b/variation/main.py @@ -23,6 +23,7 @@ ParsedToCxVarQuery, ParsedToCxVarService, ) +from variation.schemas.gnomad_vcf_to_protein_schema import GnomadVcfToProteinService from variation.schemas.hgvs_to_copy_number_schema import ( HgvsToCopyNumberChangeService, HgvsToCopyNumberCountService, @@ -296,6 +297,28 @@ def vrs_python_translate_from( q_description = "GRCh38 gnomAD VCF (chr-pos-ref-alt) to normalize to MANE protein variation." # noqa: E501 +@app.get( + "/variation/gnomad_vcf_to_protein", + summary=g_to_p_summary, + response_description=g_to_p_response_description, + response_model_exclude_none=True, + description=g_to_p_description, + response_model=GnomadVcfToProteinService, + tags=[Tag.TO_PROTEIN_VARIATION], +) +async def gnomad_vcf_to_protein( + q: str = Query(..., description=q_description), +) -> GnomadVcfToProteinService: + """Return VRS representation for variation on protein coordinate. + + :param q: gnomad VCF to normalize to protein variation. + :return: GnomadVcfToProteinService for variation + """ + q = unquote(q.strip()) + resp = await query_handler.gnomad_vcf_to_protein_handler.gnomad_vcf_to_protein(q) + return resp + + hgvs_dup_del_mode_decsr = ( "This parameter determines how to interpret HGVS dup/del expressions in VRS." ) diff --git a/variation/normalize.py b/variation/normalize.py index 5e350895..37b05fc5 100644 --- a/variation/normalize.py +++ b/variation/normalize.py @@ -264,21 +264,3 @@ async def normalize( params["variation"] = variation params["warnings"] = warnings return NormalizeService(**params) - - # def get_ref_allele_seq(self, location: Dict, ac: str) -> Optional[str]: - # """Return ref allele seq for transcript. - - # :param location: VRS Location object - # :param identifier: Identifier for allele - # :return: Ref seq allele - # """ - # ref = None - # start = location["start"] - # end = location["end"] - # if isinstance(start, int) and isinstance(end, int): - # if start != end: - # ref, _ = self.seqrepo_access.get_reference_sequence( - # ac, start, end, residue_mode=ResidueMode.INTER_RESIDUE - # ) - - # return ref diff --git a/variation/query.py b/variation/query.py index 87add090..60c6a4f3 100644 --- a/variation/query.py +++ b/variation/query.py @@ -7,6 +7,7 @@ from gene.query import QueryHandler as GeneQueryHandler from variation.classify import Classify +from variation.gnomad_vcf_to_protein_variation import GnomadVcfToProteinVariation from variation.hgvs_dup_del_mode import HGVSDupDelMode from variation.normalize import Normalize from variation.to_copy_number_variation import ToCopyNumberVariation @@ -65,6 +66,9 @@ def __init__( self.to_vrs_handler = ToVRS(*to_vrs_params) normalize_params = to_vrs_params + [uta_db] self.normalize_handler = Normalize(*normalize_params) + self.gnomad_vcf_to_protein_handler = GnomadVcfToProteinVariation( + *to_vrs_params + [mane_transcript, gene_query_handler] + ) self.to_copy_number_handler = ToCopyNumberVariation( *to_vrs_params + [gene_query_handler, uta_db] ) diff --git a/variation/schemas/gnomad_vcf_to_protein_schema.py b/variation/schemas/gnomad_vcf_to_protein_schema.py new file mode 100644 index 00000000..2eb7dc6f --- /dev/null +++ b/variation/schemas/gnomad_vcf_to_protein_schema.py @@ -0,0 +1,14 @@ +"""Module for gnomad vcf to protein response schema""" +from typing import Optional + +from ga4gh.core import core_models +from pydantic import StrictStr + +from variation.schemas.normalize_response_schema import NormalizeService + + +class GnomadVcfToProteinService(NormalizeService): + """Define response for gnomad vcf to protein service""" + + gene_context: Optional[core_models.Gene] = None + vrs_ref_allele_seq: Optional[StrictStr] = None diff --git a/variation/version.py b/variation/version.py index 7b64f4f8..fc0dc146 100644 --- a/variation/version.py +++ b/variation/version.py @@ -1,2 +1,2 @@ """Module for version of app""" -__version__ = "0.8.0-dev1" +__version__ = "0.8.1-dev0"