diff --git a/src/variation/gnomad_vcf_to_protein_variation.py b/src/variation/gnomad_vcf_to_protein_variation.py index 9607f4f4..270dfe31 100644 --- a/src/variation/gnomad_vcf_to_protein_variation.py +++ b/src/variation/gnomad_vcf_to_protein_variation.py @@ -4,7 +4,7 @@ from cool_seq_tool.handlers import SeqRepoAccess from cool_seq_tool.mappers import ManeTranscript -from cool_seq_tool.schemas import ResidueMode, Strand +from cool_seq_tool.schemas import Strand from ga4gh.core import domain_models, ga4gh_identify from ga4gh.vrs import models, normalize from gene.query import QueryHandler as GeneQueryHandler @@ -19,6 +19,7 @@ from variation.schemas.validation_response_schema import ValidationResult from variation.tokenize import Tokenize from variation.translate import Translate +from variation.utils import get_vrs_loc_seq from variation.validate import Validate @@ -372,11 +373,17 @@ def _dna_to_aa(dna_seq: str, strand: Strand) -> str: return aa def _get_protein_representation( - self, ga4gh_seq_id: str, aa_start_pos: int, aa_end_pos: int, aa_alt: str + self, + ga4gh_seq_id: str, + p_ac: str, + aa_start_pos: int, + aa_end_pos: int, + aa_alt: str, ) -> models.Allele: """Create VRS Allele for protein representation :param ga4gh_seq_id: GA4GH identifier for protein accession + :param p_ac: RefSeq or Ensembl protein accession :param aa_start_pos: Protein start position (inter-residue coordinates) :param aa_end_pos: Protein end position (inter-residue coordinates) :param aa_alt: Protein alternate sequence @@ -402,6 +409,12 @@ def _get_protein_representation( msg = f"VRS-Python unable to normalize allele: {e}" raise GnomadVcfToProteinError(msg) from e + loc_seq = get_vrs_loc_seq( + self.seqrepo_access, p_ac, variation.location.start, variation.location.end + ) + if loc_seq: + variation.location.sequence = models.SequenceString(root=loc_seq) + # Add VRS digests for VRS Allele and VRS Sequence Location variation.id = ga4gh_identify(variation) variation.location.id = ga4gh_identify(variation.location) @@ -420,25 +433,6 @@ def _get_gene_context(self, gene: str) -> domain_models.Gene | None: else None ) - def _get_vrs_ref_allele_seq( - self, location: models.SequenceLocation, p_ac: str - ) -> str | None: - """Return reference sequence given a VRS location. - - :param location: VRS Location object - :param identifier: Identifier for allele - :return: VRS ref seq allele - """ - start = location.start - end = location.end - if isinstance(start, int) and isinstance(end, int) and (start != end): - ref, _ = self.seqrepo_access.get_reference_sequence( - p_ac, start, end, residue_mode=ResidueMode.INTER_RESIDUE - ) - else: - ref = None - return ref - async def gnomad_vcf_to_protein(self, vcf_query: str) -> GnomadVcfToProteinService: """Get protein consequence for gnomAD-VCF like expression Assumes input query uses GRCh38 representation @@ -576,7 +570,7 @@ async def gnomad_vcf_to_protein(self, vcf_query: str) -> GnomadVcfToProteinServi # Create the protein VRS Allele try: variation = self._get_protein_representation( - p_ga4gh_seq_id, aa_start_pos, aa_end_pos, aa_alt + p_ga4gh_seq_id, p_ac, aa_start_pos, aa_end_pos, aa_alt ) except GnomadVcfToProteinError as e: warnings.append(str(e)) @@ -591,7 +585,6 @@ async def gnomad_vcf_to_protein(self, vcf_query: str) -> GnomadVcfToProteinServi return GnomadVcfToProteinService( variation_query=vcf_query, variation=variation, - vrs_ref_allele_seq=self._get_vrs_ref_allele_seq(variation.location, p_ac), gene_context=gene_context, warnings=warnings, service_meta_=ServiceMeta( diff --git a/src/variation/normalize.py b/src/variation/normalize.py index b40faf3a..b7759984 100644 --- a/src/variation/normalize.py +++ b/src/variation/normalize.py @@ -10,6 +10,7 @@ from variation import __version__ from variation.classify import Classify from variation.schemas.app_schemas import Endpoint +from variation.schemas.classification_response_schema import ClassificationType from variation.schemas.normalize_response_schema import ( HGVSDupDelModeOption, NormalizeService, @@ -21,10 +22,11 @@ TranslationResult, VrsSeqLocAcStatus, ) +from variation.schemas.validation_response_schema import ValidationSummary from variation.to_vrs import ToVRS from variation.tokenize import Tokenize from variation.translate import Translate -from variation.utils import update_warnings_for_no_resp +from variation.utils import get_vrs_loc_seq, update_warnings_for_no_resp from variation.validate import Validate @@ -138,6 +140,40 @@ def get_hgvs_dup_del_mode( return hgvs_dup_del_mode, warning + def _get_location_seq( + self, + validation_summary: ValidationSummary, + variation: dict, + priority_translation_result: TranslationResult, + ) -> str | None: + """Get reference sequence for a Sequence Location + + Does not support: + - Ambiguous genomic deletions or duplications + - Amplifications + - Variations that are not Allele or Copy Number + + :param validation_summary: Validation summary for classification containing + valid and invalid results + :param variation: VRS Variation object + :param priority_translation_result: Prioritized translation result + :return: Reference sequence for a sequence location if found + """ + valid_result = validation_summary.valid_results[0] + classification_type = valid_result.classification.classification_type + if classification_type not in { + ClassificationType.GENOMIC_DELETION_AMBIGUOUS, + ClassificationType.GENOMIC_DUPLICATION_AMBIGUOUS, + ClassificationType.AMPLIFICATION, + } and variation["type"] in {"Allele", "CopyNumberChange", "CopyNumberCount"}: + return get_vrs_loc_seq( + self.seqrepo_access, + priority_translation_result.vrs_seq_loc_ac, + variation["location"]["start"], + variation["location"]["end"], + ) + return None + async def normalize( self, q: str, @@ -231,26 +267,11 @@ async def normalize( try: variation = translation_result.vrs_variation except AttributeError as e: - # vrs_ref_allele_seq = None warnings.append(str(e)) else: - pass - # valid_result = validation_summary.valid_results[0] - # classification_type = valid_result.classification.classification_type - # if classification_type not in { - # ClassificationType.GENOMIC_DELETION_AMBIGUOUS, - # ClassificationType.GENOMIC_DUPLICATION_AMBIGUOUS, - # ClassificationType.AMPLIFICATION, - # }: - # variation_type = variation["type"] - # if variation_type in { - # "Allele", "CopyNumberChange", "CopyNumberCount" - # }: - # vrs_ref_allele_seq = self.get_ref_allele_seq( - # variation["location"], translation_result.vrs_seq_loc_ac - # ) - # else: - # vrs_ref_allele_seq = None + variation["location"]["sequence"] = self._get_location_seq( + validation_summary, variation, translation_result + ) if not variation: update_warnings_for_no_resp(label, warnings) diff --git a/src/variation/schemas/gnomad_vcf_to_protein_schema.py b/src/variation/schemas/gnomad_vcf_to_protein_schema.py index 4aa7c7aa..435d4e98 100644 --- a/src/variation/schemas/gnomad_vcf_to_protein_schema.py +++ b/src/variation/schemas/gnomad_vcf_to_protein_schema.py @@ -1,7 +1,6 @@ """Module for gnomad vcf to protein response schema""" from ga4gh.core import domain_models -from pydantic import StrictStr from variation.schemas.normalize_response_schema import NormalizeService @@ -10,4 +9,3 @@ class GnomadVcfToProteinService(NormalizeService): """Define response for gnomad vcf to protein service""" gene_context: domain_models.Gene | None = None - vrs_ref_allele_seq: StrictStr | None = None diff --git a/src/variation/to_copy_number_variation.py b/src/variation/to_copy_number_variation.py index f345bb3f..8eaba668 100644 --- a/src/variation/to_copy_number_variation.py +++ b/src/variation/to_copy_number_variation.py @@ -42,7 +42,7 @@ from variation.to_vrs import ToVRS from variation.tokenize import Tokenize from variation.translate import Translate -from variation.utils import get_priority_sequence_location +from variation.utils import get_priority_sequence_location, get_vrs_loc_seq from variation.validate import Validate VALID_CLASSIFICATION_TYPES = [ @@ -184,7 +184,14 @@ async def _hgvs_to_cnv_resp( do_liftover=do_liftover, ) if translations: - variation = translations[0].vrs_variation + translation_result = translations[0] + variation = translation_result.vrs_variation + variation["location"]["sequence"] = get_vrs_loc_seq( + self.seqrepo_access, + translation_result.vrs_seq_loc_ac, + variation["location"]["start"], + variation["location"]["end"], + ) if variation: if copy_number_type == HGVSDupDelModeOption.COPY_NUMBER_COUNT: @@ -493,6 +500,9 @@ def _get_parsed_seq_loc( sequenceReference=models.SequenceReference(refgetAccession=sequence), start=start_vrs, end=end_vrs, + sequence=get_vrs_loc_seq( + self.seqrepo_access, accession, start_vrs, end_vrs + ), ) seq_loc.id = ga4gh_identify(seq_loc) diff --git a/src/variation/to_vrs.py b/src/variation/to_vrs.py index 857d9b36..bfed644b 100644 --- a/src/variation/to_vrs.py +++ b/src/variation/to_vrs.py @@ -18,6 +18,7 @@ from variation.schemas.validation_response_schema import ValidationResult from variation.tokenize import Tokenize from variation.translate import Translate +from variation.utils import get_vrs_loc_seq from variation.validate import Validate from variation.vrs_representation import VRSRepresentation @@ -88,6 +89,31 @@ async def get_translations( return translations, warnings + def _get_vrs_variations(self, translations: list[TranslationResult]) -> list[dict]: + """Get translated VRS Variations. + + This method will also add ``sequence`` to the variation's location + + :param translations: List of translation results + :return: List of unique VRS Variations + """ + variations = [] + _added_variation_ids = set() + + # Ensure only unique VRS variations are in the list of variations returned + for tr in translations: + if tr.vrs_variation["id"] not in _added_variation_ids: + vrs_variation = tr.vrs_variation + vrs_variation["location"]["sequence"] = get_vrs_loc_seq( + self.seqrepo_access, + tr.vrs_seq_loc_ac, + vrs_variation["location"]["start"], + vrs_variation["location"]["end"], + ) + variations.append(vrs_variation) + _added_variation_ids.add(vrs_variation["id"]) + return variations + async def to_vrs(self, q: str) -> ToVRSService: """Return a VRS-like representation of all validated variations for a query. @@ -134,15 +160,6 @@ async def to_vrs(self, q: str) -> ToVRSService: translations = [] warnings = validation_summary.warnings - if not translations: - variations = [] - else: - variations = [] - # Ensure only unique VRS variations are in the list of variations returned - for tr in translations: - if tr.vrs_variation not in variations: - variations.append(tr.vrs_variation) - params["warnings"] = warnings - params["variations"] = variations + params["variations"] = self._get_vrs_variations(translations) return ToVRSService(**params) diff --git a/src/variation/utils.py b/src/variation/utils.py index 0b05b47f..b669af48 100644 --- a/src/variation/utils.py +++ b/src/variation/utils.py @@ -7,7 +7,9 @@ from bioutils.sequences import aa1_to_aa3 as _aa1_to_aa3 from bioutils.sequences import aa3_to_aa1 as _aa3_to_aa1 from cool_seq_tool.handlers import SeqRepoAccess +from cool_seq_tool.schemas import ResidueMode from ga4gh.core import domain_models +from ga4gh.vrs import models from variation.schemas.app_schemas import AmbiguousRegexType from variation.schemas.classification_response_schema import AmbiguousType @@ -209,3 +211,29 @@ def get_refget_accession( refget_accession = ids[0].split("ga4gh:")[-1] return refget_accession + + +def get_vrs_loc_seq( + seqrepo_access: SeqRepoAccess, + identifier: str, + start: int | models.Range | None, + end: int | models.Range | None, +) -> str | None: + """Get the literal sequence encoded by the ``identifier`` at the start and end + coordinates. + + Does not support locations that do not have both start/end as ints + + :param seqrepo_access: Access to SeqRepo client + :param identifier: Accession for VRS Location (not ga4gh) + :param start: Start position (inter-residue) + :param end: End position (inter-residue) + :return: Get the literal sequence at the given location + """ + if isinstance(start, int) and isinstance(end, int) and (start != end): + ref, _ = seqrepo_access.get_reference_sequence( + identifier, start, end, residue_mode=ResidueMode.INTER_RESIDUE + ) + else: + ref = None + return ref or None # get_reference_sequence can return empty str diff --git a/tests/conftest.py b/tests/conftest.py index 9d84de35..67bae951 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -143,6 +143,7 @@ def braf_600loc(): "type": "SequenceReference", "refgetAccession": "SQ.cQvw4UsHHRRlogxbWCB8W-mKD4AraM9y", }, + "sequence": "V", "type": "SequenceLocation", } @@ -172,6 +173,7 @@ def vhl_reference_agree(): "type": "SequenceReference", "refgetAccession": "SQ.z-Oa0pZkJ6GHJHOYM7h5mY_umc0SJzTu", }, + "sequence": "P", "type": "SequenceLocation", }, "state": {"sequence": "P", "type": "LiteralSequenceExpression"}, @@ -212,6 +214,7 @@ def protein_deletion_np_range(): "type": "SequenceReference", "refgetAccession": "SQ.AF1UFydIo02-bMplonKSfxlWY2q6ze3m", }, + "sequence": "LRENT", "type": "SequenceLocation", }, "state": { @@ -236,6 +239,7 @@ def braf_v600e_genomic_sub(): "type": "SequenceReference", "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", }, + "sequence": "A", "type": "SequenceLocation", }, "state": {"sequence": "T", "type": "LiteralSequenceExpression"}, @@ -255,6 +259,7 @@ def genomic_dup1_seq_loc_normalized(): "type": "SequenceReference", "refgetAccession": "SQ.Zu7h9AggXxhTaGVsy7h_EZSChSZGcmgX", }, + "sequence": "GG", "start": 49531260, "end": 49531262, "type": "SequenceLocation", @@ -276,6 +281,7 @@ def genomic_dup1_seq_loc_not_normalized(): }, "start": 49531261, "end": 49531262, + "sequence": "G", "type": "SequenceLocation", } @@ -304,6 +310,7 @@ def genomic_dup2_seq_loc_normalized(): }, "start": 33211289, "end": 33211293, + "sequence": "TCTA", "type": "SequenceLocation", } @@ -385,6 +392,7 @@ def genomic_del1_seq_loc(): }, "start": 10149810, "end": 10149811, + "sequence": "T", "type": "SequenceLocation", } @@ -424,6 +432,7 @@ def genomic_del2_seq_loc(): "type": "SequenceReference", "refgetAccession": "SQ.Zu7h9AggXxhTaGVsy7h_EZSChSZGcmgX", }, + "sequence": "ATGTTGACGGACAGCCTAT", "start": 10146594, "end": 10146613, "type": "SequenceLocation", @@ -509,6 +518,7 @@ def grch38_genomic_insertion_seq_loc(): "type": "SequenceReference", "refgetAccession": "SQ.dLZ15tNO1Ur0IcGjwc3Sdi_0A6Yf4zm7", }, + "sequence": "TACGTGATGGCT", "type": "SequenceLocation", } @@ -602,7 +612,7 @@ def assertion_checks(normalize_response, test_variation, check_vrs_id=False): if not check_vrs_id: _vrs_id_and_digest_existence_checks(actual) - expected = test_variation.copy().model_dump(exclude_none=True) + expected = test_variation.model_copy().model_dump(exclude_none=True) if not check_vrs_id: _delete_id_and_digest(expected) _delete_id_and_digest(expected["location"]) @@ -624,7 +634,7 @@ def cnv_assertion_checks(resp, test_fixture, check_vrs_id=False): if not check_vrs_id: _vrs_id_and_digest_existence_checks(actual, prefix=prefix) - expected = test_fixture.copy().model_dump(exclude_none=True) + expected = test_fixture.model_copy().model_dump(exclude_none=True) if not check_vrs_id: _delete_id_and_digest(expected) _delete_id_and_digest(expected["location"]) diff --git a/tests/test_gnomad_vcf_to_protein.py b/tests/test_gnomad_vcf_to_protein.py index d7231c84..0ff42edf 100644 --- a/tests/test_gnomad_vcf_to_protein.py +++ b/tests/test_gnomad_vcf_to_protein.py @@ -23,6 +23,7 @@ def mmel1_l30m(): "type": "SequenceReference", "refgetAccession": "SQ.iQ8F_pnsiQOLohiV2qh3OWRZiftUt8jZ", }, + "sequence": "L", "type": "SequenceLocation", }, "state": {"sequence": "M", "type": "LiteralSequenceExpression"}, @@ -42,6 +43,7 @@ def cdk11a_e314del(): "type": "SequenceReference", "refgetAccession": "SQ.N728VSRRMHJ1SrhJgKqJOCaa3l5Z4sqm", }, + "sequence": "EEEEEEEEEEEEE", "type": "SequenceLocation", }, "state": { @@ -66,6 +68,7 @@ def protein_insertion2(): "type": "SequenceReference", "refgetAccession": "SQ.qgIh8--4F6IpxRwX_lVtD2BhepH5B5Ef", }, + "Sequence": "Q", "type": "SequenceLocation", }, "state": {"sequence": "R", "type": "LiteralSequenceExpression"}, @@ -84,6 +87,7 @@ def atad3a_loc(): "type": "SequenceReference", "refgetAccession": "SQ.MHPOY_7fv8V9SktyvaTxulVFSK6XCxM8", }, + "sequence": "I", "type": "SequenceLocation", } @@ -156,6 +160,7 @@ def kras_g12d(): }, "start": 11, "end": 12, + "sequence": "G", }, "state": {"type": "LiteralSequenceExpression", "sequence": "D"}, } @@ -177,6 +182,7 @@ def multi_nuc_sub_pos(): }, "start": 242, "end": 244, + "sequence": "LP", }, "state": {"type": "LiteralSequenceExpression", "sequence": "PS"}, } @@ -198,6 +204,7 @@ def multi_nuc_sub_neg(): }, "start": 235, "end": 236, + "sequence": "S", }, "state": {"type": "LiteralSequenceExpression", "sequence": "G"}, } @@ -217,6 +224,7 @@ def delins_pos(): }, "start": 746, "end": 752, + "sequence": "LREATS", }, "state": {"type": "LiteralSequenceExpression", "sequence": "Q"}, } @@ -236,6 +244,7 @@ def delins_neg(): }, "start": 239, "end": 259, + "sequence": "PRLLFPTNSSSHLVALQGQP", }, "state": {"type": "LiteralSequenceExpression", "sequence": "TLTA"}, } @@ -266,7 +275,6 @@ async def test_substitution( resp = await test_handler.gnomad_vcf_to_protein("7-140753336-A-T") assertion_checks(resp, braf_v600e, check_vrs_id=True) assert resp.gene_context - assert resp.vrs_ref_allele_seq == "V" assert resp.warnings == [] # Reading Frame 3, Negative Strand @@ -312,7 +320,6 @@ async def test_reference_agree(test_handler, vhl_reference_agree): # https://www.ncbi.nlm.nih.gov/clinvar/variation/379039/?new_evidence=true resp = await test_handler.gnomad_vcf_to_protein("3-10142030-C-T") assertion_checks(resp, vhl_reference_agree) - assert resp.vrs_ref_allele_seq == "P" assert resp.gene_context assert resp.warnings == [] @@ -323,14 +330,14 @@ async def test_insertion(test_handler, protein_insertion, protein_insertion2): # positive strand (CA645561585) resp = await test_handler.gnomad_vcf_to_protein("7-55181319-C-CGGGTTA") assertion_checks(resp, protein_insertion) - assert resp.vrs_ref_allele_seq is None + assert resp.variation.location.sequence is None assert resp.gene_context assert resp.warnings == [] # negative strand (CA860540) resp = await test_handler.gnomad_vcf_to_protein("1-53327836-A-AGCC") assertion_checks(resp, protein_insertion2) - assert resp.vrs_ref_allele_seq is None + assert resp.variation.location.sequence is None assert resp.gene_context assert resp.warnings == [] @@ -340,7 +347,6 @@ async def test_deletion(test_handler, protein_deletion_np_range, cdk11a_e314del) """Test that deletion queries return correct response""" resp = await test_handler.gnomad_vcf_to_protein("17-39723966-TTGAGGGAAAACACAT-T") assertion_checks(resp, protein_deletion_np_range) - assert resp.vrs_ref_allele_seq == "LRENT" assert resp.gene_context assert resp.warnings == [] @@ -356,7 +362,6 @@ async def test_delins(test_handler, delins_pos, delins_neg): # CA645561524, Positive Strand resp = await test_handler.gnomad_vcf_to_protein("7-55174776-TTAAGAGAAGCAACATCT-CAA") assertion_checks(resp, delins_pos) - assert resp.vrs_ref_allele_seq == "LREATS" assert resp.gene_context # ClinVar ID 1217291, Negative Strand @@ -364,7 +369,6 @@ async def test_delins(test_handler, delins_pos, delins_neg): "X-153870419-GCTGCCCCTGCAAGGCCACCAGGTGGCTGCTGGAGTTGGTGGGGAAGAGCAGGCGCGG-CTGTCAATGT" ) assertion_checks(resp, delins_neg) - assert resp.vrs_ref_allele_seq == "PRLLFPTNSSSHLVALQGQP" assert resp.gene_context # CA16602420. Example where protein gene not found, but cDNA gene found @@ -379,7 +383,6 @@ async def test_invalid(test_handler): """Test that invalid queries return correct response""" resp = await test_handler.gnomad_vcf_to_protein("BRAF V600E") assert resp.variation is None - assert resp.vrs_ref_allele_seq is None assert resp.gene_context is None assert resp.warnings == [ "BRAF V600E is not a gnomAD VCF-like query (`chr-pos-ref-alt`)" @@ -387,12 +390,10 @@ async def test_invalid(test_handler): resp = await test_handler.gnomad_vcf_to_protein("7-140753336-T-G") assert resp.variation is None - assert resp.vrs_ref_allele_seq is None assert resp.gene_context is None assert set(resp.warnings) == {"Unable to get cDNA and protein representation"} resp = await test_handler.gnomad_vcf_to_protein("20-2-TC-TG") assert resp.variation is None - assert resp.vrs_ref_allele_seq is None assert resp.gene_context is None assert resp.warnings == ["20-2-TC-TG is not a valid gnomad vcf query"] diff --git a/tests/test_hgvs_dup_del_mode.py b/tests/test_hgvs_dup_del_mode.py index 97ebf1a4..b018ec6e 100644 --- a/tests/test_hgvs_dup_del_mode.py +++ b/tests/test_hgvs_dup_del_mode.py @@ -56,6 +56,7 @@ def genomic_dup1_free_text_seq_loc_normalized(): }, "start": 1032, "end": 1034, + "sequence": "GG", "type": "SequenceLocation", } @@ -70,6 +71,7 @@ def genomic_dup1_free_text_seq_loc_not_normalized(): }, "start": 1033, "end": 1034, + "sequence": "G", "type": "SequenceLocation", } @@ -138,6 +140,7 @@ def seq_loc_gt_100_bp(): }, "start": 33211289, "end": 33211490, + "sequence": "TCTACTTCTTCCCACCAAAGCATTTTGAAAAGTGTATATCAAGGCAGCGATAAAAAAAACCTGGTAAAAGTTCTTCAAACTTTATTGCTCCAGTAGGCTTAAAAACAATGAGAAACCAACAAACTTCAGCAGCTTTAAAAAAAGTAACACTTCAGTTTTTCCTATTCGTTTTTCTCCGAAGGTAATTGCCTCCCAGATCTG", "type": "SequenceLocation", } @@ -167,6 +170,7 @@ def genomic_dup2_free_text_seq_loc(): }, "start": 256, "end": 260, + "sequence": "TAGA", "type": "SequenceLocation", } @@ -399,6 +403,7 @@ def genomic_del1_free_text_seq_loc(): }, "start": 557, "end": 558, + "sequence": "T", "type": "SequenceLocation", } @@ -513,6 +518,7 @@ def genomic_del2_free_text_seq_loc(): }, "start": 491, "end": 510, + "sequence": "ATGTTGACGGACAGCCTAT", "type": "SequenceLocation", } diff --git a/tests/test_normalize.py b/tests/test_normalize.py index 472c449d..5ba2ed06 100644 --- a/tests/test_normalize.py +++ b/tests/test_normalize.py @@ -28,6 +28,7 @@ def dis3_p63a(): "type": "SequenceReference", "refgetAccession": "SQ.mlWsxfPKINN3o300stAI8oqN5U7P6kEu", }, + "sequence": "P", "type": "SequenceLocation", }, "state": {"sequence": "A", "type": "LiteralSequenceExpression"}, @@ -66,6 +67,7 @@ def vhl(): "type": "SequenceReference", "refgetAccession": "SQ.z-Oa0pZkJ6GHJHOYM7h5mY_umc0SJzTu", }, + "sequence": "Y", "type": "SequenceLocation", }, "state": {"sequence": "*", "type": "LiteralSequenceExpression"}, @@ -85,6 +87,7 @@ def nm_004448_cdna_delins(): "type": "SequenceReference", "refgetAccession": "SQ.y9b4LVMiCXpZxOg9Xt1NwRtssA03MwWM", }, + "sequence": "GG", "type": "SequenceLocation", }, "state": {"sequence": "CT", "type": "LiteralSequenceExpression"}, @@ -104,6 +107,7 @@ def nm_000551(): "type": "SequenceReference", "refgetAccession": "SQ.xBKOKptLLDr-k4hTyCetvARn16pDS_rW", }, + "sequence": "C", "type": "SequenceLocation", }, "state": {"sequence": "AA", "type": "LiteralSequenceExpression"}, @@ -122,6 +126,7 @@ def braf_cdna_seq_loc(): "type": "SequenceReference", "refgetAccession": "SQ.aKMPEJgmlZXt_F6gRY5cUG3THH2n-GUa", }, + "sequence": "T", "type": "SequenceLocation", } @@ -159,6 +164,7 @@ def protein_delins(): "type": "SequenceReference", "refgetAccession": "SQ.vyo55F6mA6n2LgN4cagcdRzOuh38V4mE", }, + "sequence": "LREAT", "type": "SequenceLocation", }, "state": {"sequence": "P", "type": "LiteralSequenceExpression"}, @@ -180,6 +186,7 @@ def cdna_deletion(): "type": "SequenceReference", "refgetAccession": "SQ.y9b4LVMiCXpZxOg9Xt1NwRtssA03MwWM", }, + "sequence": "TTGAGGGAAAACACAT", "type": "SequenceLocation", }, "state": { @@ -206,6 +213,7 @@ def genomic_deletion(): "type": "SequenceReference", "refgetAccession": "SQ.Zu7h9AggXxhTaGVsy7h_EZSChSZGcmgX", }, + "sequence": "CTCT", "type": "SequenceLocation", }, "state": { @@ -249,6 +257,7 @@ def genomic_insertion(): "type": "SequenceReference", "refgetAccession": "SQ.y9b4LVMiCXpZxOg9Xt1NwRtssA03MwWM", }, + "sequence": "TACGTGATGGCT", "type": "SequenceLocation", }, "state": { @@ -273,6 +282,7 @@ def genomic_substitution(): "type": "SequenceReference", "refgetAccession": "SQ.d_QsP29RWJi6bac7GOC9cJ9AO7s_HUMN", }, + "sequence": "C", "type": "SequenceLocation", }, "state": {"sequence": "T", "type": "LiteralSequenceExpression"}, @@ -292,6 +302,7 @@ def gnomad_vcf_genomic_sub_mnv(): "type": "SequenceReference", "refgetAccession": "SQ.aUiQCzCPZ2d0csHbMSbh2NzInhonSXwI", }, + "sequence": "G", "type": "SequenceLocation", }, "state": {"sequence": "A", "type": "LiteralSequenceExpression"}, @@ -311,6 +322,7 @@ def genomic_sub_grch38(): "type": "SequenceReference", "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", }, + "sequence": "C", "type": "SequenceLocation", }, "state": {"sequence": "T", "type": "LiteralSequenceExpression"}, @@ -330,6 +342,7 @@ def grch38_braf_genom_reference_agree(): "type": "SequenceReference", "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", }, + "sequence": "A", "type": "SequenceLocation", }, "state": {"sequence": "A", "type": "LiteralSequenceExpression"}, @@ -349,6 +362,7 @@ def grch38_genomic_delins1(): "type": "SequenceReference", "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", }, + "sequence": "CA", "type": "SequenceLocation", }, "state": {"sequence": "AT", "type": "LiteralSequenceExpression"}, @@ -368,6 +382,7 @@ def grch38_genomic_delins2(): "type": "SequenceReference", "refgetAccession": "SQ.Zu7h9AggXxhTaGVsy7h_EZSChSZGcmgX", }, + "sequence": "C", "type": "SequenceLocation", }, "state": {"sequence": "AA", "type": "LiteralSequenceExpression"}, @@ -387,6 +402,7 @@ def genomic_delins_gene(): "type": "SequenceReference", "refgetAccession": "SQ.aKMPEJgmlZXt_F6gRY5cUG3THH2n-GUa", }, + "sequence": "TG", "type": "SequenceLocation", }, "state": {"sequence": "AT", "type": "LiteralSequenceExpression"}, @@ -410,6 +426,7 @@ def gnomad_vcf_genomic_delins1(): "type": "SequenceReference", "refgetAccession": "SQ.Zu7h9AggXxhTaGVsy7h_EZSChSZGcmgX", }, + "sequence": "AAAAGCTTTA", "type": "SequenceLocation", }, "state": {"sequence": "GAGGCTTT", "type": "LiteralSequenceExpression"}, @@ -429,6 +446,7 @@ def gnomad_vcf_genomic_delins2(): "type": "SequenceReference", "refgetAccession": "SQ.yC_0RBj3fgBlvgyAuycbzdubtLxq-rE0", }, + "sequence": "AG", "type": "SequenceLocation", }, "state": {"sequence": "TGAGTTT", "type": "LiteralSequenceExpression"}, @@ -448,6 +466,7 @@ def gnomad_vcf_genomic_delins3(): "type": "SequenceReference", "refgetAccession": "SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP", }, + "sequence": "GG", "type": "SequenceLocation", }, "state": { @@ -470,6 +489,7 @@ def gnomad_vcf_genomic_delins4(): "type": "SequenceReference", "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", }, + "sequence": "CCC", "type": "SequenceLocation", }, "state": { @@ -496,6 +516,7 @@ def gnomad_vcf_genomic_delins5(): }, "start": 7675139, "end": 7675141, + "sequence": "GG", }, "state": { "type": "ReferenceLengthExpression", diff --git a/tests/to_copy_number_variation/test_hgvs_to_copy_number.py b/tests/to_copy_number_variation/test_hgvs_to_copy_number.py index 309a491a..4e0bbd90 100644 --- a/tests/to_copy_number_variation/test_hgvs_to_copy_number.py +++ b/tests/to_copy_number_variation/test_hgvs_to_copy_number.py @@ -29,6 +29,7 @@ def genomic_dup1_37_loc(): }, "start": 49568694, "end": 49568695, + "sequence": "G", "type": "SequenceLocation", } @@ -76,6 +77,7 @@ def genomic_dup2_37_loc(): }, "start": 33229406, "end": 33229410, + "sequence": "TCTA", "type": "SequenceLocation", } @@ -344,6 +346,7 @@ def genomic_del1_37_loc(): }, "start": 10191494, "end": 10191495, + "sequence": "T", "type": "SequenceLocation", } @@ -391,6 +394,7 @@ def genomic_del2_37_loc(): }, "start": 10188278, "end": 10188297, + "sequence": "ATGTTGACGGACAGCCTAT", "type": "SequenceLocation", }