From 047f08521079dab64583312f43ca04321cb2b49c Mon Sep 17 00:00:00 2001 From: miseminger Date: Mon, 29 Apr 2024 16:56:46 -0700 Subject: [PATCH] include alias_protein_id in HGVS alias names, and make a new attribute called 'alias_protein_id' --- bin/functions.py | 15 +++++++++------ bin/vcf2gvf.py | 3 ++- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/bin/functions.py b/bin/functions.py index 1479eca..db725d1 100755 --- a/bin/functions.py +++ b/bin/functions.py @@ -4,7 +4,7 @@ # standard variables used by all scripts empty_attributes = 'ID=;Name=;alias=;gene=;protein_name=;protein_symbol=;\ - protein_id=;transcript_id=;ps_filter=;ps_exc=; \ + protein_id=;alias_protein_id=;transcript_id=;ps_filter=;ps_exc=; \ mat_pep=;mat_pep_desc=;mat_pep_acc=;ro=;ao=;dp=;sample_size=; \ Reference_seq=;Variant_seq=;nt_name=;aa_name=;hgvs_nt=;hgvs_aa=;hgvs_alias=; \ vcf_gene=;mutation_type=;viral_lineage=;multi_aa_name=; \ @@ -122,15 +122,15 @@ def add_hgvs_names(new_gvf): convert_amino_acid_codes) # fill in 'hgvs_alias' - # add hgvs alias snps to rows with protein_id!=n/a - alias_snp_mask = (new_gvf['alias'].str.contains(aa_snp_regex, regex=True)) & (new_gvf['protein_id']!='n/a') & (new_gvf['alias']!='n/a') + # add hgvs alias snps to rows with alias_protein_id!=n/a + alias_snp_mask = (new_gvf['alias'].str.contains(aa_snp_regex, regex=True)) & (new_gvf['alias_protein_id']!='n/a') & (new_gvf['alias']!='n/a') new_gvf.loc[alias_snp_mask, 'hgvs_alias'] = \ - new_gvf["protein_id"] + ":" + new_gvf.loc[alias_snp_mask, 'alias'].apply( + new_gvf["alias_protein_id"] + ":" + new_gvf.loc[alias_snp_mask, 'alias'].apply( convert_amino_acid_codes) # add hgvs alias names for non-snps - alias_non_snp_mask = (new_gvf['alias'].str.contains(aa_other_regex, regex=True)) & (new_gvf['protein_id']!='n/a') & (new_gvf['alias']!='n/a') + alias_non_snp_mask = (new_gvf['alias'].str.contains(aa_other_regex, regex=True)) & (new_gvf['alias_protein_id']!='n/a') & (new_gvf['alias']!='n/a') new_gvf.loc[alias_non_snp_mask, 'hgvs_alias'] = \ - new_gvf["protein_id"] + ":" + new_gvf.loc[alias_non_snp_mask, 'alias'].apply( + new_gvf["alias_protein_id"] + ":" + new_gvf.loc[alias_non_snp_mask, 'alias'].apply( convert_amino_acid_codes) return(new_gvf) @@ -570,7 +570,10 @@ def add_alias_names(df, GENE_PROTEIN_POSITIONS_DICT): # for each nsp in nsps_list, operate on the number column based on the nsp start coordinates for nsp in nsps_list: nsp_start_aa = int(GENE_PROTEIN_POSITIONS_DICT[nsp]["aa_start"]) + nsp_protein_id = GENE_PROTEIN_POSITIONS_DICT[nsp]["protein_id"] nsp_mask = df['mat_pep']==nsp + # add alias_protein_id + df.loc[nsp_mask, 'alias_protein_id'] = nsp_protein_id # for each half of the mutation name... # update the numeric part df.loc[nsp_mask, '1_newnum'] = df['1_num'].astype(int) - nsp_start_aa + 1 diff --git a/bin/vcf2gvf.py b/bin/vcf2gvf.py index 0e77490..5037c58 100755 --- a/bin/vcf2gvf.py +++ b/bin/vcf2gvf.py @@ -88,8 +88,9 @@ def vcftogvf(vcf, strain, GENE_PROTEIN_POSITIONS_DICT, sample_size): new_gvf["protein_symbol"] = json_df["protein_symbol"] new_gvf["protein_id"] = json_df["protein_id"] new_gvf["transcript_id"] = json_df["transcript_id"] + new_gvf["alias_protein_id"] = 'n/a' - # add 'alias' column for ORF1a/b mutations + # add 'alias' column for ORF1a/b mutations, and fill in "alias_protein_id" for these as well new_gvf = add_alias_names(new_gvf, GENE_PROTEIN_POSITIONS_DICT) # add clade_defining attribute