include alias_protein_id in HGVS alias names, and make a new attribute called 'alias_protein_id'
cidgoh · Apr 29, 2024 · 047f085 · 047f085
1 parent 1a57d85
commit 047f085
Show file tree

Hide file tree

Showing 2 changed files with 11 additions and 7 deletions.
diff --git a/bin/functions.py b/bin/functions.py
@@ -4,7 +4,7 @@
 
 # standard variables used by all scripts
 empty_attributes = 'ID=;Name=;alias=;gene=;protein_name=;protein_symbol=;\
-    protein_id=;transcript_id=;ps_filter=;ps_exc=; \
+    protein_id=;alias_protein_id=;transcript_id=;ps_filter=;ps_exc=; \
     mat_pep=;mat_pep_desc=;mat_pep_acc=;ro=;ao=;dp=;sample_size=; \
     Reference_seq=;Variant_seq=;nt_name=;aa_name=;hgvs_nt=;hgvs_aa=;hgvs_alias=; \
     vcf_gene=;mutation_type=;viral_lineage=;multi_aa_name=; \
@@ -122,15 +122,15 @@ def add_hgvs_names(new_gvf):
                     convert_amino_acid_codes)
 
     # fill in 'hgvs_alias'
-    # add hgvs alias snps to rows with protein_id!=n/a
-    alias_snp_mask = (new_gvf['alias'].str.contains(aa_snp_regex, regex=True)) & (new_gvf['protein_id']!='n/a') & (new_gvf['alias']!='n/a')
+    # add hgvs alias snps to rows with alias_protein_id!=n/a
+    alias_snp_mask = (new_gvf['alias'].str.contains(aa_snp_regex, regex=True)) & (new_gvf['alias_protein_id']!='n/a') & (new_gvf['alias']!='n/a')
     new_gvf.loc[alias_snp_mask, 'hgvs_alias'] = \
-                new_gvf["protein_id"] + ":" + new_gvf.loc[alias_snp_mask, 'alias'].apply(
+                new_gvf["alias_protein_id"] + ":" + new_gvf.loc[alias_snp_mask, 'alias'].apply(
                     convert_amino_acid_codes)
     # add hgvs alias names for non-snps
-    alias_non_snp_mask = (new_gvf['alias'].str.contains(aa_other_regex, regex=True)) & (new_gvf['protein_id']!='n/a') & (new_gvf['alias']!='n/a')
+    alias_non_snp_mask = (new_gvf['alias'].str.contains(aa_other_regex, regex=True)) & (new_gvf['alias_protein_id']!='n/a') & (new_gvf['alias']!='n/a')
     new_gvf.loc[alias_non_snp_mask, 'hgvs_alias'] = \
-                new_gvf["protein_id"] + ":" + new_gvf.loc[alias_non_snp_mask, 'alias'].apply(
+                new_gvf["alias_protein_id"] + ":" + new_gvf.loc[alias_non_snp_mask, 'alias'].apply(
                     convert_amino_acid_codes)
 
     return(new_gvf)
@@ -570,7 +570,10 @@ def add_alias_names(df, GENE_PROTEIN_POSITIONS_DICT):
         # for each nsp in nsps_list, operate on the number column based on the nsp start coordinates
         for nsp in nsps_list:
             nsp_start_aa = int(GENE_PROTEIN_POSITIONS_DICT[nsp]["aa_start"])
+            nsp_protein_id = GENE_PROTEIN_POSITIONS_DICT[nsp]["protein_id"]
             nsp_mask = df['mat_pep']==nsp
+            # add alias_protein_id
+            df.loc[nsp_mask, 'alias_protein_id'] = nsp_protein_id
             # for each half of the mutation name...
             # update the numeric part
             df.loc[nsp_mask, '1_newnum'] = df['1_num'].astype(int) - nsp_start_aa + 1

diff --git a/bin/vcf2gvf.py b/bin/vcf2gvf.py
@@ -88,8 +88,9 @@ def vcftogvf(vcf, strain, GENE_PROTEIN_POSITIONS_DICT, sample_size):
     new_gvf["protein_symbol"] = json_df["protein_symbol"]
     new_gvf["protein_id"] = json_df["protein_id"]
     new_gvf["transcript_id"] = json_df["transcript_id"]
+    new_gvf["alias_protein_id"] = 'n/a'
 
-    # add 'alias' column for ORF1a/b mutations
+    # add 'alias' column for ORF1a/b mutations, and fill in "alias_protein_id" for these as well
     new_gvf = add_alias_names(new_gvf, GENE_PROTEIN_POSITIONS_DICT)
 
     # add clade_defining attribute