Skip to content

Commit

Permalink
include alias_protein_id in HGVS alias names, and make a new attribute called 'alias_protein_id'
Browse files Browse the repository at this point in the history
  • Loading branch information
miseminger committed Apr 29, 2024
1 parent 1a57d85 commit 047f085
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 7 deletions.
15 changes: 9 additions & 6 deletions bin/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

# standard variables used by all scripts
empty_attributes = 'ID=;Name=;alias=;gene=;protein_name=;protein_symbol=;\
protein_id=;transcript_id=;ps_filter=;ps_exc=; \
protein_id=;alias_protein_id=;transcript_id=;ps_filter=;ps_exc=; \
mat_pep=;mat_pep_desc=;mat_pep_acc=;ro=;ao=;dp=;sample_size=; \
Reference_seq=;Variant_seq=;nt_name=;aa_name=;hgvs_nt=;hgvs_aa=;hgvs_alias=; \
vcf_gene=;mutation_type=;viral_lineage=;multi_aa_name=; \
Expand Down Expand Up @@ -122,15 +122,15 @@ def add_hgvs_names(new_gvf):
convert_amino_acid_codes)

# fill in 'hgvs_alias'
# add hgvs alias snps to rows with protein_id!=n/a
alias_snp_mask = (new_gvf['alias'].str.contains(aa_snp_regex, regex=True)) & (new_gvf['protein_id']!='n/a') & (new_gvf['alias']!='n/a')
# add hgvs alias snps to rows with alias_protein_id!=n/a
alias_snp_mask = (new_gvf['alias'].str.contains(aa_snp_regex, regex=True)) & (new_gvf['alias_protein_id']!='n/a') & (new_gvf['alias']!='n/a')
new_gvf.loc[alias_snp_mask, 'hgvs_alias'] = \
new_gvf["protein_id"] + ":" + new_gvf.loc[alias_snp_mask, 'alias'].apply(
new_gvf["alias_protein_id"] + ":" + new_gvf.loc[alias_snp_mask, 'alias'].apply(
convert_amino_acid_codes)
# add hgvs alias names for non-snps
alias_non_snp_mask = (new_gvf['alias'].str.contains(aa_other_regex, regex=True)) & (new_gvf['protein_id']!='n/a') & (new_gvf['alias']!='n/a')
alias_non_snp_mask = (new_gvf['alias'].str.contains(aa_other_regex, regex=True)) & (new_gvf['alias_protein_id']!='n/a') & (new_gvf['alias']!='n/a')
new_gvf.loc[alias_non_snp_mask, 'hgvs_alias'] = \
new_gvf["protein_id"] + ":" + new_gvf.loc[alias_non_snp_mask, 'alias'].apply(
new_gvf["alias_protein_id"] + ":" + new_gvf.loc[alias_non_snp_mask, 'alias'].apply(
convert_amino_acid_codes)

return(new_gvf)
Expand Down Expand Up @@ -570,7 +570,10 @@ def add_alias_names(df, GENE_PROTEIN_POSITIONS_DICT):
# for each nsp in nsps_list, operate on the number column based on the nsp start coordinates
for nsp in nsps_list:
nsp_start_aa = int(GENE_PROTEIN_POSITIONS_DICT[nsp]["aa_start"])
nsp_protein_id = GENE_PROTEIN_POSITIONS_DICT[nsp]["protein_id"]
nsp_mask = df['mat_pep']==nsp
# add alias_protein_id
df.loc[nsp_mask, 'alias_protein_id'] = nsp_protein_id
# for each half of the mutation name...
# update the numeric part
df.loc[nsp_mask, '1_newnum'] = df['1_num'].astype(int) - nsp_start_aa + 1
Expand Down
3 changes: 2 additions & 1 deletion bin/vcf2gvf.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,8 +88,9 @@ def vcftogvf(vcf, strain, GENE_PROTEIN_POSITIONS_DICT, sample_size):
new_gvf["protein_symbol"] = json_df["protein_symbol"]
new_gvf["protein_id"] = json_df["protein_id"]
new_gvf["transcript_id"] = json_df["transcript_id"]
new_gvf["alias_protein_id"] = 'n/a'

# add 'alias' column for ORF1a/b mutations
# add 'alias' column for ORF1a/b mutations, and fill in "alias_protein_id" for these as well
new_gvf = add_alias_names(new_gvf, GENE_PROTEIN_POSITIONS_DICT)

# add clade_defining attribute
Expand Down

0 comments on commit 047f085

Please sign in to comment.