Skip to content

Commit

Permalink
change 'gene' attribute to 'gene_name' in GVF
Browse files Browse the repository at this point in the history
  • Loading branch information
miseminger committed Jul 30, 2024
1 parent c93085c commit 257d230
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 5 deletions.
6 changes: 3 additions & 3 deletions bin/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import logging

# standard variables used by all scripts
empty_attributes = 'ID=;Name=;alias=;gene=;gene_symbol=;protein_name=;protein_symbol=;\
empty_attributes = 'ID=;Name=;alias=;gene_name=;gene_symbol=;protein_name=;protein_symbol=;\
protein_id=;alias_protein_id=;transcript_id=;ps_filter=;ps_exc=; \
mat_pep=;mat_pep_desc=;mat_pep_acc=;ro=;ao=;dp=;sample_size=; \
Reference_seq=;Variant_seq=;nt_name=;aa_name=;hgvs_nt=;hgvs_aa=;hgvs_alias=; \
Expand Down Expand Up @@ -506,7 +506,7 @@ def map_pos_to_gene_protein(pos, GENE_PROTEIN_POSITIONS_DICT):
protein_symbol = GENE_PROTEIN_POSITIONS_DICT[entry]["protein_alias"]
protein_id = GENE_PROTEIN_POSITIONS_DICT[entry]["protein_id"]
transcript_id = GENE_PROTEIN_POSITIONS_DICT[entry]["locus_tag"]

# fill in attributes for mutations in this CDS region
cds_mask = df[pos_column].astype(int).between(start, end, inclusive="both")
df.loc[cds_mask, "gene"] = gene
Expand Down Expand Up @@ -542,7 +542,7 @@ def add_alias_names(df, GENE_PROTEIN_POSITIONS_DICT):
df.loc[:, 'alias'] = 'n/a'

# get list of all NSP, 3CL, and PlPro proteins in the file:
alias_mask = (df['gene'].str.contains("ORF1ab")) & (df['mat_pep']!='n/a')
alias_mask = (df['gene_symbol'].str.contains("ORF1ab")) & (df['mat_pep']!='n/a')
nsps_list = sorted(list(set(df[alias_mask]['mat_pep'].tolist())))
if len(nsps_list) > 0:

Expand Down
4 changes: 2 additions & 2 deletions bin/vcf2gvf.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
files. Required user input is a VCF file.
The attributes completed by this script are:
['ID', 'Name', 'gene', 'protein_name', 'protein_symbol', 'protein_id', 'ps_filter', 'ps_exc', 'mat_pep',
['ID', 'Name', 'gene_name', 'gene_symbol', 'protein_name', 'protein_symbol', 'protein_id', 'ps_filter', 'ps_exc', 'mat_pep',
'mat_pep_desc','mat_pep_acc', 'ro', 'ao', 'dp', 'sample_size', 'Reference_seq',
'Variant_seq', 'nt_name', 'aa_name', 'hgvs_nt', 'hgvs_aa', 'hgvs_alias', 'vcf_gene', 'mutation_type',
'viral_lineage', 'alternate_frequency', 'transcript_id']
Expand Down Expand Up @@ -83,7 +83,7 @@ def vcftogvf(vcf, strain, GENE_PROTEIN_POSITIONS_DICT, sample_size):
# add gene and protein attributes from JSON
json_df = map_pos_to_gene_protein(
vcf_df['POS'].astype(int), GENE_PROTEIN_POSITIONS_DICT)
new_gvf["gene"] = json_df["gene"]
new_gvf["gene_name"] = json_df["gene"]
new_gvf["gene_symbol"] = json_df["gene"]
new_gvf["protein_name"] = json_df["protein_name"]
new_gvf["protein_symbol"] = json_df["protein_symbol"]
Expand Down

1 comment on commit 257d230

@miseminger
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

GVF, not VCF

Please sign in to comment.