Skip to content

Commit

Permalink
Use "accession" column as ID column directly
Browse files Browse the repository at this point in the history
New options in Augur 22.1.0 allow usage of this column as the ID column
across all subcommands that read metadata.

For this workflow in particular, the metadata file can now be used
as-is. This removes the need for a modified copy of the metadata. It
also allows specifying an original metadata column "strain" as the
display strain field, rather than a column "strain_original" generated
during the Snakemake workflow.
  • Loading branch information
victorlin committed Jul 17, 2023
1 parent ba171f4 commit e012512
Show file tree
Hide file tree
Showing 5 changed files with 15 additions and 46 deletions.
2 changes: 1 addition & 1 deletion config/configfile.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ exclude: "config/outliers.txt"
description: "config/description.md"

strain_id_field: "accession"
display_strain_field: "strain_original"
display_strain_field: "strain"

subtypes: ['a', 'b']

Expand Down
25 changes: 0 additions & 25 deletions scripts/wrangle_metadata.py

This file was deleted.

2 changes: 1 addition & 1 deletion workflow/envs/nextstrain.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,6 @@ dependencies:
- pip
- pip:
- awscli==1.18.45
- nextstrain-augur==11.3.0
- nextstrain-augur==22.1.0
- nextstrain-cli==1.16.5
- rethinkdb==2.3.0.post6
28 changes: 10 additions & 18 deletions workflow/snakemake_rules/core.smk
Original file line number Diff line number Diff line change
Expand Up @@ -4,20 +4,6 @@ This part of the workflow expects input files
metadata = "data/metadata.tsv"
'''

rule wrangle_metadata:
input:
metadata="data/{a_or_b}/metadata.tsv",
output:
metadata="data/{a_or_b}/metadata_by_accession.tsv",
params:
strain_id=lambda w: config.get("strain_id_field", "strain"),
shell:
"""
python3 scripts/wrangle_metadata.py --metadata {input.metadata} \
--strain-id {params.strain_id} \
--output {output.metadata}
"""

rule index_sequences:
message:
"""
Expand Down Expand Up @@ -64,21 +50,23 @@ rule filter:
input:
sequences = "data/{a_or_b}/sequences.fasta",
reference = "config/{a_or_b}reference.gbk",
metadata = "data/{a_or_b}/metadata_by_accession.tsv",
metadata = "data/{a_or_b}/metadata.tsv",
sequence_index = rules.index_sequences.output,
exclude = config['exclude']
output:
sequences = build_dir + "/{a_or_b}/{build_name}/filtered.fasta"
params:
group_by = config["filter"]["group_by"],
min_coverage = lambda w: f'{w.build_name}_coverage>{config["filter"]["min_coverage"].get(w.build_name, 10000)}',
subsample_max_sequences = lambda w: config["filter"]["subsample_max_sequences"].get(w.build_name, 1000)
subsample_max_sequences = lambda w: config["filter"]["subsample_max_sequences"].get(w.build_name, 1000),
strain_id=lambda w: config.get("strain_id_field", "strain"),
shell:
"""
augur filter \
--sequences {input.sequences} \
--sequence-index {input.sequence_index} \
--metadata {input.metadata} \
--metadata-id-columns {params.strain_id} \
--exclude {input.exclude} \
--output {output.sequences} \
--group-by {params.group_by} \
Expand Down Expand Up @@ -196,13 +184,15 @@ rule refine:
params:
coalescent = config["refine"]["coalescent"],
clock_filter_iqd = config["refine"]["clock_filter_iqd"],
date_inference = config["refine"]["date_inference"]
date_inference = config["refine"]["date_inference"],
strain_id=lambda w: config.get("strain_id_field", "strain"),
shell:
"""
augur refine \
--tree {input.tree} \
--alignment {input.alignment} \
--metadata {input.metadata} \
--metadata-id-columns {params.strain_id} \
--output-tree {output.tree} \
--output-node-data {output.node_data} \
--coalescent {params.coalescent} \
Expand Down Expand Up @@ -264,12 +254,14 @@ rule traits:
log:
"logs/{a_or_b}/traits_{build_name}_rsv.txt"
params:
columns = config["traits"]["columns"]
columns = config["traits"]["columns"],
strain_id=lambda w: config.get("strain_id_field", "strain"),
shell:
"""
augur traits \
--tree {input.tree} \
--metadata {input.metadata} \
--metadata-id-columns {params.strain_id} \
--output {output.node_data} \
--columns {params.columns} \
--confidence
Expand Down
4 changes: 3 additions & 1 deletion workflow/snakemake_rules/export.smk
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,14 @@ rule export:
auspice_json = build_dir + "/{a_or_b}/{build_name}/tree.json",
root_sequence = build_dir + "/{a_or_b}/{build_name}/tree_root-sequence.json"
params:
title = lambda w: f"RSV-{w.a_or_b.upper()} phylogeny"
title = lambda w: f"RSV-{w.a_or_b.upper()} phylogeny",
strain_id=lambda w: config.get("strain_id_field", "strain"),
shell:
"""
augur export v2 \
--tree {input.tree} \
--metadata {input.metadata} \
--metadata-id-columns {params.strain_id} \
--node-data {input.node_data} \
--title {params.title:q} \
--description {input.description} \
Expand Down

0 comments on commit e012512

Please sign in to comment.