Skip to content

Commit

Permalink
Merge pull request #68 from nextstrain/rsv-revamp
Browse files Browse the repository at this point in the history
Add resolutions to rsv
  • Loading branch information
rneher authored Jul 16, 2024
2 parents 20e7697 + 1360175 commit 61a4a5d
Show file tree
Hide file tree
Showing 12 changed files with 43,773 additions and 6,749 deletions.
5 changes: 3 additions & 2 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,10 @@ auspice_dir = 'auspice'

rule all:
input:
expand("auspice/rsv_{subtype}_{build}.json",
expand("auspice/rsv_{subtype}_{build}_{resolution}.json",
subtype = config.get("subtypes",['a']),
build = config.get("buildstorun", ['genome'])),
build = config.get("builds_to_run", ['genome']),
resolution = config.get("resolutions_to_run", ["all-time"])),

include: "workflow/snakemake_rules/chores.smk"

Expand Down
20 changes: 18 additions & 2 deletions config/configfile.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@ conda_environment: "workflow/envs/nextstrain.yaml"

genesforglycosylation: ["G", "F"]

buildstorun: ["genome", "G", "F"]
builds_to_run: ["genome", "G", "F"]

resolutions_to_run: ["all-time", "6y", "3y"]

exclude: "config/outliers.txt"

Expand All @@ -20,8 +22,22 @@ filter:
G: 0.3
F: 0.3

min_length:
genome: 10000
G: 600
F: 1200
resolutions:
all-time:
min_date: 100Y
6y:
min_date: 6Y
background_min_date: 100Y
3y:
min_date: 3Y
background_min_date: 100Y

subsample_max_sequences:
genome: 2000
genome: 3000
G: 3000
F: 3000

Expand Down
7 changes: 7 additions & 0 deletions config/outliers.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,15 @@

MW455133 # over diverged/misdated
OK649675 # over diverged/misdated
KP317955
JX661581
OR140548


# B
JX489420 # over diverged/misdated
KF246645
KF246646
MF426028
KJ672473
OR326815
150 changes: 90 additions & 60 deletions example_data/a/metadata.tsv

Large diffs are not rendered by default.

25,290 changes: 21,970 additions & 3,320 deletions example_data/a/sequences.fasta

Large diffs are not rendered by default.

143 changes: 87 additions & 56 deletions example_data/b/metadata.tsv

Large diffs are not rendered by default.

24,749 changes: 21,484 additions & 3,265 deletions example_data/b/sequences.fasta

Large diffs are not rendered by default.

5 changes: 4 additions & 1 deletion nextclade/config/pathogen.json
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,10 @@
},
"stopCodons": {
"enabled": true,
"ignoredStopCodons": []
"ignoredStopCodons": [{
"codon":320,
"cdsName":"G"
}]
}
},
"cdsOrderPreference": [
Expand Down
12 changes: 6 additions & 6 deletions workflow/snakemake_rules/clades.smk
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@ rule clades_genome:
nuc_muts = rules.ancestral.output.node_data,
clades = "config/clades_genome_{a_or_b}.tsv"
output:
node_data = build_dir + "/{a_or_b}/{build_name}/clades_genome.json"
node_data = build_dir + "/{a_or_b}/{build_name}/{resolution}/clades_genome.json"
log:
"logs/{a_or_b}/clades_genome_{build_name}.txt"
"logs/{a_or_b}/clades_genome_{build_name}_{resolution}.txt"
shell:
"""
augur clades --tree {input.tree} \
Expand All @@ -29,9 +29,9 @@ rule clades_Goya:
nuc_muts = rules.ancestral.output.node_data,
clades = "config/clades_G_{a_or_b}.tsv"
output:
node_data = build_dir + "/{a_or_b}/{build_name}/clades_G.json"
node_data = build_dir + "/{a_or_b}/{build_name}/{resolution}/clades_G.json"
log:
"logs/{a_or_b}/clades_{build_name}.txt"
"logs/{a_or_b}/clades_{build_name}_{resolution}.txt"
shell:
"""
augur clades --tree {input.tree} \
Expand All @@ -50,9 +50,9 @@ rule clades_consortium:
nuc_muts = rules.ancestral.output.node_data,
clades = "config/clades_consortium_{a_or_b}.tsv"
output:
node_data = build_dir + "/{a_or_b}/{build_name}/clades_consortium.json"
node_data = build_dir + "/{a_or_b}/{build_name}/{resolution}/clades_consortium.json"
log:
"logs/{a_or_b}/clades_{build_name}.txt"
"logs/{a_or_b}/clades_{build_name}_{resolution}.txt"
shell:
"""
augur clades --tree {input.tree} \
Expand Down
125 changes: 96 additions & 29 deletions workflow/snakemake_rules/core.smk
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ rule index_sequences:
input:
sequences = "data/{a_or_b}/sequences.fasta"
output:
sequence_index = build_dir + "/{a_or_b}/{build_name}/sequence_index.tsv"
sequence_index = build_dir + "/{a_or_b}/{build_name}/{resolution}/sequence_index.tsv"
shell:
"""
augur index \
Expand All @@ -28,8 +28,8 @@ rule newreference:
input:
oldreference = "config/{a_or_b}reference.gbk"
output:
newreferencegbk = build_dir + "/{a_or_b}/{build_name}/{gene}_reference.gbk",
newreferencefasta = build_dir + "/{a_or_b}/{build_name}/{gene}_reference.fasta",
newreferencegbk = build_dir + "/{a_or_b}/{build_name}/{resolution}/{gene}_reference.gbk",
newreferencefasta = build_dir + "/{a_or_b}/{build_name}/{resolution}/{gene}_reference.fasta",
params:
gene = lambda w: w.gene,
shell:
Expand All @@ -41,8 +41,7 @@ rule newreference:
--gene {params.gene}
"""


rule filter:
rule filter_recent:
message:
"""
filtering sequences
Expand All @@ -54,12 +53,14 @@ rule filter:
sequence_index = rules.index_sequences.output,
exclude = config['exclude']
output:
sequences = build_dir + "/{a_or_b}/{build_name}/filtered.fasta"
sequences = build_dir + "/{a_or_b}/{build_name}/{resolution}/filtered_recent.fasta"
params:
group_by = config["filter"]["group_by"],
min_coverage = lambda w: f'{w.build_name}_coverage>{config["filter"]["min_coverage"].get(w.build_name, 10000)}',
min_length = lambda w: config["filter"]["min_length"].get(w.build_name, 10000),
subsample_max_sequences = lambda w: config["filter"]["subsample_max_sequences"].get(w.build_name, 1000),
strain_id=config["strain_id_field"],
min_date=lambda w: config['filter']['resolutions'][w.resolution]["min_date"]
shell:
"""
augur filter \
Expand All @@ -69,27 +70,92 @@ rule filter:
--metadata-id-columns {params.strain_id} \
--exclude {input.exclude} \
--exclude-where 'qc.overallStatus=bad' \
--min-date {params.min_date} \
--min-length {params.min_length} \
--output {output.sequences} \
--group-by {params.group_by} \
--subsample-max-sequences {params.subsample_max_sequences} \
--query '{params.min_coverage}'
"""

rule filter_background:
message:
"""
filtering sequences
"""
input:
sequences = "data/{a_or_b}/sequences.fasta",
reference = "config/{a_or_b}reference.gbk",
metadata = "data/{a_or_b}/metadata.tsv",
sequence_index = rules.index_sequences.output,
exclude = config['exclude']
output:
sequences = build_dir + "/{a_or_b}/{build_name}/{resolution}/filtered_background.fasta"
params:
group_by = config["filter"]["group_by"],
min_coverage = lambda w: f'{w.build_name}_coverage>{config["filter"]["min_coverage"].get(w.build_name, 10000)}',
min_length = lambda w: config["filter"]["min_length"].get(w.build_name, 10000),
subsample_max_sequences = lambda w: int(config["filter"]["subsample_max_sequences"].get(w.build_name, 1000))//10,
strain_id=config["strain_id_field"],
max_date=lambda w: config['filter']['resolutions'][w.resolution]["min_date"],
min_date=lambda w: config['filter']['resolutions'][w.resolution]["background_min_date"]
shell:
"""
augur filter \
--sequences {input.sequences} \
--sequence-index {input.sequence_index} \
--metadata {input.metadata} \
--metadata-id-columns {params.strain_id} \
--exclude {input.exclude} \
--exclude-where 'qc.overallStatus=bad' 'qc.overallStatus=mediocre' \
--min-date {params.min_date} \
--max-date {params.max_date} \
--min-length {params.min_length} \
--output {output.sequences} \
--group-by {params.group_by} \
--subsample-max-sequences {params.subsample_max_sequences} \
--query '{params.min_coverage}'
"""

rule combine_samples:
input:
subsamples = lambda w: [rules.filter_recent.output.sequences, rules.filter_background.output.sequences] if 'background_min_date' in config['filter']['resolutions'][w.resolution] else [rules.filter_recent.output.sequences]
output:
sequences = build_dir + "/{a_or_b}/{build_name}/{resolution}/filtered.fasta"
shell:
"""
cat {input.subsamples} | seqkit rmdup > {output}
"""

rule get_nextclade_dataset:
message:
"""
fetching nextclade dataset
"""
output:
dataset="nextclade_rsv-{a_or_b}.zip"
params:
ds_name = lambda w: "nextstrain/rsv/a/EPI_ISL_412866" if w.a_or_b=='a' else "nextstrain/rsv/b/EPI_ISL_1653999"
shell:
"""
nextclade3 dataset get -n {params.ds_name} --output-zip {output.dataset}
"""

rule genome_align:
message:
"""
Aligning sequences to {input.reference}
Aligning sequences to the reference
"""
input:
sequences = rules.filter.output.sequences,
reference = build_dir + "/{a_or_b}/{build_name}/genome_reference.fasta"
sequences = rules.combine_samples.output.sequences,
dataset = rules.get_nextclade_dataset.output.dataset
output:
alignment = build_dir + "/{a_or_b}/{build_name}/sequences.aligned.fasta"
alignment = build_dir + "/{a_or_b}/{build_name}/{resolution}/sequences.aligned.fasta"
threads: 4
shell:
"""
nextclade3 run -j {threads}\
--input-ref {input.reference} \
-D {input.dataset} \
--output-fasta {output.alignment} \
{input.sequences}
"""
Expand All @@ -100,7 +166,7 @@ rule cut:
oldalignment = rules.genome_align.output.alignment,
reference = "config/{a_or_b}reference.gbk"
output:
slicedalignment = build_dir + "/{a_or_b}/{build_name}/{gene}_slicedalignment.fasta"
slicedalignment = build_dir + "/{a_or_b}/{build_name}/{resolution}/{gene}_slicedalignment.fasta"
params:
gene = lambda w: w.gene
shell:
Expand All @@ -116,9 +182,9 @@ rule cut:
rule realign:
input:
slicedalignment = rules.cut.output.slicedalignment,
reference = build_dir + "/{a_or_b}/{build_name}/{gene}_reference.fasta"
reference = build_dir + "/{a_or_b}/{build_name}/{resolution}/{gene}_reference.fasta"
output:
realigned = build_dir + "/{a_or_b}/{build_name}/{gene}_aligned.fasta"
realigned = build_dir + "/{a_or_b}/{build_name}/{resolution}/{gene}_aligned.fasta"
threads: 4
shell:
"""
Expand All @@ -132,10 +198,10 @@ rule realign:
rule hybrid_align:
input:
original = rules.genome_align.output.alignment,
G_alignment = build_dir + "/{a_or_b}/{build_name}/G_aligned.fasta",
G_alignment = build_dir + "/{a_or_b}/{build_name}/{resolution}/G_aligned.fasta",
reference = "config/{a_or_b}reference.gbk"
output:
hybrid_alignment = build_dir + "/{a_or_b}/{build_name}/hybrid_alignment.fasta"
hybrid_alignment = build_dir + "/{a_or_b}/{build_name}/{resolution}/hybrid_alignment.fasta"
params:
gene = lambda w: w.build_name
shell:
Expand All @@ -152,20 +218,21 @@ def get_alignment(w):
if w.build_name == "genome":
return rules.hybrid_align.output.hybrid_alignment
else:
return build_dir + f"/{w.a_or_b}/{w.build_name}/{w.build_name}_aligned.fasta"
return build_dir + f"/{w.a_or_b}/{w.build_name}/{w.resolution}/{w.build_name}_aligned.fasta"

rule tree:
message: "Building tree"
input:
alignment = get_alignment
output:
tree = build_dir + "/{a_or_b}/{build_name}/tree_raw.nwk"
tree = build_dir + "/{a_or_b}/{build_name}/{resolution}/tree_raw.nwk"
threads: 4
shell:
"""
augur tree \
--alignment {input.alignment} \
--output {output.tree} \
--tree-builder-args '-ninit 10 -n 4 -czb' \
--nthreads {threads}
"""

Expand All @@ -180,10 +247,10 @@ rule refine:
input:
tree = rules.tree.output.tree,
alignment =get_alignment,
metadata = rules.filter.input.metadata
metadata = "data/{a_or_b}/metadata.tsv"
output:
tree = build_dir + "/{a_or_b}/{build_name}/tree.nwk",
node_data = build_dir + "/{a_or_b}/{build_name}/branch_lengths.json"
tree = build_dir + "/{a_or_b}/{build_name}/{resolution}/tree.nwk",
node_data = build_dir + "/{a_or_b}/{build_name}/{resolution}/branch_lengths.json"
params:
coalescent = config["refine"]["coalescent"],
clock_filter_iqd = config["refine"]["clock_filter_iqd"],
Expand Down Expand Up @@ -215,9 +282,9 @@ rule ancestral:
input:
tree = rules.refine.output.tree,
alignment = get_alignment,
root_sequence = build_dir + "/{a_or_b}/{build_name}/{build_name}_reference.gbk"
root_sequence = build_dir + "/{a_or_b}/{build_name}/{resolution}/{build_name}_reference.gbk"
output:
node_data = build_dir + "/{a_or_b}/{build_name}/nt_muts.json"
node_data = build_dir + "/{a_or_b}/{build_name}/{resolution}/nt_muts.json"
params:
inference = config["ancestral"]["inference"]
shell:
Expand All @@ -235,11 +302,11 @@ rule translate:
input:
tree = rules.refine.output.tree,
node_data = rules.ancestral.output.node_data,
reference = build_dir + "/{a_or_b}/{build_name}/{build_name}_reference.gbk",
reference = build_dir + "/{a_or_b}/{build_name}/{resolution}/{build_name}_reference.gbk",
output:
node_data = build_dir + "/{a_or_b}/{build_name}/aa_muts.json"
node_data = build_dir + "/{a_or_b}/{build_name}/{resolution}/aa_muts.json"
params:
alignment_file_mask = build_dir + "/{a_or_b}/{build_name}/aligned_%GENE.fasta"
alignment_file_mask = build_dir + "/{a_or_b}/{build_name}/{resolution}/aligned_%GENE.fasta"
shell:
"""
augur translate \
Expand All @@ -253,11 +320,11 @@ rule translate:
rule traits:
input:
tree = rules.refine.output.tree,
metadata = rules.filter.input.metadata
metadata = "data/{a_or_b}/metadata.tsv"
output:
node_data = build_dir + "/{a_or_b}/{build_name}/traits.json"
node_data = build_dir + "/{a_or_b}/{build_name}/{resolution}/traits.json"
log:
"logs/{a_or_b}/traits_{build_name}_rsv.txt"
"logs/{a_or_b}/traits_{build_name}_{resolution}_rsv.txt"
params:
columns = config["traits"]["columns"],
strain_id=config["strain_id_field"],
Expand Down
Loading

0 comments on commit 61a4a5d

Please sign in to comment.