From d2405ed42617d31d74224163cecafd60ca80ffb3 Mon Sep 17 00:00:00 2001 From: siebrenf Date: Thu, 12 Jan 2023 13:48:45 +0100 Subject: [PATCH 1/9] p300 --- CHANGELOG.md | 1 + anansnake/envs/ananse.yaml | 4 ++++ anansnake/rules/ananse.smk | 3 ++- example/config.yaml | 1 + 4 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 88bed1b..b04954f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Added +- experimental p300 support - run differential gene expression analysis using `DESeq2` from `Seq2Science` RNA-seq output - run peak enrichment analysis using `gimmemotifs maelstrom` from `Seq2Science` ATAC-seq output - obtain nonhuman transcription factor binding motifs using `gimmemotifs motif2factors` diff --git a/anansnake/envs/ananse.yaml b/anansnake/envs/ananse.yaml index 5ec140d..14c0ca9 100644 --- a/anansnake/envs/ananse.yaml +++ b/anansnake/envs/ananse.yaml @@ -5,3 +5,7 @@ channels: - defaults dependencies: - ananse =0.4.1 + + - pip + - pip: + - git+https://github.com/vanheeringen-lab/ANANSE.git@p300_2 \ No newline at end of file diff --git a/anansnake/rules/ananse.smk b/anansnake/rules/ananse.smk index ecbd8f2..c2a9dd6 100644 --- a/anansnake/rules/ananse.smk +++ b/anansnake/rules/ananse.smk @@ -18,6 +18,7 @@ rule binding: expand("{result_dir}/benchmarks/binding_{{condition}}.txt", **config)[0] params: atac_samples=lambda wildcards: CONDITIONS[wildcards.condition]["ATAC-seq samples"], + enhancer_data=lambda wildcards: "-P" if config.get("enhancer_data") == "p300" else "-A", jaccard=config["jaccard"], threads: 1 # multithreading not required when using a pfmscorefile resources: @@ -35,7 +36,7 @@ rule binding: printf "using columns: {params.atac_samples}\n\n" > {log} ananse binding \ - -A {input.atac} \ + {params.enhancer_data} {input.atac} \ -c {params.atac_samples} \ -g {input.genome} \ -p {input.pfm} \ diff --git a/example/config.yaml b/example/config.yaml index b7ace74..2dfc951 100644 --- a/example/config.yaml +++ b/example/config.yaml @@ -48,6 +48,7 @@ merged_technical_reps: true # set to false if you used 'technical_replicates: " merged_biological_reps: true # set to false if you used 'biological_replicates: "keep" in s2s' # ANANSE binding +enhancer_data: "ATAC" # experimental option "p300" jaccard: 0.2 # default: 0.1 # ANANSE influence From 9704568d8e5f7bf6237a61eb11cebbc27d517e52 Mon Sep 17 00:00:00 2001 From: siebrenf Date: Thu, 12 Jan 2023 13:49:48 +0100 Subject: [PATCH 2/9] nl --- anansnake/envs/ananse.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anansnake/envs/ananse.yaml b/anansnake/envs/ananse.yaml index 14c0ca9..665431d 100644 --- a/anansnake/envs/ananse.yaml +++ b/anansnake/envs/ananse.yaml @@ -8,4 +8,4 @@ dependencies: - pip - pip: - - git+https://github.com/vanheeringen-lab/ANANSE.git@p300_2 \ No newline at end of file + - git+https://github.com/vanheeringen-lab/ANANSE.git@p300_2 From d97b09cedf975fdf89d21675c0acd96bef25fc4d Mon Sep 17 00:00:00 2001 From: siebrenf Date: Thu, 12 Jan 2023 13:48:45 +0100 Subject: [PATCH 3/9] p300 --- CHANGELOG.md | 1 + anansnake/envs/ananse.yaml | 4 ++++ anansnake/rules/ananse.smk | 3 ++- example/config.yaml | 1 + 4 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 88bed1b..b04954f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Added +- experimental p300 support - run differential gene expression analysis using `DESeq2` from `Seq2Science` RNA-seq output - run peak enrichment analysis using `gimmemotifs maelstrom` from `Seq2Science` ATAC-seq output - obtain nonhuman transcription factor binding motifs using `gimmemotifs motif2factors` diff --git a/anansnake/envs/ananse.yaml b/anansnake/envs/ananse.yaml index 5ec140d..14c0ca9 100644 --- a/anansnake/envs/ananse.yaml +++ b/anansnake/envs/ananse.yaml @@ -5,3 +5,7 @@ channels: - defaults dependencies: - ananse =0.4.1 + + - pip + - pip: + - git+https://github.com/vanheeringen-lab/ANANSE.git@p300_2 \ No newline at end of file diff --git a/anansnake/rules/ananse.smk b/anansnake/rules/ananse.smk index ecbd8f2..c2a9dd6 100644 --- a/anansnake/rules/ananse.smk +++ b/anansnake/rules/ananse.smk @@ -18,6 +18,7 @@ rule binding: expand("{result_dir}/benchmarks/binding_{{condition}}.txt", **config)[0] params: atac_samples=lambda wildcards: CONDITIONS[wildcards.condition]["ATAC-seq samples"], + enhancer_data=lambda wildcards: "-P" if config.get("enhancer_data") == "p300" else "-A", jaccard=config["jaccard"], threads: 1 # multithreading not required when using a pfmscorefile resources: @@ -35,7 +36,7 @@ rule binding: printf "using columns: {params.atac_samples}\n\n" > {log} ananse binding \ - -A {input.atac} \ + {params.enhancer_data} {input.atac} \ -c {params.atac_samples} \ -g {input.genome} \ -p {input.pfm} \ diff --git a/example/config.yaml b/example/config.yaml index b7ace74..2dfc951 100644 --- a/example/config.yaml +++ b/example/config.yaml @@ -48,6 +48,7 @@ merged_technical_reps: true # set to false if you used 'technical_replicates: " merged_biological_reps: true # set to false if you used 'biological_replicates: "keep" in s2s' # ANANSE binding +enhancer_data: "ATAC" # experimental option "p300" jaccard: 0.2 # default: 0.1 # ANANSE influence From 56e416de6cf28bb0374a7e874624e460ad519d69 Mon Sep 17 00:00:00 2001 From: siebrenf Date: Thu, 12 Jan 2023 13:49:48 +0100 Subject: [PATCH 4/9] nl --- anansnake/envs/ananse.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anansnake/envs/ananse.yaml b/anansnake/envs/ananse.yaml index 14c0ca9..665431d 100644 --- a/anansnake/envs/ananse.yaml +++ b/anansnake/envs/ananse.yaml @@ -8,4 +8,4 @@ dependencies: - pip - pip: - - git+https://github.com/vanheeringen-lab/ANANSE.git@p300_2 \ No newline at end of file + - git+https://github.com/vanheeringen-lab/ANANSE.git@p300_2 From c23a417627d0e7be64b6a51b85f4535333ba26cf Mon Sep 17 00:00:00 2001 From: siebrenf Date: Fri, 26 May 2023 18:57:04 +0200 Subject: [PATCH 5/9] control filepaths (faster reruns) --- anansnake/Snakefile | 4 ++-- anansnake/__init__.py | 2 +- anansnake/rules/ananse.smk | 28 +++++++++++++-------------- anansnake/rules/configuration.smk | 31 ++++++++++++++++-------------- anansnake/rules/deseq2.smk | 4 ++-- anansnake/rules/gimme.smk | 15 +++++++-------- anansnake/scripts/motif2factors.py | 12 +++++++----- example/config.yaml | 16 ++++++++------- 8 files changed, 59 insertions(+), 53 deletions(-) diff --git a/anansnake/Snakefile b/anansnake/Snakefile index 07095c6..a0d66fc 100644 --- a/anansnake/Snakefile +++ b/anansnake/Snakefile @@ -9,5 +9,5 @@ include: "rules/ananse.smk" rule all: input: - expand("{result_dir}/plot/{contrasts}.{plot_type}", **config), - rules.maelstrom.output, + expand("{plot_dir}/{contrasts}.{plot_type}", **config), + rules.maelstrom.output if config.get("run_maelstrom") else [], diff --git a/anansnake/__init__.py b/anansnake/__init__.py index f102a9c..3dc1f76 100644 --- a/anansnake/__init__.py +++ b/anansnake/__init__.py @@ -1 +1 @@ -__version__ = "0.0.1" +__version__ = "0.1.0" diff --git a/anansnake/rules/ananse.smk b/anansnake/rules/ananse.smk index c2a9dd6..410d0c8 100644 --- a/anansnake/rules/ananse.smk +++ b/anansnake/rules/ananse.smk @@ -11,11 +11,11 @@ rule binding: pfmscorefile=rules.pfmscorefile.output, genome=GENOME, output: - expand("{result_dir}/binding/{{condition}}.h5", ** config), + expand("{binding_dir}/{{condition}}.h5", ** config), log: - expand("{result_dir}/binding/log_{{condition}}.txt", **config), + expand("{log_dir}/binding_{{condition}}.txt", **config), benchmark: - expand("{result_dir}/benchmarks/binding_{{condition}}.txt", **config)[0] + expand("{benchmark_dir}/binding_{{condition}}.txt", **config)[0] params: atac_samples=lambda wildcards: CONDITIONS[wildcards.condition]["ATAC-seq samples"], enhancer_data=lambda wildcards: "-P" if config.get("enhancer_data") == "p300" else "-A", @@ -71,11 +71,11 @@ rule network: genes=config["rna_tpms"], genome=GENOME, output: - expand("{result_dir}/network/{{condition}}.tsv",**config), + expand("{network_dir}/{{condition}}.tsv",**config), log: - expand("{result_dir}/network/log_{{condition}}.txt",**config), + expand("{log_dir}/network_{{condition}}.txt",**config), benchmark: - expand("{result_dir}/benchmarks/network_{{condition}}.txt",**config)[0] + expand("{benchmark_dir}/network_{{condition}}.txt",**config)[0] params: rna_samples=lambda wildcards: CONDITIONS[wildcards.condition]["RNA-seq samples"], threads: 1 # multithreading explodes memory @@ -106,10 +106,10 @@ rule network: def get_source(wildcards): - return f"{config['result_dir']}/network/{CONTRASTS[wildcards.contrast]['source']}.tsv" + return f"{config['network_dir']}/{CONTRASTS[wildcards.contrast]['source']}.tsv" def get_target(wildcards): - return f"{config['result_dir']}/network/{CONTRASTS[wildcards.contrast]['target']}.tsv" + return f"{config['network_dir']}/{CONTRASTS[wildcards.contrast]['target']}.tsv" rule influence: """ @@ -122,12 +122,12 @@ rule influence: degenes=rules.deseq2.output, genome=GENOME, output: - inf = expand("{result_dir}/influence/{{contrast}}.tsv",**config), - diff_inf = expand("{result_dir}/influence/{{contrast}}_diffnetwork.tsv",** config), + inf = expand("{influence_dir}/{{contrast}}.tsv",**config), + diff_inf = expand("{influence_dir}/{{contrast}}_diffnetwork.tsv",** config), log: - expand("{result_dir}/influence/log_{{contrast}}.txt",**config), + expand("{log_dir}/influence_{{contrast}}.txt",** config)[0] benchmark: - expand("{result_dir}/benchmarks/influence_{{contrast}}.txt",**config)[0] + expand("{benchmark_dir}/influence_{{contrast}}.txt",**config)[0] params: edges=config["edges"], padj=config["padj"], @@ -164,9 +164,9 @@ rule plot: inf=rules.influence.output.inf, diff_inf=rules.influence.output.diff_inf, output: - expand("{result_dir}/plot/{{contrast}}.{plot_type}",**config), + expand("{plot_dir}/{{contrast}}.{plot_type}",**config), log: - expand("{result_dir}/plot/log_{{contrast}}.txt",**config), + expand("{log_dir}/plot_{{contrast}}.txt",**config), params: type=config["plot_type"], threads: 1 diff --git a/anansnake/rules/configuration.smk b/anansnake/rules/configuration.smk index 2a6cd8e..1074a84 100644 --- a/anansnake/rules/configuration.smk +++ b/anansnake/rules/configuration.smk @@ -11,25 +11,28 @@ from seq2science.util import dense_samples as parse_samples # NOTE: global variables are written in all caps -# check + fix file paths -for item in ["result_dir", "tmp_dir", "rna_samples", "rna_counts", "rna_tpms", "atac_samples", "atac_counts"]: - if item == "tmp_dir" and "tmp_dir" not in config: - config[item] = None # make it explicit - continue - - path = genomepy.utils.cleanpath(config[item]) - - # create output dir - if item in ["result_dir", "tmp_dir"]: - genomepy.utils.mkdir_p(path) +# check directories +for d in ["result", "gimme", "deseq2", "binding", "network", "influence", "plot", "log", "benchmark"]: + key = f"{d}_dir" + if not config.get(key): + if key == "result_dir": + raise KeyError("A 'result_dir' is required in the config") + else: + config[key] = os.path.join(config["result_dir"], d) + else: + config[key] = genomepy.utils.cleanpath(config[key]) + +# check files +for f in ["rna_samples", "rna_counts", "rna_tpms", "atac_samples", "atac_counts"]: + path = genomepy.utils.cleanpath(config[f]) # all files found? - elif not os.path.exists(path): - logger.error(f"could not find '{item}' in {path}") + if not os.path.exists(path): + logger.error(f"could not find '{f}' in {path}") sys.exit(1) # save absolute path - config[item] = path + config[f] = path # check if the genome is found + add to globals g = genomepy.Genome(config["genome"], config.get("genomes_dir"), build_index=False) diff --git a/anansnake/rules/deseq2.smk b/anansnake/rules/deseq2.smk index 3adf6f5..393ef75 100644 --- a/anansnake/rules/deseq2.smk +++ b/anansnake/rules/deseq2.smk @@ -9,9 +9,9 @@ rule deseq2: rna_samples = config["rna_samples"], genes = config["rna_counts"], output: - expand("{result_dir}/deseq2/{assembly}-{{contrast}}.diffexp.tsv", assembly=ASSEMBLY, **config), + expand("{deseq2_dir}/{assembly}-{{contrast}}.diffexp.tsv", assembly=ASSEMBLY, **config), log: - expand("{result_dir}/deseq2/log_{{contrast}}.txt", **config), + expand("{log_dir}/deseq2_{{contrast}}.txt", **config), resources: deseq2=1 shell: diff --git a/anansnake/rules/gimme.smk b/anansnake/rules/gimme.smk index a47bfd3..561a52c 100644 --- a/anansnake/rules/gimme.smk +++ b/anansnake/rules/gimme.smk @@ -9,15 +9,14 @@ rule motif2factors: input: genome=GENOME, output: - expand("{result_dir}/gimme/{assembly}.{database}.pfm", assembly=ASSEMBLY, **config), + expand("{gimme_dir}/{assembly}.{database}.pfm", assembly=ASSEMBLY, **config), log: - expand("{result_dir}/gimme/log_{assembly}_m2f.txt", assembly=ASSEMBLY, **config), + expand("{log_dir}/gimme_m2f.txt", **config), params: genomes_dir=config.get("genomes_dir"), database=config.get("database", "gimme.vertebrate.v5.0"), get_orthologs=config.get("get_orthologs", True), - tmpdir=config.get("tmp_dir"), - keeptmp=config.get("keep_tmp_data", False), + keep_tmp=config.get("keep_ortholog_data", False), threads: 24 conda: "../envs/gimme.yaml" script: @@ -35,9 +34,9 @@ rule pfmscorefile: pfm=rules.motif2factors.output, genome=GENOME, output: - expand("{result_dir}/gimme/pfmscorefile.tsv", **config), + expand("{gimme_dir}/pfmscorefile.tsv", **config), log: - expand("{result_dir}/gimme/log_{assembly}_pfmscorefile.txt", assembly=ASSEMBLY, **config), + expand("{log_dir}/gimme_pfmscorefile.txt", **config), threads: 12 conda: "../envs/gimme.yaml" shell: @@ -67,9 +66,9 @@ rule maelstrom: pfm=rules.motif2factors.output, genome=GENOME, output: - directory(expand("{result_dir}/gimme/{assembly}-maelstrom", assembly=ASSEMBLY, **config)), + directory(expand("{gimme_dir}/{assembly}-maelstrom", assembly=ASSEMBLY, **config)), log: - expand("{result_dir}/gimme/log_{assembly}_maelstrom.txt", assembly=ASSEMBLY, **config), + expand("{log_dir}/gimme_maelstrom.txt", **config), params: atac_samples=lambda wildcards : sorted({sample for v in CONDITIONS.values() for sample in v['ATAC-seq samples']}), threads: 24 diff --git a/anansnake/scripts/motif2factors.py b/anansnake/scripts/motif2factors.py index d88b072..fb61abe 100644 --- a/anansnake/scripts/motif2factors.py +++ b/anansnake/scripts/motif2factors.py @@ -1,4 +1,5 @@ from contextlib import redirect_stdout, redirect_stderr +from os import makedirs from os.path import dirname, join from shutil import copyfile @@ -25,16 +26,17 @@ # create an ortholog m2f from gimmemotifs.orthologs import motif2factor_from_orthologs - tmpdir = snakemake.params.tmpdir - if tmpdir is not None: - tmpdir = join(tmpdir, "motif2factors") + outdir = dirname(snakemake.output[0]) + tmpdir = None + if snakemake.params.keep_tmp: + tmpdir = join(outdir, "orthofinder") motif2factor_from_orthologs( database=snakemake.params.database, new_reference=[snakemake.input.genome], genomes_dir=snakemake.params.genomes_dir, - outdir=dirname(snakemake.output[0]), + outdir=outdir, tmpdir=tmpdir, - keep_intermediate=snakemake.params.keeptmp, + keep_intermediate=snakemake.params.keep_tmp, threads=snakemake.threads, ) diff --git a/example/config.yaml b/example/config.yaml index 2dfc951..2917846 100644 --- a/example/config.yaml +++ b/example/config.yaml @@ -15,7 +15,7 @@ atac_samples: example/atac_samples.tsv atac_counts: example/GRCz11_raw.tsv genome: GRCz11 # genomepy install GRCz11 --annotation -# genomes_dir: # where to search for genomes (default: $HOME/.local/share/genomes) +genomes_dir: ~/genomes # where to search for genomes (default: $HOME/.local/share/genomes) # output result_dir: example/outdir @@ -26,8 +26,8 @@ result_dir: example/outdir # groups must be present in this column # for RNA-seq, each group must have 2 or more samples (for DESeq2) contrasts: - - "anansnake_group2_group1" - - "anansnake_group1_group2" + - anansnake_group2_group1 + - anansnake_group1_group2 # optional settings @@ -39,8 +39,10 @@ contrasts: # for example: with zebrafish, you can use "database: gimme.vertebrate.v5.0" with "get_orthologs: true". database: gimme.vertebrate.v5.0 # options: https://gimmemotifs.readthedocs.io/en/master/overview.html#motif-databases get_orthologs: true # map the database to the genome's gene annotation, using orthofinder. -tmp_dir: example/outdir/tmp # intermediate orthofinder output location, can be None -keep_tmp_data: true # keep the intermediate orthofinder output? +keep_ortholog_data: true # keep the intermediate orthofinder output? + +# run gimme maelstrom on the enhancer data +run_maelstrom: false # seq2science options use_descriptive_names: true # used descriptive names from the sample.tsv as counts table columns @@ -48,7 +50,7 @@ merged_technical_reps: true # set to false if you used 'technical_replicates: " merged_biological_reps: true # set to false if you used 'biological_replicates: "keep" in s2s' # ANANSE binding -enhancer_data: "ATAC" # experimental option "p300" +enhancer_data: ATAC # experimental option: p300 jaccard: 0.2 # default: 0.1 # ANANSE influence @@ -56,4 +58,4 @@ edges: 500_000 # default: 500_000 padj: 0.05 # ANANSE plot -plot_type: "png" +plot_type: png From 96414dfca76e35b64c26fcfade61e355262ec54d Mon Sep 17 00:00:00 2001 From: siebrenf Date: Fri, 26 May 2023 19:00:33 +0200 Subject: [PATCH 6/9] oopsies --- anansnake/scripts/motif2factors.py | 1 - example/config.yaml | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/anansnake/scripts/motif2factors.py b/anansnake/scripts/motif2factors.py index fb61abe..6d20f6d 100644 --- a/anansnake/scripts/motif2factors.py +++ b/anansnake/scripts/motif2factors.py @@ -1,5 +1,4 @@ from contextlib import redirect_stdout, redirect_stderr -from os import makedirs from os.path import dirname, join from shutil import copyfile diff --git a/example/config.yaml b/example/config.yaml index 2917846..19d2f3a 100644 --- a/example/config.yaml +++ b/example/config.yaml @@ -15,7 +15,7 @@ atac_samples: example/atac_samples.tsv atac_counts: example/GRCz11_raw.tsv genome: GRCz11 # genomepy install GRCz11 --annotation -genomes_dir: ~/genomes # where to search for genomes (default: $HOME/.local/share/genomes) +# genomes_dir: # where to search for genomes (default: $HOME/.local/share/genomes) # output result_dir: example/outdir From ca435eef63d20019adb2495c3d3d065969cc277e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Siebren=20Fr=C3=B6lich?= <48289046+siebrenf@users.noreply.github.com> Date: Thu, 27 Jul 2023 10:14:33 +0200 Subject: [PATCH 7/9] test with improved maelstrom report --- anansnake/envs/gimme.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/anansnake/envs/gimme.yaml b/anansnake/envs/gimme.yaml index b32146e..aeca8d3 100644 --- a/anansnake/envs/gimme.yaml +++ b/anansnake/envs/gimme.yaml @@ -8,3 +8,7 @@ dependencies: - gffread - orthofinder - xgboost # optional maelstrom dependency + + - pip + - pip: + - git+https://github.com/vanheeringen-lab/gimmemotifs@report From 665573e20b1cef76e2a10fe06f987eb546f51d2a Mon Sep 17 00:00:00 2001 From: siebrenf Date: Thu, 27 Jul 2023 10:36:07 +0200 Subject: [PATCH 8/9] fix typo --- anansnake/envs/gimme.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anansnake/envs/gimme.yaml b/anansnake/envs/gimme.yaml index aeca8d3..3e01953 100644 --- a/anansnake/envs/gimme.yaml +++ b/anansnake/envs/gimme.yaml @@ -1,4 +1,4 @@ -name: ananse +name: gimme channels: - conda-forge - bioconda From ef24e67601aa04a8277a10fa2dbdd899ac980a5c Mon Sep 17 00:00:00 2001 From: siebrenf Date: Wed, 22 Nov 2023 17:04:15 +0100 Subject: [PATCH 9/9] update seq2science --- .gitignore | 1 + requirements.yaml | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 6d3ce93..eba22ea 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ **/ananse/ anansenake.egg-info/ build/ +config.yaml dask-worker-space/ example/config.yaml example/outdir/ diff --git a/requirements.yaml b/requirements.yaml index 2147e05..0df8ddd 100644 --- a/requirements.yaml +++ b/requirements.yaml @@ -4,4 +4,4 @@ channels: - bioconda - defaults dependencies: - - seq2science =1.2.0 + - seq2science =1.2.1