Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Mbarba/brc mode #180

Merged
merged 12 commits into from
Nov 2, 2023
7 changes: 7 additions & 0 deletions data/test/genome_prepare/inputDir_nobrc/gen_prep_in_meta.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"species": {},
"assembly": {
"accession": "GCA_017607445.1"
},
"genebuild": {}
}
14 changes: 4 additions & 10 deletions pipelines/nextflow/modules/database/db_factory.nf
Original file line number Diff line number Diff line change
Expand Up @@ -25,22 +25,16 @@ process DB_FACTORY {
path "dbs.json"

script:
brc_mode = params.brc_mode ? '--brc_mode 1' : ''
dbname_re = filter_map.dbname_re ? "--dbname_re $filter_map.dbname_re" : ''
"""
brc_mode=''
if [ $filter_map.brc_mode == 1 ]; then
brc_mode='--brc_mode 1'
fi
dbname_re=''
if [ -n '${filter_map.dbname_re}' ]; then
dbname_re="--dbname_re ${filter_map.dbname_re}"
fi
db_factory --host '${server.host}' \
--port '${server.port}' \
--user '${server.user}' \
--password '${server.password}' \
--prefix '${filter_map.prefix}' \
\$brc_mode \
\$dbname_re \
$brc_mode \
$dbname_re \
--output_json dbs.json
"""
}
5 changes: 0 additions & 5 deletions pipelines/nextflow/modules/events/dump_events.nf
Original file line number Diff line number Diff line change
Expand Up @@ -21,17 +21,12 @@ process DUMP_EVENTS {
input:
val server
val db
val filter_map

output:
tuple val(db), val("events"), path("events.txt")

script:
"""
brc_mode=''
if [ $filter_map.brc_mode == 1 ]; then
brc_mode='--brc_mode 1'
fi
touch "events.txt"
events_dumper --host '${server.host}' \
--port '${server.port}' \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ process AMEND_GENOME_DATA {
input:
tuple val(meta), path(genome_json, stageAs: "incoming_genome.json"), path(asm_report),
path(genomic_fna), path(genbank_gbff)
val brc4_mode

output:
tuple val(meta), path ("genome.json"), emit: amended_json
Expand All @@ -29,6 +28,6 @@ process AMEND_GENOME_DATA {
'''
amend_genomic_data --genome_infile !{genome_json} --genome_outfile genome.json \
--INSDC_RefSeq_report_infile !{asm_report} --genbank_infile !{genbank_gbff} \
--brc4_mode !{brc4_mode}
--brc4_mode !{params.brc_mode}
'''
}
Original file line number Diff line number Diff line change
Expand Up @@ -22,18 +22,13 @@ process DUMP_GENOME_META {
input:
val server
val db
val filter_map

output:
tuple val(db), val("genome"), path("genome.json")

script:
def output = "genome.json"
"""
brc_mode=''
if [ $filter_map.brc_mode == 1 ]; then
brc_mode='--brc_mode 1'
fi
touch $output
genome_meta_dumper --host '${server.host}' \
--port '${server.port}' \
Expand Down
9 changes: 2 additions & 7 deletions pipelines/nextflow/modules/manifest/check_integrity_db.nf
Original file line number Diff line number Diff line change
Expand Up @@ -22,18 +22,13 @@ process CHECK_INTEGRITY {

input:
tuple val(db), path(manifest_dir)
val filter_map

output:
tuple val(db), path(manifest_dir, includeInputs: true)

script:
brc_mode = params.brc_mode ? '--brc_mode 1' : ''
"""
brc_mode=''
if [ $filter_map.brc_mode == 1 ]; then
brc_mode='--brc_mode 1'
fi
check_integrity --manifest_file ${manifest_dir}/manifest.json \
\$brc_mode
check_integrity --manifest_file ${manifest_dir}/manifest.json $brc_mode
"""
}
6 changes: 0 additions & 6 deletions pipelines/nextflow/modules/seq_region/dump_seq_regions.nf
Original file line number Diff line number Diff line change
Expand Up @@ -20,24 +20,18 @@ process DUMP_SEQ_REGIONS {
input:
val server
val db
val filter_map

output:
tuple val(db), val("seq_region"), path("seq_region.json")

script:
"""
brc_mode=''
if [ $filter_map.brc_mode == 1 ]; then
brc_mode='--brc_mode 1'
fi
touch seq_region.json
seq_region_dumper --host '${server.host}' \
--port '${server.port}' \
--user '${server.user}' \
--password '${server.password}' \
--database '${db.database}' \
\$brc_mode \
--output_json seq_region.json
"""
}
9 changes: 4 additions & 5 deletions pipelines/nextflow/subworkflows/dump_metadata/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ workflow DUMP_METADATA {
take:
server
db
filter_map

emit:
db
Expand All @@ -40,20 +39,20 @@ workflow DUMP_METADATA {

// Seq regions
if (params.selection.contains("seq_regions")) {
seq_regions = DUMP_SEQ_REGIONS(server, db, filter_map)
seq_regions = DUMP_SEQ_REGIONS(server, db)
seq_regions_checked = CHECK_JSON_SCHEMA(seq_regions)
db_files = db_files.concat(seq_regions_checked)
}

// Events
if (params.selection.contains("events")) {
events = DUMP_EVENTS(server, db, filter_map)
events = DUMP_EVENTS(server, db)
db_files = db_files.concat(events)
}

// Genome metadata
if (params.selection.contains("genome_metadata")) {
genome_meta = DUMP_GENOME_META(server, db, filter_map)
genome_meta = DUMP_GENOME_META(server, db)
db_files = db_files.concat(genome_meta)
}

Expand All @@ -79,6 +78,6 @@ workflow DUMP_METADATA {
// Collect, create manifest, and publish
collect_dir = COLLECT_FILES(db_files)
manifested_dir = MANIFEST(collect_dir)
manifest_checked = CHECK_INTEGRITY(manifested_dir, filter_map)
manifest_checked = CHECK_INTEGRITY(manifested_dir)
PUBLISH_DIR(manifest_checked, params.output_dir)
}
1 change: 0 additions & 1 deletion pipelines/nextflow/subworkflows/dump_sql/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ workflow DUMP_SQL {
take:
server
dbs
filter_map

emit:
dbs
Expand Down
2 changes: 1 addition & 1 deletion pipelines/nextflow/subworkflows/genome_prepare/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ workflow GENOME_PREPARE {
fasta_dna = PROCESS_FASTA_DNA(download_min, 0).processed_fasta

// Amend genome data and find any additional sequence regions
amended_genome = AMEND_GENOME_DATA(genome_data_files, params.brc_mode).amended_json
amended_genome = AMEND_GENOME_DATA(genome_data_files).amended_json

// Group files
prepared_files = amended_genome.concat(
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# See the NOTICE file distributed with this work for additional information
# regarding copyright ownership.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This test runs the genome_prepare pipeline with BRC mode disabled (--brc_mode 0)
# for the INSDC accession GCA_017607445.1 from GenBank.
# It uses a cache directory to avoid downloading assembly-related data files (--cache_dir)
# The cached data has been reduced to two seq regions, 1 nuclear and 1 mitochondrial sequence
- name: genome_prepare_pipeline
command: nextflow run ./pipelines/nextflow/workflows/genome_prepare/main.nf \\
-c ./pipelines/nextflow/tests/workflows/nextflow_test.config \\
--input_dir ./data/test/genome_prepare/inputDir_nobrc \\
--cache_dir ./data/test/genome_prepare/cache/ \\
--output_dir ./test_genome_prepare_output \\
--brc_mode 0 \\
--ncbi_check 0

# Check that all the expected files are produced
# Make sure to update those if the processing of the files changes!
files:
- path: ./test_genome_prepare_output/GCA_017607445.1/fasta_dna.fa
md5sum: 68d26226b04950883edecd6095d1db1f
- path: ./test_genome_prepare_output/GCA_017607445.1/fasta_pep.fa
md5sum: d3be87e392cc53ded62987c28952cc3d
- path: ./test_genome_prepare_output/GCA_017607445.1/functional_annotation.json
md5sum: eb834948fb9363dd71d02cb591848345
- path: ./test_genome_prepare_output/GCA_017607445.1/gene_models.gff3
md5sum: 3303f5a000173812ba53d01571037b30
# Genome contains fields that depend on the date, so can't check md5sum
- path: ./test_genome_prepare_output/GCA_017607445.1/genome.json
contains:
- GCA_017607445.1
must_not_contain:
- organAbrev123
- OrganismDB
# Manifest depends on the genome checksum, so also date dependent
- path: ./test_genome_prepare_output/GCA_017607445.1/manifest.json
- path: ./test_genome_prepare_output/GCA_017607445.1/seq_region.json
md5sum: 585da9f97f094e83702860ce43da652f
- path: ./test_genome_prepare_output/GCA_017607445.1/stats.txt
md5sum: 6104869d437ec4a9d13a2d070c307b0f
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# See the NOTICE file distributed with this work for additional information
# regarding copyright ownership.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This test runs the genome_prepare pipeline with BRC mode enabled (--brc_mode 1)
# for the INSDC accession GCA_017607445.1 from GenBank.
# It uses a cache directory to avoid downloading assembly-related data files (--cache_dir)
# The cached data has been reduced to two seq regions, 1 nuclear and 1 mitochondrial sequence
- name: genome_prepare_pipeline
command: nextflow run ./pipelines/nextflow/workflows/genome_prepare/main.nf \\
-c ./pipelines/nextflow/tests/workflows/nextflow_test.config \\
--input_dir ./data/test/genome_prepare/inputDir_brc \\
--cache_dir ./data/test/genome_prepare/cache/ \\
--output_dir ./test_genome_prepare_output \\
--brc_mode 1 \\
--ncbi_check 0

# Check that all the expected files are produced
# Make sure to update those if the processing of the files changes!
files:
- path: ./test_genome_prepare_output/GCA_017607445.1/fasta_dna.fa
md5sum: 68d26226b04950883edecd6095d1db1f
- path: ./test_genome_prepare_output/GCA_017607445.1/fasta_pep.fa
md5sum: d3be87e392cc53ded62987c28952cc3d
- path: ./test_genome_prepare_output/GCA_017607445.1/functional_annotation.json
md5sum: eb834948fb9363dd71d02cb591848345
- path: ./test_genome_prepare_output/GCA_017607445.1/gene_models.gff3
md5sum: 3303f5a000173812ba53d01571037b30
# Genome contains fields that depend on the date, so can't check md5sum
- path: ./test_genome_prepare_output/GCA_017607445.1/genome.json
contains:
- GCA_017607445.1
- organAbrev123
- OrganismDB
# Manifest depends on the genome checksum, so also date dependent
- path: ./test_genome_prepare_output/GCA_017607445.1/manifest.json
- path: ./test_genome_prepare_output/GCA_017607445.1/seq_region.json
md5sum: 585da9f97f094e83702860ce43da652f
- path: ./test_genome_prepare_output/GCA_017607445.1/stats.txt
md5sum: 6104869d437ec4a9d13a2d070c307b0f
7 changes: 5 additions & 2 deletions pipelines/nextflow/workflows/additional_seq_prepare/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@

//default params
params.help = false
params.brc_mode = 1

// mandatory params
params.accession = null
Expand All @@ -38,7 +37,7 @@ def helpMessage() {
--output_dir Output directory to place final output
--cache_dir Cache directory for downloaded files
--help This usage statement.
--brc_mode By default it is set to 1, set it to 0 if you are not using it for brc
--brc_mode Set to 1 to use with BRC data (default: ${params.brc_mode})
"""
}

Expand All @@ -48,6 +47,10 @@ if (params.help) {
exit 0
}

if (params.brc_mode) {
params.brc_mode = params.brc_mode as Integer
}

assert params.accession, "Parameter 'accession' is not specified"
assert params.prefix, "Parameter 'prefix' is not specified"
assert params.production_name, "Parameter 'production_name' is not specified"
Expand Down
17 changes: 8 additions & 9 deletions pipelines/nextflow/workflows/dumper_pipeline/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
// default params
params.help = false
params.prefix = ''
params.brc_mode = 0
params.dbname_re = ''
params.output_dir = './dumper_output'
params.password = ''
Expand All @@ -34,13 +33,13 @@ default_selection = [
def helpMessage() {
log.info """
Mandatory arguments:
--host, --port, --user Connection parameters to the SQL servers we getting core db(s) from
--host, --port, --user Connection parameters to the SQL servers we getting core db(s) from

Optional arguments:
--password Password part of the connection parameters
--prefix Core dabase(s) name prefixes
--dbname_re Regexp to match core db name(s) against
--brc_mode Override Ensembl 'species' and 'division' with the corresponding BRC4 ones ('organism_abbrev' and 'component')
--brc_mode Override Ensembl 'species' and 'division' with the corresponding BRC ones ('organism_abbrev' and 'component')
--output_dir Name of Output directory to gather prepared outfiles. (default: ${params.output_dir})
--select_dump Comma-separated list of items to dump (all by default, or choose among ${default_selection})
--cache_dir Directory where some files are cached (e.g. NCBI stats files)
Expand All @@ -66,6 +65,10 @@ if (params.help) {
exit 0
}

if (params.brc_mode) {
params.brc_mode = params.brc_mode as Integer
}

def create_server(params) {
server = [
"host": params.host,
Expand All @@ -81,13 +84,9 @@ def create_server(params) {

def create_filter_map(params) {
filter_map = [
"brc_mode": 0,
"prefix": "",
"dbname_re": ""
]
if (params.brc_mode) {
filter_map["brc_mode"] = 1
}
if (params.prefix) {
filter_map["prefix"] = params.prefix
}
Expand Down Expand Up @@ -129,7 +128,7 @@ workflow {
.flatten()

if (params.selection.contains('sql')) {
DUMP_SQL(server, dbs, filter_map)
DUMP_SQL(server, dbs)
}
DUMP_METADATA(server, dbs, filter_map)
DUMP_METADATA(server, dbs)
}
4 changes: 4 additions & 0 deletions pipelines/nextflow/workflows/genome_prepare/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,10 @@ if (params.input_dir) {
exit 1, 'Input directory not specified!'
}

if (params.brc_mode) {
params.brc_mode = params.brc_mode as Integer
}

// Import subworkflow
include { GENOME_PREPARE } from '../../subworkflows/genome_prepare/main.nf'
// Import module
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ workDir = "$NXF_WORK/genome_prepare"
params {
input_dir = ""
regions_to_exclude = ""
brc_mode = "1"
output_dir = "Output_GenomePrepare"
ncbi_check = 1

Expand Down
Loading
Loading