Ensembl · MatBarba · Nov 2, 2023 · Oct 26, 2023 · Oct 26, 2023 · Oct 26, 2023
diff --git a/data/test/genome_prepare/inputDir/gen_prep_in_meta.json b/data/test/genome_prepare/inputDir_brc/gen_prep_in_meta.json
diff --git a/data/test/genome_prepare/inputDir_nobrc/gen_prep_in_meta.json b/data/test/genome_prepare/inputDir_nobrc/gen_prep_in_meta.json
@@ -0,0 +1,7 @@
+{
+ "species": {},
+ "assembly": {
+  "accession": "GCA_017607445.1"
+ },
+ "genebuild": {}
+}
diff --git a/pipelines/nextflow/modules/database/db_factory.nf b/pipelines/nextflow/modules/database/db_factory.nf
@@ -25,22 +25,16 @@ process DB_FACTORY {
         path "dbs.json"
 
     script:
+        def brc_mode = params.brc_mode ? '--brc_mode 1' : ''  // def keeps the variable local to this task
+        def dbname_re = filter_map.dbname_re ? "--dbname_re '${filter_map.dbname_re}'" : ''  // single-quoted so regexp metacharacters (|, *, parentheses) are not interpreted by the shell
         """
-        brc_mode=''
-        if [ $filter_map.brc_mode == 1 ]; then
-            brc_mode='--brc_mode 1'
-        fi
-        dbname_re=''
-        if [ -n '${filter_map.dbname_re}' ]; then
-            dbname_re="--dbname_re ${filter_map.dbname_re}"
-        fi
         db_factory --host '${server.host}' \
             --port '${server.port}' \
             --user '${server.user}' \
             --password '${server.password}' \
             --prefix '${filter_map.prefix}' \
-            \$brc_mode \
-            \$dbname_re \
+            $brc_mode \
+            $dbname_re \
             --output_json dbs.json
         """
 }
diff --git a/pipelines/nextflow/modules/events/dump_events.nf b/pipelines/nextflow/modules/events/dump_events.nf
@@ -21,17 +21,12 @@ process DUMP_EVENTS {
     input:
         val server
         val db
-        val filter_map
 
     output:
         tuple val(db), val("events"), path("events.txt")
 
     script:
         """
-        brc_mode=''
-        if [ $filter_map.brc_mode == 1 ]; then
-            brc_mode='--brc_mode 1'
-        fi
         touch "events.txt"
         events_dumper --host '${server.host}' \
             --port '${server.port}' \

diff --git a/pipelines/nextflow/modules/genome_metadata/amend_genome_data.nf b/pipelines/nextflow/modules/genome_metadata/amend_genome_data.nf
@@ -20,7 +20,6 @@ process AMEND_GENOME_DATA {
     input:
         tuple val(meta), path(genome_json, stageAs: "incoming_genome.json"), path(asm_report),
             path(genomic_fna), path(genbank_gbff)
-        val brc4_mode
 
     output:
         tuple val(meta), path ("genome.json"), emit: amended_json
@@ -29,6 +28,6 @@ process AMEND_GENOME_DATA {
         '''
         amend_genomic_data --genome_infile !{genome_json} --genome_outfile genome.json \
             --INSDC_RefSeq_report_infile !{asm_report} --genbank_infile !{genbank_gbff} \
-            --brc4_mode !{brc4_mode}
+            --brc4_mode !{params.brc_mode}
         '''
 }
diff --git a/pipelines/nextflow/modules/genome_metadata/dump_genome_meta.nf b/pipelines/nextflow/modules/genome_metadata/dump_genome_meta.nf
@@ -22,18 +22,13 @@ process DUMP_GENOME_META {
     input:
         val server
         val db
-        val filter_map
 
     output:
         tuple val(db), val("genome"), path("genome.json")
 
     script:
         def output = "genome.json"
         """
-        brc_mode=''
-        if [ $filter_map.brc_mode == 1 ]; then
-            brc_mode='--brc_mode 1'
-        fi
         touch $output
         genome_meta_dumper --host '${server.host}' \
             --port '${server.port}' \

diff --git a/pipelines/nextflow/modules/manifest/check_integrity_db.nf b/pipelines/nextflow/modules/manifest/check_integrity_db.nf
@@ -22,18 +22,13 @@ process CHECK_INTEGRITY {
 
     input:
         tuple val(db), path(manifest_dir)
-        val filter_map
 
     output:
         tuple val(db), path(manifest_dir, includeInputs: true)
 
     script:
+        def brc_mode = params.brc_mode ? '--brc_mode 1' : ''  // def keeps the variable local to this task; empty string when BRC mode is off
         """
-        brc_mode=''
-        if [ $filter_map.brc_mode == 1 ]; then
-            brc_mode='--brc_mode 1'
-        fi
-        check_integrity --manifest_file ${manifest_dir}/manifest.json \
-            \$brc_mode
+        check_integrity --manifest_file ${manifest_dir}/manifest.json $brc_mode
         """
 }
diff --git a/pipelines/nextflow/modules/seq_region/dump_seq_regions.nf b/pipelines/nextflow/modules/seq_region/dump_seq_regions.nf
@@ -20,24 +20,18 @@ process DUMP_SEQ_REGIONS {
     input:
         val server
         val db
-        val filter_map
 
     output:
         tuple val(db), val("seq_region"), path("seq_region.json")
 
     script:
         """
-        brc_mode=''
-        if [ $filter_map.brc_mode == 1 ]; then
-            brc_mode='--brc_mode 1'
-        fi
         touch seq_region.json
         seq_region_dumper --host '${server.host}' \
             --port '${server.port}' \
             --user '${server.user}' \
             --password '${server.password}' \
             --database '${db.database}' \
-            \$brc_mode \
             --output_json seq_region.json
         """
 }
diff --git a/pipelines/nextflow/subworkflows/dump_metadata/main.nf b/pipelines/nextflow/subworkflows/dump_metadata/main.nf
@@ -30,7 +30,6 @@ workflow DUMP_METADATA {
     take:
         server
         db
-        filter_map
 
     emit:
         db
@@ -40,20 +39,20 @@ workflow DUMP_METADATA {
 
         // Seq regions
         if (params.selection.contains("seq_regions")) {
-            seq_regions = DUMP_SEQ_REGIONS(server, db, filter_map)
+            seq_regions = DUMP_SEQ_REGIONS(server, db)
             seq_regions_checked = CHECK_JSON_SCHEMA(seq_regions)
             db_files = db_files.concat(seq_regions_checked)
         }
 
         // Events
         if (params.selection.contains("events")) {
-            events = DUMP_EVENTS(server, db, filter_map)
+            events = DUMP_EVENTS(server, db)
             db_files = db_files.concat(events)
         }
 
         // Genome metadata
         if (params.selection.contains("genome_metadata")) {
-            genome_meta = DUMP_GENOME_META(server, db, filter_map)
+            genome_meta = DUMP_GENOME_META(server, db)
             db_files = db_files.concat(genome_meta)
         }
 
@@ -79,6 +78,6 @@ workflow DUMP_METADATA {
         // Collect, create manifest, and publish
         collect_dir = COLLECT_FILES(db_files)
         manifested_dir = MANIFEST(collect_dir)
-        manifest_checked = CHECK_INTEGRITY(manifested_dir, filter_map)
+        manifest_checked = CHECK_INTEGRITY(manifested_dir)
         PUBLISH_DIR(manifest_checked, params.output_dir)
 }
diff --git a/pipelines/nextflow/subworkflows/dump_sql/main.nf b/pipelines/nextflow/subworkflows/dump_sql/main.nf
@@ -19,7 +19,6 @@ workflow DUMP_SQL {
     take:
         server
         dbs
-        filter_map
 
     emit:
         dbs

diff --git a/pipelines/nextflow/subworkflows/genome_prepare/main.nf b/pipelines/nextflow/subworkflows/genome_prepare/main.nf
@@ -82,7 +82,7 @@ workflow GENOME_PREPARE {
         fasta_dna = PROCESS_FASTA_DNA(download_min, 0).processed_fasta
 
         // Amend genome data find any additional sequence regions
-        amended_genome = AMEND_GENOME_DATA(genome_data_files, params.brc_mode).amended_json
+        amended_genome = AMEND_GENOME_DATA(genome_data_files).amended_json
 
         // Group files
         prepared_files = amended_genome.concat(

diff --git a/pipelines/nextflow/tests/workflows/test_genome_prepare_brc_mode_off.yml b/pipelines/nextflow/tests/workflows/test_genome_prepare_brc_mode_off.yml
@@ -0,0 +1,51 @@
+# See the NOTICE file distributed with this work for additional information
+# regarding copyright ownership.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This test runs the genome_prepare pipeline with BRC mode off (--brc_mode 0) for the INSDC accession GCA_017607445.1 from Genbank
+# It uses a cache file to avoid downloading assembly related data files (--cache_dir)
+# The cached data has been reduced to two seq regions, 1 nuclear and 1 mitochondrial sequence
+- name: genome_prepare_pipeline_brc_mode_off
+  command: nextflow run ./pipelines/nextflow/workflows/genome_prepare/main.nf \\
+    -c ./pipelines/nextflow/tests/workflows/nextflow_test.config \\
+    --input_dir ./data/test/genome_prepare/inputDir_nobrc \\
+    --cache_dir ./data/test/genome_prepare/cache/ \\
+    --output_dir ./test_genome_prepare_output \\
+    --brc_mode 0 \\
+    --ncbi_check 0
+
+  # Check that all the expected files are produced
+  # Make sure to update those if the processing of the files changes!
+  files:
+    - path: ./test_genome_prepare_output/GCA_017607445.1/fasta_dna.fa
+      md5sum: 68d26226b04950883edecd6095d1db1f
+    - path: ./test_genome_prepare_output/GCA_017607445.1/fasta_pep.fa
+      md5sum: d3be87e392cc53ded62987c28952cc3d
+    - path: ./test_genome_prepare_output/GCA_017607445.1/functional_annotation.json
+      md5sum: eb834948fb9363dd71d02cb591848345
+    - path: ./test_genome_prepare_output/GCA_017607445.1/gene_models.gff3
+      md5sum: 3303f5a000173812ba53d01571037b30
+    # Genome contains fields that depend on the date, so can't check md5sum
+    - path: ./test_genome_prepare_output/GCA_017607445.1/genome.json
+      contains:
+        - GCA_017607445.1
+      must_not_contain:
+        - organAbrev123
+        - OrganismDB
+    # Manifest depends on the genome checksum, so also date dependent
+    - path: ./test_genome_prepare_output/GCA_017607445.1/manifest.json
+    - path: ./test_genome_prepare_output/GCA_017607445.1/seq_region.json
+      md5sum: 585da9f97f094e83702860ce43da652f
+    - path: ./test_genome_prepare_output/GCA_017607445.1/stats.txt
+      md5sum: 6104869d437ec4a9d13a2d070c307b0f
diff --git a/pipelines/nextflow/tests/workflows/test_genome_prepare_brc_mode_on.yml b/pipelines/nextflow/tests/workflows/test_genome_prepare_brc_mode_on.yml
@@ -0,0 +1,50 @@
+# See the NOTICE file distributed with this work for additional information
+# regarding copyright ownership.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This test runs the genome_prepare pipeline with BRC mode on (--brc_mode 1) for the INSDC accession GCA_017607445.1 from Genbank
+# It uses a cache file to avoid downloading assembly related data files (--cache_dir)
+# The cached data has been reduced to two seq regions, 1 nuclear and 1 mitochondrial sequence
+- name: genome_prepare_pipeline_brc_mode_on
+  command: nextflow run ./pipelines/nextflow/workflows/genome_prepare/main.nf \\
+    -c ./pipelines/nextflow/tests/workflows/nextflow_test.config \\
+    --input_dir ./data/test/genome_prepare/inputDir_brc \\
+    --cache_dir ./data/test/genome_prepare/cache/ \\
+    --output_dir ./test_genome_prepare_output \\
+    --brc_mode 1 \\
+    --ncbi_check 0
+
+  # Check that all the expected files are produced
+  # Make sure to update those if the processing of the files changes!
+  files:
+    - path: ./test_genome_prepare_output/GCA_017607445.1/fasta_dna.fa
+      md5sum: 68d26226b04950883edecd6095d1db1f
+    - path: ./test_genome_prepare_output/GCA_017607445.1/fasta_pep.fa
+      md5sum: d3be87e392cc53ded62987c28952cc3d
+    - path: ./test_genome_prepare_output/GCA_017607445.1/functional_annotation.json
+      md5sum: eb834948fb9363dd71d02cb591848345
+    - path: ./test_genome_prepare_output/GCA_017607445.1/gene_models.gff3
+      md5sum: 3303f5a000173812ba53d01571037b30
+    # Genome contains fields that depend on the date, so can't check md5sum
+    - path: ./test_genome_prepare_output/GCA_017607445.1/genome.json
+      contains:
+        - GCA_017607445.1
+        - organAbrev123
+        - OrganismDB
+    # Manifest depends on the genome checksum, so also date dependent
+    - path: ./test_genome_prepare_output/GCA_017607445.1/manifest.json
+    - path: ./test_genome_prepare_output/GCA_017607445.1/seq_region.json
+      md5sum: 585da9f97f094e83702860ce43da652f
+    - path: ./test_genome_prepare_output/GCA_017607445.1/stats.txt
+      md5sum: 6104869d437ec4a9d13a2d070c307b0f
diff --git a/pipelines/nextflow/workflows/additional_seq_prepare/main.nf b/pipelines/nextflow/workflows/additional_seq_prepare/main.nf
@@ -15,7 +15,6 @@
 
 //default params
 params.help = false
-params.brc_mode = 1
 
 // mandatory params
 params.accession = null
@@ -38,7 +37,7 @@ def helpMessage() {
         --output_dir                   Output directory to place final output
         --cache_dir                    Cache directory for downloaded files
         --help                         This usage statement.
-        --brc_mode                     By default it is set to 1, set it to 0 if you are not using it for brc
+        --brc_mode                     Set to 1 to use with BRC data (default: ${params.brc_mode})
         """
 }
 
@@ -48,6 +47,10 @@ if (params.help) {
     exit 0
 }
 
+if (params.brc_mode) {  // only a truthy brc_mode is cast; unset/0/empty values are left as-is
+    params.brc_mode = params.brc_mode as Integer  // NOTE(review): Nextflow may ignore script-level reassignment of a param already set via CLI/config - confirm this cast takes effect
+}
+
 assert params.accession, "Parameter 'accession' is not specified"
 assert params.prefix, "Parameter 'prefix' is not specified"
 assert params.production_name, "Parameter 'production_name' is not specified"

diff --git a/pipelines/nextflow/workflows/dumper_pipeline/main.nf b/pipelines/nextflow/workflows/dumper_pipeline/main.nf
@@ -17,7 +17,6 @@
 // default params
 params.help = false
 params.prefix = ''
-params.brc_mode = 0
 params.dbname_re = ''
 params.output_dir = './dumper_output'
 params.password = ''
@@ -34,13 +33,13 @@ default_selection = [
 def helpMessage() {
   log.info """
         Mandatory arguments:
-        --host, --port, --user           Connection parameters to the SQL servers we getting core db(s) from
+        --host, --port, --user         Connection parameters to the SQL servers we getting core db(s) from
 
         Optional arguments:
         --password                     Password part of the connection parameters
         --prefix                       Core dabase(s) name prefixes
         --dbname_re                    Regexp to match core db name(s) against
-        --brc_mode	               Override Ensembl 'species' and 'division' with the corresponding BRC4 ones ('organism_abbrev' and 'component')
+        --brc_mode	               Override Ensembl 'species' and 'division' with the corresponding BRC ones ('organism_abbrev' and 'component')
         --output_dir                   Name of Output directory to gather prepared outfiles. (default: ${params.output_dir})
         --select_dump                  Comma-separated list of items to dump (all by default, or choose among ${default_selection})
         --cache_dir                    Directory where some files are cached (e.g. NCBI stats files)
@@ -66,6 +65,10 @@ if (params.help) {
     exit 0
 }
 
+if (params.brc_mode) {  // only a truthy brc_mode is cast; unset/0/empty values are left as-is
+    params.brc_mode = params.brc_mode as Integer  // NOTE(review): Nextflow may ignore script-level reassignment of a param already set via CLI/config - confirm this cast takes effect
+}
+
 def create_server(params) {
     server = [
         "host": params.host,
@@ -81,13 +84,9 @@ def create_server(params) {
 
 def create_filter_map(params) {
     filter_map = [
-        "brc_mode": 0,
         "prefix": "",
         "dbname_re": ""
     ]
-    if (params.brc_mode) {
-        filter_map["brc_mode"] = 1
-    }
     if (params.prefix) {
         filter_map["prefix"] = params.prefix
     }
@@ -129,7 +128,7 @@ workflow {
         .flatten()
 
     if (params.selection.contains('sql')) {
-        DUMP_SQL(server, dbs, filter_map)
+        DUMP_SQL(server, dbs)
     }
-    DUMP_METADATA(server, dbs, filter_map)
+    DUMP_METADATA(server, dbs)
 }
diff --git a/pipelines/nextflow/workflows/genome_prepare/main.nf b/pipelines/nextflow/workflows/genome_prepare/main.nf
@@ -49,6 +49,10 @@ if (params.input_dir) {
     exit 1, 'Input directory not specified!'
 }
 
+if (params.brc_mode) {  // only a truthy brc_mode is cast; unset/0/empty values are left as-is
+    params.brc_mode = params.brc_mode as Integer  // NOTE(review): Nextflow may ignore script-level reassignment of a param already set via CLI/config - confirm this cast takes effect
+}
+
 // Import subworkflow
 include { GENOME_PREPARE } from '../../subworkflows/genome_prepare/main.nf'
 // Import module

diff --git a/pipelines/nextflow/workflows/genome_prepare/nextflow.config b/pipelines/nextflow/workflows/genome_prepare/nextflow.config
@@ -21,7 +21,6 @@ workDir = "$NXF_WORK/genome_prepare"
 params {
     input_dir = ""
     regions_to_exclude = ""
-    brc_mode = "1"
     output_dir = "Output_GenomePrepare"
     ncbi_check = 1