diff --git a/lib/Bio/EnsEMBL/DataCheck/Checks/BlankNulls.pm b/lib/Bio/EnsEMBL/DataCheck/Checks/BlankNulls.pm index 0475845f..e6d84c49 100644 --- a/lib/Bio/EnsEMBL/DataCheck/Checks/BlankNulls.pm +++ b/lib/Bio/EnsEMBL/DataCheck/Checks/BlankNulls.pm @@ -57,13 +57,14 @@ sub tests { foreach my $nullable (@$nullables) { my ($table, $column) = @$nullable; - - my $desc = "Nullable column $table.$column has no '' or 'NULL' string values"; - my $sql = qq/ - SELECT COUNT(*) FROM $table - WHERE $column = '' OR $column = 'NULL' - /; + if ($table ne "meta"){ + my $desc = "Nullable column $table.$column has no '' or 'NULL' string values"; + my $sql = qq/ + SELECT COUNT(*) FROM $table + WHERE $column = '' OR $column = 'NULL' + /; is_rows_zero($self->dba, $sql, $desc); + } } } diff --git a/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledMetaKeys.pm b/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledMetaKeys.pm index 68491742..6c0d0f4a 100644 --- a/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledMetaKeys.pm +++ b/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledMetaKeys.pm @@ -36,22 +36,27 @@ use constant { sub tests { my ($self) = @_; - my $species_id = $self->dba->species_id; my $group = $self->dba->group; - my $sql = qq/ SELECT meta_key, COUNT(*) FROM meta WHERE species_id = $species_id OR species_id IS NULL GROUP BY meta_key /; + my $helper = $self->dba->dbc->sql_helper; my %meta_keys = %{ $helper->execute_into_hash(-SQL => $sql) }; + #check target site is main / new and select mandatory metakeys + my $filter_metakeys = ''; + if (defined $self->target_site){ + $filter_metakeys = " AND target_site like '\%".$self->target_site."\%' "; + } + my $prod_sql = qq/ SELECT name, is_optional FROM meta_key - WHERE FIND_IN_SET('$group', db_type) AND is_current = 1 + WHERE FIND_IN_SET('$group', db_type) AND is_current = 1 $filter_metakeys /; my $prod_dba = $self->get_dba('multi', 'production'); my $prod_helper = $prod_dba->dbc->sql_helper; diff --git a/lib/Bio/EnsEMBL/DataCheck/Checks/DisplayNameFormat.pm 
b/lib/Bio/EnsEMBL/DataCheck/Checks/DisplayNameFormat.pm index 68a9f667..c9f86fb8 100644 --- a/lib/Bio/EnsEMBL/DataCheck/Checks/DisplayNameFormat.pm +++ b/lib/Bio/EnsEMBL/DataCheck/Checks/DisplayNameFormat.pm @@ -40,7 +40,7 @@ sub tests { my $mca = $self->dba->get_adaptor("MetaContainer"); # Check that the format of the display name conforms to expectations. - my $format = '[A-Za-z0-9\ ]+ \([A-Za-z0-9\(\)\/\-\_,\#\. ]+\) \- GCA_\d+\.\d+(?:\s\[[\w ]+\])?'; + my $format = '[A-Za-z0-9\ ]+ \([A-Za-z0-9\(\)\/\-\_,\#\. ]+\) \- GC[AF]_\d+\.\d+(?:\s\[[\w ]+\])?'; my $desc = "Display name has correct format"; my $display_name = $mca->single_value_by_key('species.display_name'); diff --git a/lib/Bio/EnsEMBL/DataCheck/Checks/MetaKeyFormat.pm b/lib/Bio/EnsEMBL/DataCheck/Checks/MetaKeyFormat.pm index 26df166c..6ea62ebd 100644 --- a/lib/Bio/EnsEMBL/DataCheck/Checks/MetaKeyFormat.pm +++ b/lib/Bio/EnsEMBL/DataCheck/Checks/MetaKeyFormat.pm @@ -44,7 +44,8 @@ sub tests { my %formats = ( 'annotation.provider_url' => '(https?:\/\/.+|www.*\.ensembl\.org)', 'assembly.provider_url' => '(https?:\/\/.+|www.*\.ensembl\.org)', - 'assembly.accession' => 'GCA_\d+\.\d+', + 'assembly.accession' => 'GC[AF]_\d+\.\d+', + 'assembly.alt_accession' => 'GCA_\d+\.\d+', 'assembly.date' => '\d{4}-\d{2}', 'assembly.default' => '[\w\.\-]+', 'genebuild.id' => '\d+', diff --git a/lib/Bio/EnsEMBL/DataCheck/Checks/SpeciesTaxonomy.pm b/lib/Bio/EnsEMBL/DataCheck/Checks/SpeciesTaxonomy.pm index de8745c4..f66abbed 100644 --- a/lib/Bio/EnsEMBL/DataCheck/Checks/SpeciesTaxonomy.pm +++ b/lib/Bio/EnsEMBL/DataCheck/Checks/SpeciesTaxonomy.pm @@ -49,7 +49,7 @@ sub tests { # scientific name, to disambiguate in the case of multiple strains # or assemblies of the same species. Since the taxonomy database does # not always have that information, remove it before comparing. 
- $sci_name =~ s/ \(GCA_\d+\)//; + $sci_name =~ s/ \(GC[AF]_\d+\)//; $sci_name =~ s/ (str\.|strain) .*//; my $desc_1 = 'Species-related meta data exists'; diff --git a/lib/Bio/EnsEMBL/DataCheck/DbCheck.pm b/lib/Bio/EnsEMBL/DataCheck/DbCheck.pm index f959ddae..b4576af7 100755 --- a/lib/Bio/EnsEMBL/DataCheck/DbCheck.pm +++ b/lib/Bio/EnsEMBL/DataCheck/DbCheck.pm @@ -57,6 +57,15 @@ subtype 'Registry', as 'Str', where { =head1 METHODS +=head2 target_site + Description: Fetch mandatory meta keys based on target site + current values are main/new +=cut +has 'target_site' => ( + is => 'ro', + isa => 'Str | Undef', +); + =head2 db_types Description: Database types for which this datacheck is appropriate. =cut diff --git a/lib/Bio/EnsEMBL/DataCheck/Pipeline/DataCheckSubmission.pm b/lib/Bio/EnsEMBL/DataCheck/Pipeline/DataCheckSubmission.pm index 8f66bc1b..38d60b8c 100644 --- a/lib/Bio/EnsEMBL/DataCheck/Pipeline/DataCheckSubmission.pm +++ b/lib/Bio/EnsEMBL/DataCheck/Pipeline/DataCheckSubmission.pm @@ -122,6 +122,7 @@ sub write_output { json_by_species => $self->param('json_by_species'), submission_job_id => $self->input_job->dbID, + target_site => $self->param('target_site'), }; $self->dataflow_output_id($params, 1); diff --git a/lib/Bio/EnsEMBL/DataCheck/Pipeline/DbDataChecks_conf.pm b/lib/Bio/EnsEMBL/DataCheck/Pipeline/DbDataChecks_conf.pm index 782c1768..e302dd6a 100644 --- a/lib/Bio/EnsEMBL/DataCheck/Pipeline/DbDataChecks_conf.pm +++ b/lib/Bio/EnsEMBL/DataCheck/Pipeline/DbDataChecks_conf.pm @@ -32,57 +32,58 @@ use Bio::EnsEMBL::Hive::PipeConfig::HiveGeneric_conf; use Bio::EnsEMBL::Hive::Version 2.5; sub default_options { - my ($self) = @_; - return { - %{$self->SUPER::default_options}, - - pipeline_name => 'db_datachecks', - - species => [], - taxons => [], - division => [], - run_all => 0, - antispecies => [], - antitaxons => [], - meta_filters => {}, - dbname => [], - db_type => 'core', - - datacheck_dir => undef, - index_file => undef, - history_file => undef, - 
output_dir => undef, - config_file => undef, - overwrite_files => 1, - datacheck_names => [], - datacheck_patterns => [], - datacheck_groups => [], - datacheck_types => [], - registry_file => undef, - server_uri => undef, - old_server_uri => undef, - data_file_path => undef, - - failures_fatal => 0, - - parallelize_datachecks => 0, - - tag => undef, - timestamp => undef, - email => undef, - report_per_db => 0, - report_all => 0, - - tap_to_json => 1, - json_passed => 0, - json_by_species => 1, - shout_db_not_found_in_registry => 1, - store_to_es => 0, - es_host => 'es.production.ensembl.org', - es_port => undef, - es_index => 'datacheck_results', - es_log_dir => '/hps/scratch/flicek/ensembl/' . $self->o('ENV', 'USER') . '/datacheck_results_' . $self->o('ENV', 'ENS_VERSION'), - }; + my ($self) = @_; + return { + %{$self->SUPER::default_options}, + + pipeline_name => 'db_datachecks', + + species => [], + taxons => [], + division => [], + run_all => 0, + antispecies => [], + antitaxons => [], + meta_filters => {}, + dbname => [], + db_type => 'core', + + datacheck_dir => undef, + index_file => undef, + history_file => undef, + output_dir => undef, + config_file => undef, + overwrite_files => 1, + datacheck_names => [], + datacheck_patterns => [], + datacheck_groups => [], + datacheck_types => [], + registry_file => undef, + server_uri => undef, + old_server_uri => undef, + data_file_path => undef, + + failures_fatal => 0, + + parallelize_datachecks => 0, + + tag => undef, + timestamp => undef, + email => undef, + report_per_db => 0, + report_all => 0, + + tap_to_json => 1, + json_passed => 0, + json_by_species => 1, + shout_db_not_found_in_registry => 1, + store_to_es => 0, + es_host => 'es.ensembl-production.ebi.ac.uk', + es_port => undef, + es_index => 'datacheck_results_'.$self->o('ENV', 'ENS_VERSION'), + es_log_dir => '/hps/scratch/flicek/ensembl/'.$self->o('ENV', 'USER').'/datacheck_results_'.$self->o('ENV', 'ENS_VERSION'), + target_site => 'main', + }; } # 
Implicit parameter propagation throughout the pipeline. @@ -147,236 +148,238 @@ sub pipeline_create_commands { } sub pipeline_analyses { - my $self = shift @_; - - return [ - { - -logic_name => 'DataCheckSubmission', - -module => 'Bio::EnsEMBL::DataCheck::Pipeline::DataCheckSubmission', - -analysis_capacity => 1, - -max_retry_count => 1, - -parameters => { - species => $self->o('species'), - taxons => $self->o('taxons'), - division => $self->o('division'), - run_all => $self->o('run_all'), - antispecies => $self->o('antispecies'), - antitaxons => $self->o('antitaxons'), - meta_filters => $self->o('meta_filters'), - dbname => $self->o('dbname'), - db_type => $self->o('db_type'), - - datacheck_dir => $self->o('datacheck_dir'), - index_file => $self->o('index_file'), - history_file => $self->o('history_file'), - output_dir => $self->o('output_dir'), - config_file => $self->o('config_file'), - overwrite_files => $self->o('overwrite_files'), - datacheck_names => $self->o('datacheck_names'), - datacheck_patterns => $self->o('datacheck_patterns'), - datacheck_groups => $self->o('datacheck_groups'), - datacheck_types => $self->o('datacheck_types'), - registry_file => $self->o('registry_file'), - server_uri => $self->o('server_uri'), - old_server_uri => $self->o('old_server_uri'), - data_file_path => $self->o('data_file_path'), - - failures_fatal => $self->o('failures_fatal'), - - parallelize_datachecks => $self->o('parallelize_datachecks'), - - tag => $self->o('tag'), - timestamp => $self->o('timestamp'), - email => $self->o('email'), - report_per_db => $self->o('report_per_db'), - report_all => $self->o('report_all'), - - tap_to_json => $self->o('tap_to_json'), - json_passed => $self->o('json_passed'), - json_by_species => $self->o('json_by_species'), - }, - -rc_name => 'default', - -flow_into => { - '1' => [ 'DbFactory' ], - '3' => [ '?table_name=datacheck_submission' ], - }, - }, - - { - -logic_name => 'DbFactory', - -module => 
'Bio::EnsEMBL::Production::Pipeline::Common::DbFactory', - -parameters => { - shout_db_not_found_in_registry => $self->o('shout_db_not_found_in_registry'), - }, - -analysis_capacity => 10, - -max_retry_count => 0, - -flow_into => { - '2->A' => - WHEN('#parallelize_datachecks#' => - [ 'DataCheckFactory' ], - ELSE - [ 'RunDataChecks' ] - ), - 'A->1' => - WHEN('scalar @{#all_dbs#}' => - [ 'DataCheckResults' ] - ), - - }, - -rc_name => 'default', - }, - - { - -logic_name => 'RunDataChecks', - -module => 'Bio::EnsEMBL::DataCheck::Pipeline::RunDataChecks', - -analysis_capacity => 10, - -max_retry_count => 0, - -rc_name => '2GB', - -flow_into => { - '1' => [ 'StoreResults' ], - '-1' => [ 'RunDataChecks_High_mem' ] - }, - }, - - { - -logic_name => 'RunDataChecks_High_mem', - -module => 'Bio::EnsEMBL::DataCheck::Pipeline::RunDataChecks', - -analysis_capacity => 10, - -max_retry_count => 0, - -rc_name => '8GB', - -flow_into => { - '1' => [ 'StoreResults' ], - }, - }, - - { - -logic_name => 'DataCheckFactory', - -module => 'Bio::EnsEMBL::DataCheck::Pipeline::DataCheckFactory', - -analysis_capacity => 10, - -max_retry_count => 0, - -rc_name => 'default', - -flow_into => { - '2->A' => [ 'DataCheckFan' ], - 'A->1' => [ 'DataCheckFunnel' ], - }, - }, - - { - -logic_name => 'DataCheckFan', - -module => 'Bio::EnsEMBL::DataCheck::Pipeline::DataCheckFan', - -analysis_capacity => 100, - -max_retry_count => 0, - -rc_name => '2GB', - -flow_into => { - '1' => [ '?accu_name=results&accu_address=[]' ], - '-1' => [ 'DataCheckFan_High_mem' ] - }, - }, - - { - -logic_name => 'DataCheckFan_High_mem', - -module => 'Bio::EnsEMBL::DataCheck::Pipeline::DataCheckFan', - -analysis_capacity => 100, - -max_retry_count => 0, - -rc_name => '8GB', - -flow_into => { - '1' => [ '?accu_name=results&accu_address=[]' ], - }, - }, - - { - -logic_name => 'DataCheckFunnel', - -module => 'Bio::EnsEMBL::DataCheck::Pipeline::DataCheckFunnel', - -analysis_capacity => 1, - -batch_size => 100, - -max_retry_count => 0, 
- -rc_name => '2GB', - -flow_into => { - '1' => [ 'StoreResults' ], - }, - }, - - { - -logic_name => 'StoreResults', - -module => 'Bio::EnsEMBL::DataCheck::Pipeline::StoreResults', - -analysis_capacity => 10, - -max_retry_count => 1, - -rc_name => 'default', - -flow_into => { - '3' => [ '?table_name=datacheck_results' ], - '4' => [ 'EmailReport' ], - }, - }, - - { - -logic_name => 'EmailReport', - -module => 'Bio::EnsEMBL::DataCheck::Pipeline::EmailReport', - -analysis_capacity => 10, - -batch_size => 100, - -max_retry_count => 0, - -rc_name => 'default', - }, - - { - -logic_name => 'DataCheckResults', - -module => 'Bio::EnsEMBL::Hive::RunnableDB::Dummy', - -max_retry_count => 0, - -parameters => {}, - -rc_name => 'default', - -flow_into => { - '1' => - WHEN('#output_dir# && #tap_to_json#' => - [ 'ConvertTapToJson' ], - ELSE - [ 'DataCheckSummary' ], - ), - }, - }, - - { - -logic_name => 'ConvertTapToJson', - -module => 'Bio::EnsEMBL::DataCheck::Pipeline::ConvertTapToJson', - -analysis_capacity => 10, - -max_retry_count => 0, - -parameters => { - tap => '#output_dir#', - store_to_es => $self->o('store_to_es'), - - }, - -rc_name => 'default', - -flow_into => WHEN('#store_to_es#' => - [ 'DataCheckSummary', 'StoreToES' ], - ELSE - [ 'DataCheckSummary' ] - ), - - }, - - { - -logic_name => 'DataCheckSummary', - -module => 'Bio::EnsEMBL::DataCheck::Pipeline::DataCheckSummary', - -analysis_capacity => 10, - -max_retry_count => 0, - -rc_name => 'default', - -flow_into => [ '?table_name=result' ], - }, - { - -logic_name => 'StoreToES', - -module => 'Bio::EnsEMBL::DataCheck::Pipeline::StoreResultToES', - -analysis_capacity => 1, - -max_retry_count => 3, - -parameters => { - es_host => $self->o('es_host'), - es_port => $self->o('es_port'), - es_index => $self->o('es_index'), - es_log_file => $self->o('es_log_dir') . '/' . $self->o('pipeline_name') . 
'err', - - }, - -rc_name => 'default', - }, - - - ]; + my $self = shift @_; + + return [ + { + -logic_name => 'DataCheckSubmission', + -module => 'Bio::EnsEMBL::DataCheck::Pipeline::DataCheckSubmission', + -analysis_capacity => 1, + -max_retry_count => 1, + -parameters => { + species => $self->o('species'), + taxons => $self->o('taxons'), + division => $self->o('division'), + run_all => $self->o('run_all'), + antispecies => $self->o('antispecies'), + antitaxons => $self->o('antitaxons'), + meta_filters => $self->o('meta_filters'), + dbname => $self->o('dbname'), + db_type => $self->o('db_type'), + + datacheck_dir => $self->o('datacheck_dir'), + index_file => $self->o('index_file'), + history_file => $self->o('history_file'), + output_dir => $self->o('output_dir'), + config_file => $self->o('config_file'), + overwrite_files => $self->o('overwrite_files'), + datacheck_names => $self->o('datacheck_names'), + datacheck_patterns => $self->o('datacheck_patterns'), + datacheck_groups => $self->o('datacheck_groups'), + datacheck_types => $self->o('datacheck_types'), + registry_file => $self->o('registry_file'), + server_uri => $self->o('server_uri'), + old_server_uri => $self->o('old_server_uri'), + data_file_path => $self->o('data_file_path'), + + failures_fatal => $self->o('failures_fatal'), + + parallelize_datachecks => $self->o('parallelize_datachecks'), + + tag => $self->o('tag'), + timestamp => $self->o('timestamp'), + email => $self->o('email'), + report_per_db => $self->o('report_per_db'), + report_all => $self->o('report_all'), + + tap_to_json => $self->o('tap_to_json'), + json_passed => $self->o('json_passed'), + json_by_species => $self->o('json_by_species'), + + target_site => $self->o('target_site'), + }, + -rc_name => 'default', + -flow_into => { + '1' => ['DbFactory'], + '3' => ['?table_name=datacheck_submission'], + }, + }, + + { + -logic_name => 'DbFactory', + -module => 'Bio::EnsEMBL::Production::Pipeline::Common::DbFactory', + -parameters => { + 
shout_db_not_found_in_registry => $self->o('shout_db_not_found_in_registry'), + }, + -analysis_capacity => 10, + -max_retry_count => 0, + -flow_into => { + '2->A' => + WHEN('#parallelize_datachecks#' => + ['DataCheckFactory'], + ELSE + ['RunDataChecks'] + ), + 'A->1' => + WHEN('scalar @{#all_dbs#}' => + ['DataCheckResults'] + ), + + }, + -rc_name => 'default', + }, + + { + -logic_name => 'RunDataChecks', + -module => 'Bio::EnsEMBL::DataCheck::Pipeline::RunDataChecks', + -analysis_capacity => 10, + -max_retry_count => 0, + -rc_name => '2GB', + -flow_into => { + '1' => ['StoreResults'], + '-1' => ['RunDataChecks_High_mem'] + }, + }, + + { + -logic_name => 'RunDataChecks_High_mem', + -module => 'Bio::EnsEMBL::DataCheck::Pipeline::RunDataChecks', + -analysis_capacity => 10, + -max_retry_count => 0, + -rc_name => '8GB', + -flow_into => { + '1' => ['StoreResults'], + }, + }, + + { + -logic_name => 'DataCheckFactory', + -module => 'Bio::EnsEMBL::DataCheck::Pipeline::DataCheckFactory', + -analysis_capacity => 10, + -max_retry_count => 0, + -rc_name => 'default', + -flow_into => { + '2->A' => ['DataCheckFan'], + 'A->1' => ['DataCheckFunnel'], + }, + }, + + { + -logic_name => 'DataCheckFan', + -module => 'Bio::EnsEMBL::DataCheck::Pipeline::DataCheckFan', + -analysis_capacity => 100, + -max_retry_count => 0, + -rc_name => '2GB', + -flow_into => { + '1' => ['?accu_name=results&accu_address=[]'], + '-1' => ['DataCheckFan_High_mem'] + }, + }, + + { + -logic_name => 'DataCheckFan_High_mem', + -module => 'Bio::EnsEMBL::DataCheck::Pipeline::DataCheckFan', + -analysis_capacity => 100, + -max_retry_count => 0, + -rc_name => '8GB', + -flow_into => { + '1' => ['?accu_name=results&accu_address=[]'], + }, + }, + + { + -logic_name => 'DataCheckFunnel', + -module => 'Bio::EnsEMBL::DataCheck::Pipeline::DataCheckFunnel', + -analysis_capacity => 1, + -batch_size => 100, + -max_retry_count => 0, + -rc_name => '2GB', + -flow_into => { + '1' => ['StoreResults'], + }, + }, + + { + -logic_name => 
'StoreResults', + -module => 'Bio::EnsEMBL::DataCheck::Pipeline::StoreResults', + -analysis_capacity => 10, + -max_retry_count => 1, + -rc_name => 'default', + -flow_into => { + '3' => ['?table_name=datacheck_results'], + '4' => ['EmailReport'], + }, + }, + + { + -logic_name => 'EmailReport', + -module => 'Bio::EnsEMBL::DataCheck::Pipeline::EmailReport', + -analysis_capacity => 10, + -batch_size => 100, + -max_retry_count => 0, + -rc_name => 'default', + }, + + { + -logic_name => 'DataCheckResults', + -module => 'Bio::EnsEMBL::Hive::RunnableDB::Dummy', + -max_retry_count => 0, + -parameters => {}, + -rc_name => 'default', + -flow_into => { + '1' => + WHEN('#output_dir# && #tap_to_json#' => + ['ConvertTapToJson'], + ELSE + ['DataCheckSummary'], + ), + }, + }, + + { + -logic_name => 'ConvertTapToJson', + -module => 'Bio::EnsEMBL::DataCheck::Pipeline::ConvertTapToJson', + -analysis_capacity => 10, + -max_retry_count => 0, + -parameters => { + tap => '#output_dir#', + store_to_es => $self->o('store_to_es'), + + }, + -rc_name => 'default', + -flow_into => WHEN('#store_to_es#' => + ['DataCheckSummary', 'StoreToES'], + ELSE + ['DataCheckSummary'] + ), + + }, + + { + -logic_name => 'DataCheckSummary', + -module => 'Bio::EnsEMBL::DataCheck::Pipeline::DataCheckSummary', + -analysis_capacity => 10, + -max_retry_count => 0, + -rc_name => 'default', + -flow_into => ['?table_name=result'], + }, + { + -logic_name => 'StoreToES', + -module => 'Bio::EnsEMBL::DataCheck::Pipeline::StoreResultToES', + -analysis_capacity => 1, + -max_retry_count => 3, + -parameters => { + es_host => $self->o('es_host'), + es_port => $self->o('es_port'), + es_index => $self->o('es_index'), + es_log_file => $self->o('es_log_dir').'/'.$self->o('pipeline_name').'err', + + }, + -rc_name => 'default', + }, + + + ]; } sub resource_classes { diff --git a/lib/Bio/EnsEMBL/DataCheck/Pipeline/RunDataChecks.pm b/lib/Bio/EnsEMBL/DataCheck/Pipeline/RunDataChecks.pm index 8d4f728a..03729c26 100644 --- 
a/lib/Bio/EnsEMBL/DataCheck/Pipeline/RunDataChecks.pm +++ b/lib/Bio/EnsEMBL/DataCheck/Pipeline/RunDataChecks.pm @@ -262,11 +262,13 @@ sub set_datacheck_params { my $server_uri = $self->param('server_uri'); my $old_server_uri = $self->param('old_server_uri'); my $data_file_path = $self->param('data_file_path'); + my $target_site = $self->param('target_site'); $$params{registry_file} = $registry_file if defined $registry_file; $$params{server_uri} = $server_uri if defined $server_uri; $$params{old_server_uri} = $old_server_uri if defined $old_server_uri; $$params{data_file_path} = $data_file_path if defined $data_file_path; + $$params{target_site} = (defined $target_site) ? $target_site: 'main'; } 1; diff --git a/lib/Bio/EnsEMBL/DataCheck/Registry/compara.pl b/lib/Bio/EnsEMBL/DataCheck/Registry/compara.pl new file mode 100644 index 00000000..57daff00 --- /dev/null +++ b/lib/Bio/EnsEMBL/DataCheck/Registry/compara.pl @@ -0,0 +1,18 @@ +use strict; +use warnings; +use Bio::EnsEMBL::MetaData::DBSQL::MetaDataDBAdaptor; +use Bio::EnsEMBL::Production::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Taxonomy::DBSQL::TaxonomyDBAdaptor; +use Bio::EnsEMBL::Registry; + +# Metadata +Bio::EnsEMBL::Registry->load_registry_from_url("$METADATA_URI/$METADATA_DB?species=multi&group=metadata"); +# Production +Bio::EnsEMBL::Registry->load_registry_from_url("$PRODUCTION_URI/$PRODUCTION_DB?species=multi&group=production"); +# Taxonomy +Bio::EnsEMBL::Registry->load_registry_from_url("$TAXONOMY_URI/$TAXONOMY_DB?species=multi&group=taxonomy"); +# DB to checks +# TODO add loop over list of DBs +Bio::EnsEMBL::Registry->load_registry_from_url("$SRC_URI/$DB_NAME?species=$SPECIES&group=compara"); + +1; \ No newline at end of file diff --git a/lib/Bio/EnsEMBL/DataCheck/Registry/core.pl b/lib/Bio/EnsEMBL/DataCheck/Registry/core.pl new file mode 100644 index 00000000..665b3a73 --- /dev/null +++ b/lib/Bio/EnsEMBL/DataCheck/Registry/core.pl @@ -0,0 +1,18 @@ +use strict; +use warnings; +use 
Bio::EnsEMBL::MetaData::DBSQL::MetaDataDBAdaptor; +use Bio::EnsEMBL::Production::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Taxonomy::DBSQL::TaxonomyDBAdaptor; +use Bio::EnsEMBL::Registry; + +# Metadata +Bio::EnsEMBL::Registry->load_registry_from_url("$METADATA_URI/$METADATA_DB?species=multi&group=metadata"); +# Production +Bio::EnsEMBL::Registry->load_registry_from_url("$PRODUCTION_URI/$PRODUCTION_DB?species=multi&group=production"); +# Taxonomy +Bio::EnsEMBL::Registry->load_registry_from_url("$TAXONOMY_URI/$TAXONOMY_DB?species=multi&group=taxonomy"); +# DB to checks +# TODO add loop over list of DBs +Bio::EnsEMBL::Registry->load_registry_from_url("$SRC_URI/$DB_NAME?species=$SPECIES&group=core"); + +1; \ No newline at end of file diff --git a/lib/Bio/EnsEMBL/DataCheck/Registry/funcgen.pl b/lib/Bio/EnsEMBL/DataCheck/Registry/funcgen.pl new file mode 100644 index 00000000..3066ce82 --- /dev/null +++ b/lib/Bio/EnsEMBL/DataCheck/Registry/funcgen.pl @@ -0,0 +1,18 @@ +use strict; +use warnings; +use Bio::EnsEMBL::MetaData::DBSQL::MetaDataDBAdaptor; +use Bio::EnsEMBL::Production::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Taxonomy::DBSQL::TaxonomyDBAdaptor; +use Bio::EnsEMBL::Registry; + +# Metadata +Bio::EnsEMBL::Registry->load_registry_from_url("$METADATA_URI/$METADATA_DB?species=multi&group=metadata"); +# Production +Bio::EnsEMBL::Registry->load_registry_from_url("$PRODUCTION_URI/$PRODUCTION_DB?species=multi&group=production"); +# Taxonomy +Bio::EnsEMBL::Registry->load_registry_from_url("$TAXONOMY_URI/$TAXONOMY_DB?species=multi&group=taxonomy"); +# DB to checks +# TODO add loop over list of DBs +Bio::EnsEMBL::Registry->load_registry_from_url("$SRC_URI/$DB_NAME?species=$SPECIES&group=funcgen"); + +1; \ No newline at end of file diff --git a/lib/Bio/EnsEMBL/DataCheck/Registry/variation.pl b/lib/Bio/EnsEMBL/DataCheck/Registry/variation.pl new file mode 100644 index 00000000..50224f19 --- /dev/null +++ b/lib/Bio/EnsEMBL/DataCheck/Registry/variation.pl @@ -0,0 +1,19 @@ +use strict; 
+use warnings; +use Bio::EnsEMBL::Registry; + +# Metadata +Bio::EnsEMBL::Registry->load_registry_from_url("$METADATA_URI/$METADATA_DB?species=multi&group=metadata"); +# Production +Bio::EnsEMBL::Registry->load_registry_from_url("$PRODUCTION_URI/$PRODUCTION_DB?species=multi&group=production"); +# Taxonomy +Bio::EnsEMBL::Registry->load_registry_from_url("$TAXONOMY_URI/$TAXONOMY_DB?species=multi&group=taxonomy"); +# Regulation (funcgen DB name derived from the variation DB name; plain s/// so the change is kept) +my $funcgen_db=$DB_NAME; +$funcgen_db =~ s/variation/funcgen/; +Bio::EnsEMBL::Registry->load_registry_from_url("$SRC_URI/$funcgen_db?species=$SPECIES&group=funcgen"); +# DB to checks +# TODO add loop over list of DBs +Bio::EnsEMBL::Registry->load_registry_from_url("$SRC_URI/$DB_NAME?species=$SPECIES&group=variation"); + +1; \ No newline at end of file diff --git a/scripts/run_datachecks.pl b/scripts/run_datachecks.pl index 90edaf66..cde3a9db 100755 --- a/scripts/run_datachecks.pl +++ b/scripts/run_datachecks.pl @@ -126,6 +126,11 @@ =head1 OPTIONS Path to a file to store full output in TAP format. If the file already exists, it will be overwritten. +=item B<-ta[rget_site]>
+ +Filter mandatory meta keys by target site (defaults to 'main'). + + =item B<-h[elp]> Print usage information. @@ -154,7 +159,7 @@ =head1 OPTIONS $host, $port, $user, $pass, $dbname, $dbtype, $registry_file, @server_uri, @old_server_uri, $data_file_path, $config_file, @names, @patterns, @groups, @datacheck_types, - $datacheck_dir, $index_file, $history_file, $output_file, + $datacheck_dir, $index_file, $history_file, $output_file, $target_site ); GetOptions( @@ -178,6 +183,7 @@ =head1 OPTIONS "index_file:s", \$index_file, "history_file:s", \$history_file, "output_file:s", \$output_file, + "target_site:s", \$target_site, ); pod2usage(1) if $help; @@ -244,6 +250,9 @@ =head1 OPTIONS die "datacheck_dir is mandatory if index_file is specified"; } +if (! defined $target_site){ + $target_site = 'main'; +} # If datacheck parameters have been specified as comma-separated strings, # convert them into arrays. @names = map { split(/[,\s]+/, $_) } @names if scalar @names; @@ -270,6 +279,7 @@ =head1 OPTIONS $datacheck_params{server_uri} = \@server_uri if scalar @server_uri; $datacheck_params{old_server_uri} = \@old_server_uri if scalar @old_server_uri; $datacheck_params{data_file_path} = $data_file_path if defined $data_file_path; +$datacheck_params{target_site} = $target_site if defined $target_site; my $manager = Bio::EnsEMBL::DataCheck::Manager->new(%manager_params); diff --git a/scripts/run_pipeline.pl b/scripts/run_pipeline.pl index dcc5b2d5..04866ffd 100644 --- a/scripts/run_pipeline.pl +++ b/scripts/run_pipeline.pl @@ -207,6 +207,10 @@ =head1 OPTIONS reports if any datachecks fail. This parameter enables reports for databases that pass all datachecks. +=item B<-ta[rget_site]>
+ +Filter mandatory meta keys by target site (defaults to 'main'). + =item B<-h[elp]> Print usage information. @@ -240,7 +244,7 @@ =head1 OPTIONS @names, @patterns, @groups, @datacheck_types, $datacheck_dir, $index_file, $history_file, $output_dir, $json_passed, $parallelize_datachecks, - $tag, $email, $report_per_db, $report_all, $es_host, $es_port, $es_index, + $tag, $email, $report_per_db, $report_all, $es_host, $es_port, $es_index, $target_site ); GetOptions( @@ -286,7 +290,8 @@ =head1 OPTIONS "es_host:s", \$es_host, "es_port:s", \$es_port, "es_index:s", \$es_index, - "store_to_es:i", \(my $store_to_es = 0), + "store_to_es:i", \(my $store_to_es = 0), + "target_site:s", \$target_site ); @@ -330,6 +335,10 @@ =head1 OPTIONS die "datacheck_dir is mandatory if index_file is specified"; } +if (! defined $target_site) { + $target_site = 'main'; +} + # If species parameters have been specified as comma-separated strings, # convert them into arrays. @species = map { split(/[,\s]+/, $_) } @species if scalar @species; @@ -414,6 +423,7 @@ =head1 OPTIONS $input_id{es_port} = $es_port if defined $es_port; $input_id{es_index} = $es_index if defined $es_index; $input_id{store_to_es} = $store_to_es; +$input_id{target_site} = $target_site; my $input_id = Dumper(\%input_id); @@ -421,7 +431,7 @@ =head1 OPTIONS "seed_pipeline.pl ". " -url $url". " -logic_name DataCheckSubmission". - " -input_id \"$input_id\""; + " -input_id \"$input_id\""; my $seed_return = system($seed_cmd);