From 95483cfd5a1e9be16b531044b9d509f0fafdf92b Mon Sep 17 00:00:00 2001 From: vinay-ebi Date: Thu, 11 Jan 2024 11:37:56 +0000 Subject: [PATCH 01/15] added new param target_site to datacheck to filter metakeys --- .../DataCheck/Checks/ControlledMetaKeys.pm | 11 ++++++++--- lib/Bio/EnsEMBL/DataCheck/DbCheck.pm | 9 +++++++++ .../EnsEMBL/DataCheck/Pipeline/RunDataChecks.pm | 2 ++ scripts/run_datachecks.pl | 12 +++++++++++- scripts/run_pipeline.pl | 16 +++++++++++++--- 5 files changed, 43 insertions(+), 7 deletions(-) diff --git a/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledMetaKeys.pm b/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledMetaKeys.pm index 75162847..3b1d708d 100644 --- a/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledMetaKeys.pm +++ b/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledMetaKeys.pm @@ -36,22 +36,27 @@ use constant { sub tests { my ($self) = @_; - my $species_id = $self->dba->species_id; my $group = $self->dba->group; - my $sql = qq/ SELECT meta_key, COUNT(*) FROM meta WHERE species_id = $species_id OR species_id IS NULL GROUP BY meta_key /; + my $helper = $self->dba->dbc->sql_helper; my %meta_keys = %{ $helper->execute_into_hash(-SQL => $sql) }; + #check target site is main / new and select mandatory metakeys + my $filter_metakeys = ''; + if (defined $self->target_site){ + $filter_metakeys = " AND target_site like '\%new\%' "; + } + my $prod_sql = qq/ SELECT name, is_optional FROM meta_key - WHERE FIND_IN_SET('$group', db_type) AND is_current = 1 + WHERE FIND_IN_SET('$group', db_type) AND is_current = 1 $filter_metakeys /; my $prod_dba = $self->get_dba('multi', 'production'); my $prod_helper = $prod_dba->dbc->sql_helper; diff --git a/lib/Bio/EnsEMBL/DataCheck/DbCheck.pm b/lib/Bio/EnsEMBL/DataCheck/DbCheck.pm index ef401f3e..d41b5d36 100755 --- a/lib/Bio/EnsEMBL/DataCheck/DbCheck.pm +++ b/lib/Bio/EnsEMBL/DataCheck/DbCheck.pm @@ -57,6 +57,15 @@ subtype 'Registry', as 'Str', where { =head1 METHODS +=head2 target_site + Description: Fetch mandatory meta keys based 
on target site + current values are main/new +=cut +has 'target_site' => ( + is => 'ro', + isa => 'Str | Undef', +); + =head2 db_types Description: Database types for which this datacheck is appropriate. =cut diff --git a/lib/Bio/EnsEMBL/DataCheck/Pipeline/RunDataChecks.pm b/lib/Bio/EnsEMBL/DataCheck/Pipeline/RunDataChecks.pm index fad4b9a1..18bb922d 100644 --- a/lib/Bio/EnsEMBL/DataCheck/Pipeline/RunDataChecks.pm +++ b/lib/Bio/EnsEMBL/DataCheck/Pipeline/RunDataChecks.pm @@ -262,11 +262,13 @@ sub set_datacheck_params { my $server_uri = $self->param('server_uri'); my $old_server_uri = $self->param('old_server_uri'); my $data_file_path = $self->param('data_file_path'); + my $target_site = $self->param('target_site'); $$params{registry_file} = $registry_file if defined $registry_file; $$params{server_uri} = $server_uri if defined $server_uri; $$params{old_server_uri} = $old_server_uri if defined $old_server_uri; $$params{data_file_path} = $data_file_path if defined $data_file_path; + $$params{target_site} = (defined $target_site) ? $target_site: 'main'; } 1; diff --git a/scripts/run_datachecks.pl b/scripts/run_datachecks.pl index 037c2a07..0e9ad99c 100755 --- a/scripts/run_datachecks.pl +++ b/scripts/run_datachecks.pl @@ -126,6 +126,11 @@ =head1 OPTIONS Path to a file to store full output in TAP format. If the file already exists, it will be overwritten. +=item B<-ta[rget_site]>
+ +Filter mandatory meta keys based on the target site + + =item B<-h[elp]> Print usage information. @@ -154,7 +159,7 @@ =head1 OPTIONS $host, $port, $user, $pass, $dbname, $dbtype, $registry_file, @server_uri, @old_server_uri, $data_file_path, $config_file, @names, @patterns, @groups, @datacheck_types, - $datacheck_dir, $index_file, $history_file, $output_file, + $datacheck_dir, $index_file, $history_file, $output_file,, $target_site ); GetOptions( @@ -178,6 +183,7 @@ =head1 OPTIONS "index_file:s", \$index_file, "history_file:s", \$history_file, "output_file:s", \$output_file, + "target_site:s", \$target_site, ); pod2usage(1) if $help; @@ -244,6 +250,9 @@ =head1 OPTIONS die "datacheck_dir is mandatory if index_file is specified"; } +if (! defined $target_site){ + $target_site = 'main'; +} # If datacheck parameters have been specified as comma-separated strings, # convert them into arrays. @names = map { split(/[,\s]+/, $_) } @names if scalar @names; @@ -270,6 +279,7 @@ =head1 OPTIONS $datacheck_params{server_uri} = \@server_uri if scalar @server_uri; $datacheck_params{old_server_uri} = \@old_server_uri if scalar @old_server_uri; $datacheck_params{data_file_path} = $data_file_path if defined $data_file_path; +$datacheck_params{target_site} = $target_site if defined $target_site; my $manager = Bio::EnsEMBL::DataCheck::Manager->new(%manager_params); diff --git a/scripts/run_pipeline.pl b/scripts/run_pipeline.pl index fa26fc88..f1a0900d 100644 --- a/scripts/run_pipeline.pl +++ b/scripts/run_pipeline.pl @@ -207,6 +207,10 @@ =head1 OPTIONS reports if any datachecks fail. This parameter enables reports for databases that pass all datachecks. +=item B<-ta[rget_site]>
+ +Filter mandatory meta keys based on the target site + =item B<-h[elp]> Print usage information. @@ -240,7 +244,7 @@ =head1 OPTIONS @names, @patterns, @groups, @datacheck_types, $datacheck_dir, $index_file, $history_file, $output_dir, $json_passed, $parallelize_datachecks, - $tag, $email, $report_per_db, $report_all, $es_host, $es_port, $es_index, + $tag, $email, $report_per_db, $report_all, $es_host, $es_port, $es_index, $target_site ); GetOptions( @@ -286,7 +290,8 @@ =head1 OPTIONS "es_host:s", \$es_host, "es_port:s", \$es_port, "es_index:s", \$es_index, - "store_to_es:i", \(my $store_to_es = 0), + "store_to_es:i", \(my $store_to_es = 0), + "target_site:s", \$target_site ); @@ -330,6 +335,10 @@ =head1 OPTIONS die "datacheck_dir is mandatory if index_file is specified"; } +if (! defined $target_site) { + $target_site = 'main'; +} + # If species parameters have been specified as comma-separated strings, # convert them into arrays. @species = map { split(/[,\s]+/, $_) } @species if scalar @species; @@ -414,6 +423,7 @@ =head1 OPTIONS $input_id{es_port} = $es_port if defined $es_port; $input_id{es_index} = $es_index if defined $es_index; $input_id{store_to_es} = $store_to_es; +$input_id{target_site} = $target_site; my $input_id = Dumper(\%input_id); @@ -421,7 +431,7 @@ =head1 OPTIONS "seed_pipeline.pl ". " -url $url". " -logic_name DataCheckSubmission". 
- " -input_id \"$input_id\""; + " -input_id \"$input_id\""; my $seed_return = system($seed_cmd); From bc32e3a3d54f06249f80e3dfc2b4616a171d50d2 Mon Sep 17 00:00:00 2001 From: vinay-ebi <59567245+vinay-ebi@users.noreply.github.com> Date: Fri, 12 Jan 2024 15:55:07 +0000 Subject: [PATCH 02/15] Update lib/Bio/EnsEMBL/DataCheck/Checks/ControlledMetaKeys.pm Co-authored-by: Marc Chakiachvili --- lib/Bio/EnsEMBL/DataCheck/Checks/ControlledMetaKeys.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledMetaKeys.pm b/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledMetaKeys.pm index 3b1d708d..a7263ac2 100644 --- a/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledMetaKeys.pm +++ b/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledMetaKeys.pm @@ -50,7 +50,7 @@ sub tests { #check target site is main / new and select mandatory metakeys my $filter_metakeys = ''; if (defined $self->target_site){ - $filter_metakeys = " AND target_site like '\%new\%' "; + $filter_metakeys = " AND target_site like '\%".$self->target_site."\%' "; } my $prod_sql = qq/ From 4aac9296bc868473cada149c2340da4bfba05391 Mon Sep 17 00:00:00 2001 From: vinay-ebi Date: Fri, 12 Jan 2024 15:57:08 +0000 Subject: [PATCH 03/15] fix PR changes --- scripts/run_datachecks.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/run_datachecks.pl b/scripts/run_datachecks.pl index 0e9ad99c..f10291b8 100755 --- a/scripts/run_datachecks.pl +++ b/scripts/run_datachecks.pl @@ -159,7 +159,7 @@ =head1 OPTIONS $host, $port, $user, $pass, $dbname, $dbtype, $registry_file, @server_uri, @old_server_uri, $data_file_path, $config_file, @names, @patterns, @groups, @datacheck_types, - $datacheck_dir, $index_file, $history_file, $output_file,, $target_site + $datacheck_dir, $index_file, $history_file, $output_file, $target_site ); GetOptions( From e92e37bbb1f031afd451571e7eceb2723cd1dfae Mon Sep 17 00:00:00 2001 From: Marc Chakiachvili Date: Tue, 6 Feb 2024 21:14:21 +0000 Subject: 
[PATCH 04/15] Update DbDataChecks_conf.pm Updated default es params --- lib/Bio/EnsEMBL/DataCheck/Pipeline/DbDataChecks_conf.pm | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/Bio/EnsEMBL/DataCheck/Pipeline/DbDataChecks_conf.pm b/lib/Bio/EnsEMBL/DataCheck/Pipeline/DbDataChecks_conf.pm index 1cb36b6d..1e779965 100644 --- a/lib/Bio/EnsEMBL/DataCheck/Pipeline/DbDataChecks_conf.pm +++ b/lib/Bio/EnsEMBL/DataCheck/Pipeline/DbDataChecks_conf.pm @@ -78,8 +78,8 @@ sub default_options { json_by_species => 1, shout_db_not_found_in_registry => 1, store_to_es => 0, - es_host => 'es.production.ensembl.org', - es_port => '80', + es_host => 'es.ensembl-production.ebi.ac.uk', + es_port => undef, es_index => 'datacheck_results_'.$self->o('ENV', 'ENS_VERSION'), es_log_dir => '/hps/scratch/flicek/ensembl/'.$self->o('ENV', 'USER').'/datacheck_results_'.$self->o('ENV', 'ENS_VERSION'), }; From 956a22ddb70678311dc6cd160fb8dee871fc5f7f Mon Sep 17 00:00:00 2001 From: vinay-ebi Date: Thu, 11 Jan 2024 11:37:56 +0000 Subject: [PATCH 05/15] added new param target_site to datacheck to filter metakeys --- .../DataCheck/Checks/ControlledMetaKeys.pm | 11 ++++++++--- lib/Bio/EnsEMBL/DataCheck/DbCheck.pm | 9 +++++++++ .../EnsEMBL/DataCheck/Pipeline/RunDataChecks.pm | 2 ++ scripts/run_datachecks.pl | 12 +++++++++++- scripts/run_pipeline.pl | 16 +++++++++++++--- 5 files changed, 43 insertions(+), 7 deletions(-) diff --git a/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledMetaKeys.pm b/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledMetaKeys.pm index 68491742..c6402fc8 100644 --- a/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledMetaKeys.pm +++ b/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledMetaKeys.pm @@ -36,22 +36,27 @@ use constant { sub tests { my ($self) = @_; - my $species_id = $self->dba->species_id; my $group = $self->dba->group; - my $sql = qq/ SELECT meta_key, COUNT(*) FROM meta WHERE species_id = $species_id OR species_id IS NULL GROUP BY meta_key /; + my $helper = 
$self->dba->dbc->sql_helper; my %meta_keys = %{ $helper->execute_into_hash(-SQL => $sql) }; + #check target site is main / new and select mandatory metakeys + my $filter_metakeys = ''; + if (defined $self->target_site){ + $filter_metakeys = " AND target_site like '\%new\%' "; + } + my $prod_sql = qq/ SELECT name, is_optional FROM meta_key - WHERE FIND_IN_SET('$group', db_type) AND is_current = 1 + WHERE FIND_IN_SET('$group', db_type) AND is_current = 1 $filter_metakeys /; my $prod_dba = $self->get_dba('multi', 'production'); my $prod_helper = $prod_dba->dbc->sql_helper; diff --git a/lib/Bio/EnsEMBL/DataCheck/DbCheck.pm b/lib/Bio/EnsEMBL/DataCheck/DbCheck.pm index f959ddae..b4576af7 100755 --- a/lib/Bio/EnsEMBL/DataCheck/DbCheck.pm +++ b/lib/Bio/EnsEMBL/DataCheck/DbCheck.pm @@ -57,6 +57,15 @@ subtype 'Registry', as 'Str', where { =head1 METHODS +=head2 target_site + Description: Fetch mandatory meta keys based on target site + current values are main/new +=cut +has 'target_site' => ( + is => 'ro', + isa => 'Str | Undef', +); + =head2 db_types Description: Database types for which this datacheck is appropriate. =cut diff --git a/lib/Bio/EnsEMBL/DataCheck/Pipeline/RunDataChecks.pm b/lib/Bio/EnsEMBL/DataCheck/Pipeline/RunDataChecks.pm index 8d4f728a..03729c26 100644 --- a/lib/Bio/EnsEMBL/DataCheck/Pipeline/RunDataChecks.pm +++ b/lib/Bio/EnsEMBL/DataCheck/Pipeline/RunDataChecks.pm @@ -262,11 +262,13 @@ sub set_datacheck_params { my $server_uri = $self->param('server_uri'); my $old_server_uri = $self->param('old_server_uri'); my $data_file_path = $self->param('data_file_path'); + my $target_site = $self->param('target_site'); $$params{registry_file} = $registry_file if defined $registry_file; $$params{server_uri} = $server_uri if defined $server_uri; $$params{old_server_uri} = $old_server_uri if defined $old_server_uri; $$params{data_file_path} = $data_file_path if defined $data_file_path; + $$params{target_site} = (defined $target_site) ? 
$target_site: 'main'; } 1; diff --git a/scripts/run_datachecks.pl b/scripts/run_datachecks.pl index 90edaf66..2742cc40 100755 --- a/scripts/run_datachecks.pl +++ b/scripts/run_datachecks.pl @@ -126,6 +126,11 @@ =head1 OPTIONS Path to a file to store full output in TAP format. If the file already exists, it will be overwritten. +=item B<-ta[rget_site]>
+ +Filter mandatory meta keys based on the target site + + =item B<-h[elp]> Print usage information. @@ -154,7 +159,7 @@ =head1 OPTIONS $host, $port, $user, $pass, $dbname, $dbtype, $registry_file, @server_uri, @old_server_uri, $data_file_path, $config_file, @names, @patterns, @groups, @datacheck_types, - $datacheck_dir, $index_file, $history_file, $output_file, + $datacheck_dir, $index_file, $history_file, $output_file,, $target_site ); GetOptions( @@ -178,6 +183,7 @@ =head1 OPTIONS "index_file:s", \$index_file, "history_file:s", \$history_file, "output_file:s", \$output_file, + "target_site:s", \$target_site, ); pod2usage(1) if $help; @@ -244,6 +250,9 @@ =head1 OPTIONS die "datacheck_dir is mandatory if index_file is specified"; } +if (! defined $target_site){ + $target_site = 'main'; +} # If datacheck parameters have been specified as comma-separated strings, # convert them into arrays. @names = map { split(/[,\s]+/, $_) } @names if scalar @names; @@ -270,6 +279,7 @@ =head1 OPTIONS $datacheck_params{server_uri} = \@server_uri if scalar @server_uri; $datacheck_params{old_server_uri} = \@old_server_uri if scalar @old_server_uri; $datacheck_params{data_file_path} = $data_file_path if defined $data_file_path; +$datacheck_params{target_site} = $target_site if defined $target_site; my $manager = Bio::EnsEMBL::DataCheck::Manager->new(%manager_params); diff --git a/scripts/run_pipeline.pl b/scripts/run_pipeline.pl index dcc5b2d5..04866ffd 100644 --- a/scripts/run_pipeline.pl +++ b/scripts/run_pipeline.pl @@ -207,6 +207,10 @@ =head1 OPTIONS reports if any datachecks fail. This parameter enables reports for databases that pass all datachecks. +=item B<-ta[rget_site]>
+ +Filter mandatory meta keys based on the target site + =item B<-h[elp]> Print usage information. @@ -240,7 +244,7 @@ =head1 OPTIONS @names, @patterns, @groups, @datacheck_types, $datacheck_dir, $index_file, $history_file, $output_dir, $json_passed, $parallelize_datachecks, - $tag, $email, $report_per_db, $report_all, $es_host, $es_port, $es_index, + $tag, $email, $report_per_db, $report_all, $es_host, $es_port, $es_index, $target_site ); GetOptions( @@ -286,7 +290,8 @@ =head1 OPTIONS "es_host:s", \$es_host, "es_port:s", \$es_port, "es_index:s", \$es_index, - "store_to_es:i", \(my $store_to_es = 0), + "store_to_es:i", \(my $store_to_es = 0), + "target_site:s", \$target_site ); @@ -330,6 +335,10 @@ =head1 OPTIONS die "datacheck_dir is mandatory if index_file is specified"; } +if (! defined $target_site) { + $target_site = 'main'; +} + # If species parameters have been specified as comma-separated strings, # convert them into arrays. @species = map { split(/[,\s]+/, $_) } @species if scalar @species; @@ -414,6 +423,7 @@ =head1 OPTIONS $input_id{es_port} = $es_port if defined $es_port; $input_id{es_index} = $es_index if defined $es_index; $input_id{store_to_es} = $store_to_es; +$input_id{target_site} = $target_site; my $input_id = Dumper(\%input_id); @@ -421,7 +431,7 @@ =head1 OPTIONS "seed_pipeline.pl ". " -url $url". " -logic_name DataCheckSubmission". 
- " -input_id \"$input_id\""; + " -input_id \"$input_id\""; my $seed_return = system($seed_cmd); From 902c42ab52889da3249552ed61a58c117710908b Mon Sep 17 00:00:00 2001 From: vinay-ebi <59567245+vinay-ebi@users.noreply.github.com> Date: Fri, 12 Jan 2024 15:55:07 +0000 Subject: [PATCH 06/15] Update lib/Bio/EnsEMBL/DataCheck/Checks/ControlledMetaKeys.pm Co-authored-by: Marc Chakiachvili --- lib/Bio/EnsEMBL/DataCheck/Checks/ControlledMetaKeys.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledMetaKeys.pm b/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledMetaKeys.pm index c6402fc8..6c0d0f4a 100644 --- a/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledMetaKeys.pm +++ b/lib/Bio/EnsEMBL/DataCheck/Checks/ControlledMetaKeys.pm @@ -50,7 +50,7 @@ sub tests { #check target site is main / new and select mandatory metakeys my $filter_metakeys = ''; if (defined $self->target_site){ - $filter_metakeys = " AND target_site like '\%new\%' "; + $filter_metakeys = " AND target_site like '\%".$self->target_site."\%' "; } my $prod_sql = qq/ From 3cf83e7dd5acf96f2a8a90e8ac9a6d9c496aad29 Mon Sep 17 00:00:00 2001 From: vinay-ebi Date: Fri, 12 Jan 2024 15:57:08 +0000 Subject: [PATCH 07/15] fix PR changes --- scripts/run_datachecks.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/run_datachecks.pl b/scripts/run_datachecks.pl index 2742cc40..cde3a9db 100755 --- a/scripts/run_datachecks.pl +++ b/scripts/run_datachecks.pl @@ -159,7 +159,7 @@ =head1 OPTIONS $host, $port, $user, $pass, $dbname, $dbtype, $registry_file, @server_uri, @old_server_uri, $data_file_path, $config_file, @names, @patterns, @groups, @datacheck_types, - $datacheck_dir, $index_file, $history_file, $output_file,, $target_site + $datacheck_dir, $index_file, $history_file, $output_file, $target_site ); GetOptions( From 2954a72769071a3156ac3f249cf5e443ccf1dfcd Mon Sep 17 00:00:00 2001 From: Marc Chakiachvili Date: Tue, 6 Feb 2024 21:14:21 +0000 Subject: 
[PATCH 08/15] Update DbDataChecks_conf.pm Updated default es params --- .../DataCheck/Pipeline/DbDataChecks_conf.pm | 4 ++-- lib/Bio/EnsEMBL/DataCheck/Registry/compara.pl | 18 ++++++++++++++++++ lib/Bio/EnsEMBL/DataCheck/Registry/core.pl | 18 ++++++++++++++++++ lib/Bio/EnsEMBL/DataCheck/Registry/funcgen.pl | 18 ++++++++++++++++++ .../EnsEMBL/DataCheck/Registry/variation.pl | 19 +++++++++++++++++++ 5 files changed, 75 insertions(+), 2 deletions(-) create mode 100644 lib/Bio/EnsEMBL/DataCheck/Registry/compara.pl create mode 100644 lib/Bio/EnsEMBL/DataCheck/Registry/core.pl create mode 100644 lib/Bio/EnsEMBL/DataCheck/Registry/funcgen.pl create mode 100644 lib/Bio/EnsEMBL/DataCheck/Registry/variation.pl diff --git a/lib/Bio/EnsEMBL/DataCheck/Pipeline/DbDataChecks_conf.pm b/lib/Bio/EnsEMBL/DataCheck/Pipeline/DbDataChecks_conf.pm index 782c1768..836021ad 100644 --- a/lib/Bio/EnsEMBL/DataCheck/Pipeline/DbDataChecks_conf.pm +++ b/lib/Bio/EnsEMBL/DataCheck/Pipeline/DbDataChecks_conf.pm @@ -78,9 +78,9 @@ sub default_options { json_by_species => 1, shout_db_not_found_in_registry => 1, store_to_es => 0, - es_host => 'es.production.ensembl.org', + es_host => 'es.ensembl-production.ebi.ac.uk', es_port => undef, - es_index => 'datacheck_results', + es_index => 'datacheck_results_' . $self->o('ENV', 'ENS_VERSION'), es_log_dir => '/hps/scratch/flicek/ensembl/' . $self->o('ENV', 'USER') . '/datacheck_results_' . 
$self->o('ENV', 'ENS_VERSION'), }; } diff --git a/lib/Bio/EnsEMBL/DataCheck/Registry/compara.pl b/lib/Bio/EnsEMBL/DataCheck/Registry/compara.pl new file mode 100644 index 00000000..57daff00 --- /dev/null +++ b/lib/Bio/EnsEMBL/DataCheck/Registry/compara.pl @@ -0,0 +1,18 @@ +use strict; +use warnings; +use Bio::EnsEMBL::MetaData::DBSQL::MetaDataDBAdaptor; +use Bio::EnsEMBL::Production::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Taxonomy::DBSQL::TaxonomyDBAdaptor; +use Bio::EnsEMBL::Registry; + +# Metadata +Bio::EnsEMBL::Registry->load_registry_from_url("$METADATA_URI/$METADATA_DB?species=multi&group=metadata"); +# Production +Bio::EnsEMBL::Registry->load_registry_from_url("$PRODUCTION_URI/$PRODUCTION_DB?species=multi&group=production"); +# Taxonomy +Bio::EnsEMBL::Registry->load_registry_from_url("$TAXONOMY_URI/$TAXONOMY_DB?species=multi&group=taxonomy"); +# DB to checks +# TODO add loop over list of DBs +Bio::EnsEMBL::Registry->load_registry_from_url("$SRC_URI/$DB_NAME?species=$SPECIES&group=compara"); + +1; \ No newline at end of file diff --git a/lib/Bio/EnsEMBL/DataCheck/Registry/core.pl b/lib/Bio/EnsEMBL/DataCheck/Registry/core.pl new file mode 100644 index 00000000..665b3a73 --- /dev/null +++ b/lib/Bio/EnsEMBL/DataCheck/Registry/core.pl @@ -0,0 +1,18 @@ +use strict; +use warnings; +use Bio::EnsEMBL::MetaData::DBSQL::MetaDataDBAdaptor; +use Bio::EnsEMBL::Production::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Taxonomy::DBSQL::TaxonomyDBAdaptor; +use Bio::EnsEMBL::Registry; + +# Metadata +Bio::EnsEMBL::Registry->load_registry_from_url("$METADATA_URI/$METADATA_DB?species=multi&group=metadata"); +# Production +Bio::EnsEMBL::Registry->load_registry_from_url("$PRODUCTION_URI/$PRODUCTION_DB?species=multi&group=production"); +# Taxonomy +Bio::EnsEMBL::Registry->load_registry_from_url("$TAXONOMY_URI/$TAXONOMY_DB?species=multi&group=taxonomy"); +# DB to checks +# TODO add loop over list of DBs 
+Bio::EnsEMBL::Registry->load_registry_from_url("$SRC_URI/$DB_NAME?species=$SPECIES&group=core"); + +1; \ No newline at end of file diff --git a/lib/Bio/EnsEMBL/DataCheck/Registry/funcgen.pl b/lib/Bio/EnsEMBL/DataCheck/Registry/funcgen.pl new file mode 100644 index 00000000..3066ce82 --- /dev/null +++ b/lib/Bio/EnsEMBL/DataCheck/Registry/funcgen.pl @@ -0,0 +1,18 @@ +use strict; +use warnings; +use Bio::EnsEMBL::MetaData::DBSQL::MetaDataDBAdaptor; +use Bio::EnsEMBL::Production::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Taxonomy::DBSQL::TaxonomyDBAdaptor; +use Bio::EnsEMBL::Registry; + +# Metadata +Bio::EnsEMBL::Registry->load_registry_from_url("$METADATA_URI/$METADATA_DB?species=multi&group=metadata"); +# Production +Bio::EnsEMBL::Registry->load_registry_from_url("$PRODUCTION_URI/$PRODUCTION_DB?species=multi&group=production"); +# Taxonomy +Bio::EnsEMBL::Registry->load_registry_from_url("$TAXONOMY_URI/$TAXONOMY_DB?species=multi&group=taxonomy"); +# DB to checks +# TODO add loop over list of DBs +Bio::EnsEMBL::Registry->load_registry_from_url("$SRC_URI/$DB_NAME?species=$SPECIES&group=funcgen"); + +1; \ No newline at end of file diff --git a/lib/Bio/EnsEMBL/DataCheck/Registry/variation.pl b/lib/Bio/EnsEMBL/DataCheck/Registry/variation.pl new file mode 100644 index 00000000..50224f19 --- /dev/null +++ b/lib/Bio/EnsEMBL/DataCheck/Registry/variation.pl @@ -0,0 +1,19 @@ +use strict; +use warnings; +use Bio::EnsEMBL::Registry; + +# Metadata +Bio::EnsEMBL::Registry->load_registry_from_url("$METADATA_URI/$METADATA_DB?species=multi&group=metadata"); +# Production +Bio::EnsEMBL::Registry->load_registry_from_url("$PRODUCTION_URI/$PRODUCTION_DB?species=multi&group=production"); +# Taxonomy +Bio::EnsEMBL::Registry->load_registry_from_url("$TAXONOMY_URI/$TAXONOMY_DB?species=multi&group=taxonomy"); +# Regulation: derive the funcgen DB name from the variation DB name (in-place s///; with /r the result was discarded) +my $funcgen_db=$DB_NAME; +$funcgen_db =~ s/variation/funcgen/; +Bio::EnsEMBL::Registry->load_registry_from_url("$SRC_URI/$funcgen_db?species=$SPECIES&group=funcgen"); +# DB to
checks +# TODO add loop over list of DBs +Bio::EnsEMBL::Registry->load_registry_from_url("$SRC_URI/$DB_NAME?species=$SPECIES&group=variation"); + +1; \ No newline at end of file From 603c24b38c6810e3903d5bcfd27a3796bee43a32 Mon Sep 17 00:00:00 2001 From: vinay-ebi Date: Wed, 7 Feb 2024 07:20:48 +0000 Subject: [PATCH 09/15] add target_site param to DbDataChecks_conf --- lib/Bio/EnsEMBL/DataCheck/Pipeline/DbDataChecks_conf.pm | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lib/Bio/EnsEMBL/DataCheck/Pipeline/DbDataChecks_conf.pm b/lib/Bio/EnsEMBL/DataCheck/Pipeline/DbDataChecks_conf.pm index 1e779965..ed2829fa 100644 --- a/lib/Bio/EnsEMBL/DataCheck/Pipeline/DbDataChecks_conf.pm +++ b/lib/Bio/EnsEMBL/DataCheck/Pipeline/DbDataChecks_conf.pm @@ -82,6 +82,7 @@ sub default_options { es_port => undef, es_index => 'datacheck_results_'.$self->o('ENV', 'ENS_VERSION'), es_log_dir => '/hps/scratch/flicek/ensembl/'.$self->o('ENV', 'USER').'/datacheck_results_'.$self->o('ENV', 'ENS_VERSION'), + target_site => 'main', }; } @@ -194,6 +195,8 @@ sub pipeline_analyses { tap_to_json => $self->o('tap_to_json'), json_passed => $self->o('json_passed'), json_by_species => $self->o('json_by_species'), + + target_site => $self->o('target_site'), }, -rc_name => 'default', -flow_into => { From 087f32621fb38eae102304b733a630e4d72c7d86 Mon Sep 17 00:00:00 2001 From: vinay-ebi Date: Wed, 7 Feb 2024 08:16:12 +0000 Subject: [PATCH 10/15] add target_site params to dbsubmission module --- lib/Bio/EnsEMBL/DataCheck/Pipeline/DataCheckSubmission.pm | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/Bio/EnsEMBL/DataCheck/Pipeline/DataCheckSubmission.pm b/lib/Bio/EnsEMBL/DataCheck/Pipeline/DataCheckSubmission.pm index ad723bf8..a328bbd2 100644 --- a/lib/Bio/EnsEMBL/DataCheck/Pipeline/DataCheckSubmission.pm +++ b/lib/Bio/EnsEMBL/DataCheck/Pipeline/DataCheckSubmission.pm @@ -122,6 +122,7 @@ sub write_output { json_by_species => $self->param('json_by_species'), submission_job_id => 
$self->input_job->dbID, + target_site => $self->o('target_site'), }; $self->dataflow_output_id($params, 1); From 6310788496babf9863e4c04084691d3a66df60b6 Mon Sep 17 00:00:00 2001 From: vinay-ebi Date: Wed, 7 Feb 2024 08:18:47 +0000 Subject: [PATCH 11/15] add target_site params to dbsubmission module --- lib/Bio/EnsEMBL/DataCheck/Pipeline/DataCheckSubmission.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Bio/EnsEMBL/DataCheck/Pipeline/DataCheckSubmission.pm b/lib/Bio/EnsEMBL/DataCheck/Pipeline/DataCheckSubmission.pm index a328bbd2..0ddf9896 100644 --- a/lib/Bio/EnsEMBL/DataCheck/Pipeline/DataCheckSubmission.pm +++ b/lib/Bio/EnsEMBL/DataCheck/Pipeline/DataCheckSubmission.pm @@ -122,7 +122,7 @@ sub write_output { json_by_species => $self->param('json_by_species'), submission_job_id => $self->input_job->dbID, - target_site => $self->o('target_site'), + target_site => $self->param('target_site'), }; $self->dataflow_output_id($params, 1); From 0799ab2dec3c429ad055af4ad3e807b1a05bf9e3 Mon Sep 17 00:00:00 2001 From: Jorge Alvarez Jarreta Date: Thu, 8 Feb 2024 11:25:33 +0000 Subject: [PATCH 12/15] add support for GCF in beta/rapid --- lib/Bio/EnsEMBL/DataCheck/Checks/DisplayNameFormat.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Bio/EnsEMBL/DataCheck/Checks/DisplayNameFormat.pm b/lib/Bio/EnsEMBL/DataCheck/Checks/DisplayNameFormat.pm index e615a5ec..fd04fb69 100644 --- a/lib/Bio/EnsEMBL/DataCheck/Checks/DisplayNameFormat.pm +++ b/lib/Bio/EnsEMBL/DataCheck/Checks/DisplayNameFormat.pm @@ -40,7 +40,7 @@ sub tests { my $mca = $self->dba->get_adaptor("MetaContainer"); # Check that the format of the display name conforms to expectations. - my $format = '[A-Za-z0-9\ ]+ \([A-Za-z0-9\(\)\/\-\_,\#\. ]+\) \- GCA_\d+\.\d+(?:\s\[[\w ]+\])?'; + my $format = '[A-Za-z0-9\ ]+ \([A-Za-z0-9\(\)\/\-\_,\#\. 
]+\) \- GC[AF]_\d+\.\d+(?:\s\[[\w ]+\])?'; my $desc = "Display name has correct format"; my $display_name = $mca->single_value_by_key('species.display_name'); From 7710e50ca23619d5b432588296ca5e84d8cf77b5 Mon Sep 17 00:00:00 2001 From: Jorge Alvarez Jarreta Date: Thu, 8 Feb 2024 14:11:33 +0000 Subject: [PATCH 13/15] accept GCF for accession and add alt_accession meta key --- lib/Bio/EnsEMBL/DataCheck/Checks/MetaKeyFormat.pm | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/Bio/EnsEMBL/DataCheck/Checks/MetaKeyFormat.pm b/lib/Bio/EnsEMBL/DataCheck/Checks/MetaKeyFormat.pm index d77aea32..0205a62e 100644 --- a/lib/Bio/EnsEMBL/DataCheck/Checks/MetaKeyFormat.pm +++ b/lib/Bio/EnsEMBL/DataCheck/Checks/MetaKeyFormat.pm @@ -44,7 +44,8 @@ sub tests { my %formats = ( 'annotation.provider_url' => '(https?:\/\/.+|www.*\.ensembl\.org)', 'assembly.provider_url' => '(https?:\/\/.+|www.*\.ensembl\.org)', - 'assembly.accession' => 'GCA_\d+\.\d+', + 'assembly.accession' => 'GC[AF]_\d+\.\d+', + 'assembly.alt_accession' => 'GCA_\d+\.\d+', 'assembly.date' => '\d{4}-\d{2}', 'assembly.default' => '[\w\.\-]+', 'genebuild.id' => '\d+', From 2c79241cc4f0e44c0f5c7bd9dd57db3f86fcd4ef Mon Sep 17 00:00:00 2001 From: Jorge Alvarez Jarreta Date: Thu, 8 Feb 2024 14:12:08 +0000 Subject: [PATCH 14/15] allow GCFs as well --- lib/Bio/EnsEMBL/DataCheck/Checks/SpeciesTaxonomy.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Bio/EnsEMBL/DataCheck/Checks/SpeciesTaxonomy.pm b/lib/Bio/EnsEMBL/DataCheck/Checks/SpeciesTaxonomy.pm index c97fec5c..ea6172fc 100644 --- a/lib/Bio/EnsEMBL/DataCheck/Checks/SpeciesTaxonomy.pm +++ b/lib/Bio/EnsEMBL/DataCheck/Checks/SpeciesTaxonomy.pm @@ -49,7 +49,7 @@ sub tests { # scientific name, to disambiguate in the case of multiple strains # or assemblies of the same species. Since the taxonomy database does # not always have that information, remove it before comparing. 
- $sci_name =~ s/ \(GCA_\d+\)//; + $sci_name =~ s/ \(GC[AF]_\d+\)//; $sci_name =~ s/ (str\.|strain) .*//; my $desc_1 = 'Species-related meta data exists'; From d53d027fbecf7a59fcfede566cab20bde3c5f7e4 Mon Sep 17 00:00:00 2001 From: Leanne Haggerty Date: Fri, 23 Feb 2024 13:48:56 +0000 Subject: [PATCH 15/15] Skip checking the meta table for blanks --- lib/Bio/EnsEMBL/DataCheck/Checks/BlankNulls.pm | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/lib/Bio/EnsEMBL/DataCheck/Checks/BlankNulls.pm b/lib/Bio/EnsEMBL/DataCheck/Checks/BlankNulls.pm index 3531443a..e3bb5d23 100644 --- a/lib/Bio/EnsEMBL/DataCheck/Checks/BlankNulls.pm +++ b/lib/Bio/EnsEMBL/DataCheck/Checks/BlankNulls.pm @@ -57,13 +57,14 @@ sub tests { foreach my $nullable (@$nullables) { my ($table, $column) = @$nullable; - - my $desc = "Nullable column $table.$column has no '' or 'NULL' string values"; - my $sql = qq/ - SELECT COUNT(*) FROM $table - WHERE $column = '' OR $column = 'NULL' - /; + if ($table ne "meta"){ + my $desc = "Nullable column $table.$column has no '' or 'NULL' string values"; + my $sql = qq/ + SELECT COUNT(*) FROM $table + WHERE $column = '' OR $column = 'NULL' + /; is_rows_zero($self->dba, $sql, $desc); + } } }