From 8f38d9a20c744a3846087af79521875ed3bfbf38 Mon Sep 17 00:00:00 2001 From: Alex Fernandez Date: Fri, 7 Jun 2024 12:53:50 -0300 Subject: [PATCH 01/22] TG-734: [UI] Create a Cancel button to kill processes --- pyproject.toml | 1 + .../commands/queries/execute_tests_query.py | 2 + testgen/commands/queries/profiling_query.py | 3 + testgen/commands/run_execute_tests.py | 33 ++++++---- testgen/commands/run_profiling_bridge.py | 31 ++++++++-- testgen/common/process_service.py | 62 +++++++++++++++++++ testgen/settings.py | 1 + .../030_initialize_new_schema_structure.sql | 6 +- .../dbsetup/060_create_standard_views.sql | 5 +- .../dbupgrade/0102_incremental_upgrade.sql | 4 ++ .../ex_write_test_record_to_testrun_table.sql | 5 +- .../get_entities/get_test_run_list.sql | 3 +- .../project_profile_run_record_insert.sql | 6 +- ...oject_profile_run_record_update_status.sql | 5 ++ testgen/ui/app.py | 2 +- testgen/ui/queries/test_run_queries.py | 15 +++++ testgen/ui/services/test_run_service.py | 5 ++ testgen/ui/views/profiling_summary.py | 30 ++++++++- testgen/ui/views/test_runs.py | 59 +++++++++++++----- 19 files changed, 233 insertions(+), 45 deletions(-) create mode 100644 testgen/common/process_service.py create mode 100644 testgen/template/dbupgrade/0102_incremental_upgrade.sql create mode 100644 testgen/template/profiling/project_profile_run_record_update_status.sql diff --git a/pyproject.toml b/pyproject.toml index 6611847..a22a420 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,6 +59,7 @@ dependencies = [ "beautifulsoup4==4.12.3", "trino==0.327.0", "xlsxwriter==3.2.0", + "psutil==5.9.8", ] [project.optional-dependencies] diff --git a/testgen/commands/queries/execute_tests_query.py b/testgen/commands/queries/execute_tests_query.py index f38cc43..820f14e 100644 --- a/testgen/commands/queries/execute_tests_query.py +++ b/testgen/commands/queries/execute_tests_query.py @@ -32,6 +32,7 @@ class CTestExecutionSQL: test_suite = "" test_run_id = "" exception_message = "" + 
process_id = "" # Test Set Parameters dctTestParms: typing.ClassVar = {} @@ -58,6 +59,7 @@ def _ReplaceParms(self, strInputString: str): strInputString = strInputString.replace("{MULTI_COLUMN_ERROR_CONDITION}", self.multi_column_error_condition) strInputString = strInputString.replace("{EXCEPTION_MESSAGE}", self.exception_message) strInputString = strInputString.replace("{START_TIME}", self.today) + strInputString = strInputString.replace("{PROCESS_ID}", str(self.process_id)) strInputString = strInputString.replace( "{NOW}", date_service.get_now_as_string_with_offset(self.minutes_offset) ) diff --git a/testgen/commands/queries/profiling_query.py b/testgen/commands/queries/profiling_query.py index fd8b180..b937195 100644 --- a/testgen/commands/queries/profiling_query.py +++ b/testgen/commands/queries/profiling_query.py @@ -46,6 +46,8 @@ class CProfilingSQL: sampling_table = "" sample_ratio = "" + process_id = None + contingency_max_values = "4" contingency_columns = "" @@ -95,6 +97,7 @@ def ReplaceParms(self, strInputString): strInputString = strInputString.replace("{PARM_MAX_PATTERN_LENGTH}", str(self.parm_max_pattern_length)) strInputString = strInputString.replace("{CONTINGENCY_COLUMNS}", self.contingency_columns) strInputString = strInputString.replace("{CONTINGENCY_MAX_VALUES}", self.contingency_max_values) + strInputString = strInputString.replace("{PROCESS_ID}", str(self.process_id)) return strInputString diff --git a/testgen/commands/run_execute_tests.py b/testgen/commands/run_execute_tests.py index 17e054e..f998574 100644 --- a/testgen/commands/run_execute_tests.py +++ b/testgen/commands/run_execute_tests.py @@ -1,7 +1,10 @@ import logging +import subprocess import threading import uuid +import testgen.common.process_service as process_service +from testgen import settings from testgen.commands.queries.execute_tests_query import CTestExecutionSQL from testgen.common import ( AssignConnectParms, @@ -49,6 +52,7 @@ def run_test_queries(strTestRunID, 
strTestTime, strProjectCode, strTestSuite, mi clsExecute = CTestExecutionSQL(strProjectCode, dctParms["sql_flavor"], strTestSuite, minutes_offset) clsExecute.run_date = strTestTime clsExecute.test_run_id = strTestRunID + clsExecute.process_id = process_service.get_current_process_id() booClean = False # Add a record in Test Run table for the new Test Run @@ -109,18 +113,23 @@ def run_test_queries(strTestRunID, strTestTime, strProjectCode, strTestSuite, mi return booErrors, error_msg -def run_execution_steps_in_background(strProjectCode, strTestSuite, minutes_offset=0): - LOG.info(f"Starting run_execution_steps_in_background against test suite: {strTestSuite}") - empty_cache() - background_thread = threading.Thread( - target=run_execution_steps, - args=( - strProjectCode, - strTestSuite, - minutes_offset, - ), - ) - background_thread.start() +def run_execution_steps_in_background(project_code, test_suite): + msg = f"Starting run_execution_steps_in_background against test suite: {test_suite}" + if settings.IS_DEBUG: + LOG.info(msg + ". 
Running in debug mode (new thread instead of new process).") + empty_cache() + background_thread = threading.Thread( + target=run_execution_steps, + args=( + project_code, + test_suite + ), + ) + background_thread.start() + else: + LOG.info(msg) + script = ["testgen", "run-tests", "--project-key", project_code, "--test-suite-key", test_suite] + subprocess.Popen(script) # NOQA S603 def run_execution_steps(strProjectCode, strTestSuite, minutes_offset=0, spinner=None): diff --git a/testgen/commands/run_profiling_bridge.py b/testgen/commands/run_profiling_bridge.py index a303f31..4bfac33 100644 --- a/testgen/commands/run_profiling_bridge.py +++ b/testgen/commands/run_profiling_bridge.py @@ -1,9 +1,12 @@ import logging +import subprocess import threading import uuid import pandas as pd +import testgen.common.process_service as process_service +from testgen import settings from testgen.commands.queries.profiling_query import CProfilingSQL from testgen.common import ( AssignConnectParms, @@ -13,6 +16,8 @@ RunActionQueryList, RunThreadedRetrievalQueryList, WriteListToDB, + date_service, + read_template_sql_file, ) from testgen.common.database.database_service import empty_cache @@ -215,10 +220,16 @@ def RunPairwiseContingencyCheck(clsProfiling, threshold_ratio): def run_profiling_in_background(table_group_id): - LOG.info(f"Starting run_profiling_in_background against table group_id: {table_group_id}") - empty_cache() - background_thread = threading.Thread(target=run_profiling_queries, args=(table_group_id,)) - background_thread.start() + msg = f"Starting run_profiling_in_background against table group_id: {table_group_id}" + if settings.IS_DEBUG: + LOG.info(msg + ". 
Running in debug mode (new thread instead of new process).") + empty_cache() + background_thread = threading.Thread(target=run_profiling_queries, args=(table_group_id,)) + background_thread.start() + else: + LOG.info(msg) + script = ["testgen", "run-profile", "-tg", table_group_id] + subprocess.Popen(script) # NOQA S603 def run_profiling_queries(strTableGroupsID, spinner=None): @@ -274,6 +285,7 @@ def run_profiling_queries(strTableGroupsID, spinner=None): clsProfiling.profile_use_sampling = dctParms["profile_use_sampling"] clsProfiling.profile_sample_percent = dctParms["profile_sample_percent"] clsProfiling.profile_sample_min_count = dctParms["profile_sample_min_count"] + clsProfiling.process_id = process_service.get_current_process_id() # Add a record in profiling_runs table for the new profile strProfileRunQuery = clsProfiling.GetProfileRunInfoRecordsQuery() @@ -465,3 +477,14 @@ def run_profiling_queries(strTableGroupsID, spinner=None): str_error_status = "successfully." message += str_error_status return message + + +def update_profile_run_status(profile_run_id, status): + sql_template = read_template_sql_file("project_profile_run_record_update_status.sql", sub_directory="profiling") + + sql_template = sql_template.replace("{STATUS}", status) + sql_template = sql_template.replace("{NOW}", date_service.get_now_as_string()) + sql_template = sql_template.replace("{EXCEPTION_MESSAGE}", "") + sql_template = sql_template.replace("{PROFILE_RUN_ID}", profile_run_id) + + RunActionQueryList("DKTG", [sql_template]) diff --git a/testgen/common/process_service.py b/testgen/common/process_service.py new file mode 100644 index 0000000..9cfa229 --- /dev/null +++ b/testgen/common/process_service.py @@ -0,0 +1,62 @@ +import logging +import os + +import psutil + +from testgen import settings + +logger = logging.getLogger("testgen.cli") + + +def get_current_process_id(): + return os.getpid() + + +def kill_profile_run(process_id): + keywords = ["run-profile"] + status, message = 
kill_process(process_id, keywords)
+    return status, message
+
+
+def kill_test_run(process_id):
+    keywords = ["run-tests"]
+    status, message = kill_process(process_id, keywords)
+    return status, message
+
+
+def kill_process(process_id, keywords=None):
+    if settings.IS_DEBUG:
+        msg = "Cannot kill processes in debug mode (threads are used instead of new process)"
+        logger.warning(msg)
+        return False, msg
+    try:
+        process = psutil.Process(process_id)
+        if process.name().lower() != "testgen":
+            message = f"The process was not killed because the process_id {process_id} is not a testgen process. Details: {process.name()}"
+            logger.error(f"kill_process: {message}")
+            return False, message
+
+        if keywords:
+            for keyword in keywords:
+                if keyword.lower() not in process.cmdline():
+                    message = f"The process was not killed because the keyword {keyword} was not found. Details: {process.cmdline()}"
+                    logger.error(f"kill_process: {message}")
+                    return False, message
+
+        process.terminate()
+        process.wait(timeout=10)
+        message = f"Process {process_id} has been terminated."
+    except psutil.NoSuchProcess:
+        message = f"No such process with PID {process_id}."
+        logger.exception(f"kill_process: {message}")
+        return False, message
+    except psutil.AccessDenied:
+        message = f"Access denied when trying to terminate process {process_id}."
+        logger.exception(f"kill_process: {message}")
+        return False, message
+    except psutil.TimeoutExpired:
+        message = f"Process {process_id} did not terminate within the timeout period."
+        logger.exception(f"kill_process: {message}")
+        return False, message
+    logger.info(f"kill_process: Success. 
{message}") + return True, message diff --git a/testgen/settings.py b/testgen/settings.py index 5e58209..8eae0f5 100644 --- a/testgen/settings.py +++ b/testgen/settings.py @@ -1,5 +1,6 @@ import os +IS_DEBUG_LOG_LEVEL: bool = os.getenv("TESTGEN_DEBUG_LOG_LEVEL", "no").lower() == "yes" IS_DEBUG: bool = os.getenv("TESTGEN_DEBUG", "no").lower() == "yes" """ When True invalidates the cache with the bootstrapped application diff --git a/testgen/template/dbsetup/030_initialize_new_schema_structure.sql b/testgen/template/dbsetup/030_initialize_new_schema_structure.sql index fbd5c86..8b4ca07 100644 --- a/testgen/template/dbsetup/030_initialize_new_schema_structure.sql +++ b/testgen/template/dbsetup/030_initialize_new_schema_structure.sql @@ -102,7 +102,8 @@ CREATE TABLE profiling_runs ( column_ct BIGINT, anomaly_ct BIGINT, anomaly_table_ct BIGINT, - anomaly_column_ct BIGINT + anomaly_column_ct BIGINT, + process_id INTEGER ); CREATE TABLE test_suites ( @@ -495,7 +496,8 @@ CREATE TABLE test_runs ( table_ct INTEGER, column_ct INTEGER, column_failed_ct INTEGER, - column_warning_ct INTEGER + column_warning_ct INTEGER, + process_id INTEGER ); CREATE TABLE test_results ( diff --git a/testgen/template/dbsetup/060_create_standard_views.sql b/testgen/template/dbsetup/060_create_standard_views.sql index 615d797..5379797 100644 --- a/testgen/template/dbsetup/060_create_standard_views.sql +++ b/testgen/template/dbsetup/060_create_standard_views.sql @@ -70,7 +70,7 @@ SELECT r.id as profiling_run_id, r.log_message, r.table_ct, r.column_ct, - r.anomaly_ct, r.anomaly_table_ct, r.anomaly_column_ct + r.anomaly_ct, r.anomaly_table_ct, r.anomaly_column_ct, process_id FROM profiling_runs r INNER JOIN table_groups tg ON r.table_groups_id = tg.id @@ -94,7 +94,8 @@ SELECT r.id as test_run_id, COUNT(*) as test_ct, SUM(result_code) as passed_ct, COALESCE(SUM(CASE WHEN tr.result_status = 'Failed' THEN 1 END), 0) as failed_ct, - COALESCE(SUM(CASE WHEN tr.result_status = 'Warning' THEN 1 END), 0) as 
warning_ct
+       COALESCE(SUM(CASE WHEN tr.result_status = 'Warning' THEN 1 END), 0) as warning_ct,
+       process_id
 FROM test_runs r
 INNER JOIN projects p
   ON (r.project_code = p.project_code)
diff --git a/testgen/template/dbupgrade/0102_incremental_upgrade.sql b/testgen/template/dbupgrade/0102_incremental_upgrade.sql
new file mode 100644
index 0000000..1f3c1f9
--- /dev/null
+++ b/testgen/template/dbupgrade/0102_incremental_upgrade.sql
@@ -0,0 +1,4 @@
+SET SEARCH_PATH TO {SCHEMA_NAME};
+
+alter table {SCHEMA_NAME}.profiling_runs add column process_id INTEGER;
+alter table {SCHEMA_NAME}.test_runs add column process_id INTEGER;
diff --git a/testgen/template/execution/ex_write_test_record_to_testrun_table.sql b/testgen/template/execution/ex_write_test_record_to_testrun_table.sql
index 929a46c..7505bd0 100644
--- a/testgen/template/execution/ex_write_test_record_to_testrun_table.sql
+++ b/testgen/template/execution/ex_write_test_record_to_testrun_table.sql
@@ -1,5 +1,6 @@
-INSERT INTO test_runs (id, project_code, test_suite, test_starttime)
+INSERT INTO test_runs (id, project_code, test_suite, test_starttime, process_id)
    (SELECT '{TEST_RUN_ID}' :: UUID as id,
            '{PROJECT_CODE}' as project_code,
            '{TEST_SUITE}' as test_suite,
-           '{RUN_DATE}' as test_starttime);
+           '{RUN_DATE}' as test_starttime,
+           '{PROCESS_ID}' as process_id);
diff --git a/testgen/template/get_entities/get_test_run_list.sql b/testgen/template/get_entities/get_test_run_list.sql
index f1430b0..f3ad50c 100644
--- a/testgen/template/get_entities/get_test_run_list.sql
+++ b/testgen/template/get_entities/get_test_run_list.sql
@@ -10,7 +10,8 @@ Select tr.test_suite as test_suite_key,
        COUNT(DISTINCT lower(r.schema_name || '.' 
|| table_name)) as table_ct, COUNT(*) as result_ct, SUM(CASE WHEN r.result_code = 0 THEN 1 END) as fail_ct, - SUM(CASE WHEN r.observability_status = 'Sent' THEN 1 END) as sent_to_obs + SUM(CASE WHEN r.observability_status = 'Sent' THEN 1 END) as sent_to_obs, + process_id from test_runs tr INNER JOIN test_results r ON (tr.id = r.test_run_id) diff --git a/testgen/template/profiling/project_profile_run_record_insert.sql b/testgen/template/profiling/project_profile_run_record_insert.sql index fe22b35..04b902f 100644 --- a/testgen/template/profiling/project_profile_run_record_insert.sql +++ b/testgen/template/profiling/project_profile_run_record_insert.sql @@ -1,6 +1,8 @@ -INSERT INTO profiling_runs (id, project_code, connection_id, table_groups_id, profiling_starttime) +INSERT INTO profiling_runs (id, project_code, connection_id, table_groups_id, profiling_starttime, process_id) (SELECT '{PROFILE_RUN_ID}' :: UUID as id, '{PROJECT_CODE}' as project_code, {CONNECTION_ID} as connection_id, '{TABLE_GROUPS_ID}' :: UUID as table_groups_id, - '{RUN_DATE}' as profiling_starttime); + '{RUN_DATE}' as profiling_starttime, + '{PROCESS_ID}' as process_id + ); diff --git a/testgen/template/profiling/project_profile_run_record_update_status.sql b/testgen/template/profiling/project_profile_run_record_update_status.sql new file mode 100644 index 0000000..8751c03 --- /dev/null +++ b/testgen/template/profiling/project_profile_run_record_update_status.sql @@ -0,0 +1,5 @@ +UPDATE profiling_runs +SET status = '{STATUS}', + profiling_endtime = '{NOW}', + log_message = '{EXCEPTION_MESSAGE}' +where id = '{PROFILE_RUN_ID}' :: UUID; diff --git a/testgen/ui/app.py b/testgen/ui/app.py index 132e257..b6def0c 100644 --- a/testgen/ui/app.py +++ b/testgen/ui/app.py @@ -95,6 +95,6 @@ def set_current_project(project_code: str) -> None: if __name__ == "__main__": log_level = logging.INFO - if settings.IS_DEBUG or "--debug" in sys.argv: + if settings.IS_DEBUG_LOG_LEVEL or "--debug" in sys.argv: log_level 
= logging.DEBUG render(log_level=log_level) diff --git a/testgen/ui/queries/test_run_queries.py b/testgen/ui/queries/test_run_queries.py index d9b4b10..fbc0e60 100644 --- a/testgen/ui/queries/test_run_queries.py +++ b/testgen/ui/queries/test_run_queries.py @@ -1,5 +1,6 @@ import streamlit as st +import testgen.common.date_service as date_service import testgen.ui.services.database_service as db @@ -15,3 +16,17 @@ def cascade_delete(schema: str, test_suite_names: list[str]) -> None: delete from {schema}.execution_queue where test_suite in ({",".join(items)});""" db.execute_sql(sql) st.cache_data.clear() + + +def update_status(schema: str, test_run_id: str, status: str) -> None: + if not all([test_run_id, status]): + raise ValueError("Missing query parameters.") + + now = date_service.get_now_as_string() + + sql = f"""UPDATE {schema}.test_runs +SET status = '{status}', + test_endtime = '{now}' +where id = '{test_run_id}' :: UUID;""" + db.execute_sql(sql) + st.cache_data.clear() diff --git a/testgen/ui/services/test_run_service.py b/testgen/ui/services/test_run_service.py index 6833668..c3b7b2c 100644 --- a/testgen/ui/services/test_run_service.py +++ b/testgen/ui/services/test_run_service.py @@ -6,3 +6,8 @@ def cascade_delete(test_suite_names): schema = st.session_state["dbschema"] test_run_queries.cascade_delete(schema, test_suite_names) + + +def update_status(test_run_id, status): + schema = st.session_state["dbschema"] + test_run_queries.update_status(schema, test_run_id, status) diff --git a/testgen/ui/views/profiling_summary.py b/testgen/ui/views/profiling_summary.py index 8f7d7e3..987acc8 100644 --- a/testgen/ui/views/profiling_summary.py +++ b/testgen/ui/views/profiling_summary.py @@ -2,10 +2,12 @@ import streamlit as st +import testgen.common.process_service as process_service import testgen.ui.services.database_service as db import testgen.ui.services.form_service as fm import testgen.ui.services.query_service as dq import testgen.ui.services.toolbar_service 
as tb +from testgen.commands.run_profiling_bridge import update_profile_run_status from testgen.common import date_service from testgen.ui.navigation.menu import MenuItem from testgen.ui.navigation.page import Page @@ -85,7 +87,7 @@ def get_db_profiling_runs(str_project_code, str_tg=None): END as status, COALESCE(log_message, '(No Errors)') as log_message, table_ct, column_ct, - anomaly_ct, anomaly_table_ct, anomaly_column_ct + anomaly_ct, anomaly_table_ct, anomaly_column_ct, process_id FROM {str_schema}.v_profiling_runs WHERE project_code = '{str_project_code}' {str_tg_condition} ORDER BY start_time DESC; @@ -135,9 +137,9 @@ def open_drill_downs(dct_selected_rows, button_slots): def show_record_detail(dct_selected_row): - layout_column_1, _ = st.columns([0.5, 0.5]) + bottom_left_column, bottom_right_column = st.columns([0.5, 0.5]) - with layout_column_1: + with bottom_left_column: str_header = "Profiling Run Information" lst_columns = [ "connection_name", @@ -151,3 +153,25 @@ def show_record_detail(dct_selected_row): "anomaly_column_ct", ] fm.render_html_list(dct_selected_row, lst_columns, str_header, FORM_DATA_WIDTH) + + with bottom_right_column: + st.write("
<br><br>
", unsafe_allow_html=True) + _, button_column = st.columns([0.3, 0.7]) + with button_column: + enable_kill_button = dct_selected_row and dct_selected_row["process_id"] is not None and dct_selected_row["status"] == "Running" + + if enable_kill_button: + if st.button( + ":red[Cancel Run]", + help="Kill the selected profile run", + use_container_width=True, + disabled=not enable_kill_button, + ): + process_id = dct_selected_row["process_id"] + profile_run_id = dct_selected_row["profiling_run_id"] + status, message = process_service.kill_profile_run(process_id) + + if status: + update_profile_run_status(profile_run_id, "Cancelled") + + fm.reset_post_updates(str_message=f":{'green' if status else 'red'}[{message}]", as_toast=True) diff --git a/testgen/ui/views/test_runs.py b/testgen/ui/views/test_runs.py index ed070a3..0dd9b85 100644 --- a/testgen/ui/views/test_runs.py +++ b/testgen/ui/views/test_runs.py @@ -2,9 +2,11 @@ import streamlit as st +import testgen.common.process_service as process_service import testgen.ui.services.database_service as db import testgen.ui.services.form_service as fm import testgen.ui.services.query_service as dq +import testgen.ui.services.test_run_service as test_run_service import testgen.ui.services.toolbar_service as tb from testgen.common import date_service from testgen.ui.navigation.menu import MenuItem @@ -124,7 +126,7 @@ def get_db_test_runs(str_project_code, str_tg=None, str_ts=None): ROUND(100.0 * (r.column_ct - r.column_failed_ct - r.column_warning_ct)::DECIMAL(12, 4) / r.column_ct::DECIMAL(12, 4), 3) as column_passed_pct, r.id::VARCHAR as test_run_id, p.project_name, - s.table_groups_id::VARCHAR, tg.table_groups_name, tg.table_group_schema + s.table_groups_id::VARCHAR, tg.table_groups_name, tg.table_group_schema, process_id FROM {str_schema}.test_runs r INNER JOIN {str_schema}.projects p ON (r.project_code = p.project_code) @@ -151,18 +153,43 @@ def get_db_test_runs(str_project_code, str_tg=None, str_ts=None): def 
open_record_detail(dct_selected_row): - # Show Run Detail - lst_detail_columns = [ - "test_suite", - "test_suite_description", - "run_date", - "status", - "log_message", - "table_groups_name", - "test_ct", - "passed_ct", - "failed_ct", - "warning_ct", - "error_ct", - ] - fm.render_html_list(dct_selected_row, lst_detail_columns, "Run Information", 500) + bottom_left_column, bottom_right_column = st.columns([0.5, 0.5]) + + with bottom_left_column: + # Show Run Detail + lst_detail_columns = [ + "test_suite", + "test_suite_description", + "run_date", + "status", + "log_message", + "table_groups_name", + "test_ct", + "passed_ct", + "failed_ct", + "warning_ct", + "error_ct", + ] + fm.render_html_list(dct_selected_row, lst_detail_columns, "Run Information", 500) + + with bottom_right_column: + st.write("
<br><br>
", unsafe_allow_html=True) + _, button_column = st.columns([0.3, 0.7]) + with button_column: + enable_kill_button = dct_selected_row and dct_selected_row["process_id"] is not None and dct_selected_row["status"] == "Running" + + if enable_kill_button: + if st.button( + ":red[Cancel Run]", + help="Kill the selected test run", + use_container_width=True, + disabled=not enable_kill_button, + ): + process_id = dct_selected_row["process_id"] + test_run_id = dct_selected_row["test_run_id"] + status, message = process_service.kill_test_run(process_id) + + if status: + test_run_service.update_status(test_run_id, "Cancelled") + + fm.reset_post_updates(str_message=f":{'green' if status else 'red'}[{message}]", as_toast=True) From 9d625736cf1f58784b0db4caf34a71a1234fa0ec Mon Sep 17 00:00:00 2001 From: "Chip.Bloche" Date: Tue, 11 Jun 2024 10:38:45 -0400 Subject: [PATCH 02/22] feat(test types): new multi-table test types, log viewer, and assorted tweaks and fixes New test types: Aggregate Balance, Aggregate Minimum, Combo Match, Timeframe Match, Timeframe Keep, Distribution Shift. Log viewer for troubleshooting from UI. Assorted tweaks and fixes. 
--- .../commands/queries/execute_tests_query.py | 51 +- .../commands/queries/generate_tests_query.py | 2 +- testgen/common/clean_sql.py | 25 +- testgen/common/docker_service.py | 2 +- .../030_initialize_new_schema_structure.sql | 3 + .../050_populate_new_schema_metadata.sql | 591 +++++++++++++++--- .../dbupgrade/0103_incremental_upgrade.sql | 22 + .../ex_finalize_test_run_results.sql | 3 +- .../ex_aggregate_match_no_drops_generic.sql | 59 +- ..._aggregate_match_percent_above_generic.sql | 49 ++ ...aggregate_match_percent_within_generic.sql | 49 ++ .../ex_aggregate_match_same_generic.sql | 42 +- .../ex_custom_query_generic.sql | 4 +- .../ex_data_match_2way_generic.sql | 58 ++ .../ex_data_match_generic.sql | 76 +-- .../ex_relative_entropy_generic.sql | 29 +- .../ex_window_match_no_drops_generic.sql | 73 ++- .../ex_window_match_same_generic.sql | 70 ++- .../generation/gen_delete_old_tests.sql | 14 +- .../gen_standard_test_type_list.sql | 2 +- .../profiling/functional_datatype.sql | 20 +- testgen/ui/bootstrap.py | 11 +- testgen/ui/navigation/router.py | 6 +- testgen/ui/queries/test_definition_queries.py | 24 +- testgen/ui/services/query_service.py | 24 +- testgen/ui/views/app_log_modal.py | 89 +++ testgen/ui/views/project_settings.py | 5 +- testgen/ui/views/test_definitions.py | 108 ++-- testgen/ui/views/test_results.py | 62 +- testgen/ui/views/test_suites.py | 2 +- 30 files changed, 1207 insertions(+), 368 deletions(-) create mode 100644 testgen/template/dbupgrade/0103_incremental_upgrade.sql create mode 100644 testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_percent_above_generic.sql create mode 100644 testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_percent_within_generic.sql create mode 100644 testgen/template/flavors/generic/exec_query_tests/ex_data_match_2way_generic.sql create mode 100644 testgen/ui/views/app_log_modal.py diff --git a/testgen/commands/queries/execute_tests_query.py 
b/testgen/commands/queries/execute_tests_query.py index 820f14e..bc9fb4a 100644 --- a/testgen/commands/queries/execute_tests_query.py +++ b/testgen/commands/queries/execute_tests_query.py @@ -1,29 +1,6 @@ import typing -from testgen.common import CleanSQL, date_service, read_template_sql_file - - -def add_quote_to_identifiers(strInput): - keywords = [ - "select", - "from", - "where", - "order", - "by", - "having", - ] # NOTE: In future we might have to expand the list of keywords - - quoted_values = [] - for value in strInput.split(","): - value = value.strip() - if value.startswith('"') and value.endswith('"'): - quoted_values.append(value) - elif any(c.isupper() or c.isspace() or value.lower() in keywords for c in value): - quoted_values.append(f'"{value}"') - else: - quoted_values.append(value) - return ", ".join(quoted_values) - +from testgen.common import CleanSQL, AddQuotesToIdentifierCSV, date_service, read_template_sql_file class CTestExecutionSQL: flavor = "" @@ -47,11 +24,26 @@ def __init__(self, strProjectCode, strFlavor, strTestSuite, minutes_offset=0): self.today = date_service.get_now_as_string_with_offset(minutes_offset) self.minutes_offset = minutes_offset + def _AssembleDisplayParameters(self): + + lst_parms = ["column_name", "skip_errors", "baseline_ct", "baseline_unique_ct", "baseline_value", + "baseline_value_ct", "baseline_sum", "baseline_avg", "baseline_sd", "subset_condition", + "groupby_names", "having_condition", "window_date_column", "window_days", + "match_column_names", "match_subset_condition", "match_schema_name", "match_table_name", + "match_groupby_names", "match_having_condition", + ] + str_parms = "; ".join(f"{key}={self.dctTestParms[key]}" + for key in lst_parms + if key.lower() in self.dctTestParms and self.dctTestParms[key] not in [None, ""]) + str_parms = str_parms.replace("'", "`") + return str_parms + def _ReplaceParms(self, strInputString: str): strInputString = strInputString.replace("{PROJECT_CODE}", self.project_code) 
strInputString = strInputString.replace("{TEST_SUITE}", self.test_suite) strInputString = strInputString.replace("{SQL_FLAVOR}", self.flavor) strInputString = strInputString.replace("{TEST_RUN_ID}", self.test_run_id) + strInputString = strInputString.replace("{INPUT_PARAMETERS}", self._AssembleDisplayParameters()) strInputString = strInputString.replace("{RUN_DATE}", self.run_date) strInputString = strInputString.replace("{SUM_COLUMNS}", self.sum_columns) @@ -69,21 +61,24 @@ def _ReplaceParms(self, strInputString: str): # "COLUMN_NAMES", # "COL_NAME", # "COL_NAMES", - "MATCH_COLUMN_NAMES", - "MATCH_GROUPBY_NAMES", + # "MATCH_COLUMN_NAMES", + # "MATCH_GROUPBY_NAMES", # "MATCH_SUM_COLUMNS", ] for parm, value in self.dctTestParms.items(): if value: if parm.upper() in column_designators: - strInputString = strInputString.replace("{" + parm.upper() + "}", add_quote_to_identifiers(value)) + strInputString = strInputString.replace("{" + parm.upper() + "}", AddQuotesToIdentifierCSV(value)) else: strInputString = strInputString.replace("{" + parm.upper() + "}", value) else: strInputString = strInputString.replace("{" + parm.upper() + "}", "") if parm == "column_name": - strInputString = strInputString.replace("{COLUMN_NAME_DISPLAY}", value if value else "") + # Shows contents without double-quotes for display and aggregate expressions + strInputString = strInputString.replace("{COLUMN_NAME_NO_QUOTES}", value if value else "") + if parm == "subset_condition": + strInputString = strInputString.replace("{SUBSET_DISPLAY}", value.replace("'", "''") if value else "") # Adding escape character where ':' is referenced strInputString = strInputString.replace(":", "\\:") diff --git a/testgen/commands/queries/generate_tests_query.py b/testgen/commands/queries/generate_tests_query.py index 3828afc..273af1c 100644 --- a/testgen/commands/queries/generate_tests_query.py +++ b/testgen/commands/queries/generate_tests_query.py @@ -75,7 +75,7 @@ def GetTestDerivationQueriesAsList(self, 
booClean): lstTemplate = [CleanSQL(q) for q in lstTemplate] if len(lstQueries) == 0: - LOG.warning("No test templates were found") + LOG.warning("No funny CAT test generation templates were found") return lstTemplate diff --git a/testgen/common/clean_sql.py b/testgen/common/clean_sql.py index 641f20c..7443bd6 100644 --- a/testgen/common/clean_sql.py +++ b/testgen/common/clean_sql.py @@ -1,4 +1,4 @@ -__all__ = ["CleanSQL"] +__all__ = ["CleanSQL", "AddQuotesToIdentifierCSV"] import re @@ -14,3 +14,26 @@ def CleanSQL(strInput: str) -> str: parts = re.split(r"""("[^"]*"|'[^']*')""", strInput) parts[::2] = (" ".join(s.split()) for s in parts[::2]) # outside quotes return " ".join(parts) + + +def AddQuotesToIdentifierCSV(strInput: str) -> str: + # Keywords -- identifiers to quote + keywords = [ + "select", + "from", + "where", + "order", + "by", + "having", + ] + + quoted_values = [] + for value in strInput.split(","): + value = value.strip() + if value.startswith('"') and value.endswith('"'): + quoted_values.append(value) + elif any(c.isupper() or c.isspace() or value.lower() in keywords for c in value): + quoted_values.append(f'"{value}"') + else: + quoted_values.append(value) + return ", ".join(quoted_values) diff --git a/testgen/common/docker_service.py b/testgen/common/docker_service.py index 4a53145..3763fe3 100644 --- a/testgen/common/docker_service.py +++ b/testgen/common/docker_service.py @@ -24,7 +24,7 @@ def check_for_new_docker_release() -> str: if latest_tag != settings.VERSION: logger.warning( - f"There is a new TestGen docker image. Please pull the latest image version {latest_tag} at your leisure." + f"A new TestGen upgrade is available. Please update to version {latest_tag} for new features and improvements." 
) return latest_tag # noqa: TRY300 diff --git a/testgen/template/dbsetup/030_initialize_new_schema_structure.sql b/testgen/template/dbsetup/030_initialize_new_schema_structure.sql index 8b4ca07..d29aef3 100644 --- a/testgen/template/dbsetup/030_initialize_new_schema_structure.sql +++ b/testgen/template/dbsetup/030_initialize_new_schema_structure.sql @@ -445,6 +445,8 @@ CREATE TABLE test_types ( measure_uom VARCHAR(100), measure_uom_description VARCHAR(200), selection_criteria TEXT, + column_name_prompt TEXT, + column_name_help TEXT, default_parm_columns TEXT, default_parm_values TEXT, default_parm_prompts TEXT, @@ -509,6 +511,7 @@ CREATE TABLE test_results ( REFERENCES test_types, test_suite VARCHAR(200), test_definition_id UUID, + auto_gen BOOLEAN, test_time TIMESTAMP, starttime TIMESTAMP, endtime TIMESTAMP, diff --git a/testgen/template/dbsetup/050_populate_new_schema_metadata.sql b/testgen/template/dbsetup/050_populate_new_schema_metadata.sql index 2401eb9..57278de 100644 --- a/testgen/template/dbsetup/050_populate_new_schema_metadata.sql +++ b/testgen/template/dbsetup/050_populate_new_schema_metadata.sql @@ -100,57 +100,59 @@ n controls over data ingested and to make values more efficient, consistent and TRUNCATE TABLE test_types; INSERT INTO test_types - (id, test_type, test_name_short, test_name_long, test_description, except_message, measure_uom, - measure_uom_description, selection_criteria, default_parm_columns, default_parm_values, default_parm_prompts, - default_parm_help, default_severity, run_type, test_scope, dq_dimension, health_dimension, threshold_description, - usage_notes, active) -VALUES ('1001', 'Aggregate_No_Drops', 'Aggregate Minimum', 'Aggregate values per group are at or above reference', 'Tests that aggregate values for each set of column values are at least the same as reference dataset', 'Aggregate measure per set of column values is not at least the same as reference dataset.', 'Mismatched measures', NULL, NULL, 
'subset_condition,groupby_names,having_condition,match_column_names,match_schema_name,match_table_name,match_subset_condition,match_groupby_names,match_having_condition', NULL, 'TODO Fill in default_parm_prompts', NULL, 'Fail', 'QUERY', 'multi-column', 'Accuracy', 'Data Drift', 'Expected count of value combinations with lower or missing aggregate measure', NULL, 'N'), - ('1002', 'Aggregate_Increase', 'Aggregate Increase', 'Aggregate values per group exceed reference', 'Tests that aggregate values for each set of column values exceed values for reference dataset', 'Aggregate measure per set of column values fails to exceed the reference dataset.', 'Mismatched measures', NULL, NULL, 'subset_condition,groupby_names,having_condition,match_column_names,match_schema_name,match_table_name,match_subset_condition,match_groupby_names,match_having_condition', NULL, 'TODO Fill in default_parm_prompts match_schema_name,TODO Fill in default_parm_prompts match_table_name,TODO Fill in default_parm_prompts match_column_names,TODO Fill in default_parm_prompts match_subset_condition,TODO Fill in default_parm_prompts match_groupby_names,TODO Fill in default_parm_prompts match_having_condition,TODO Fill in default_parm_prompts subset_condition,TODO Fill in default_parm_prompts groupby_names,TODO Fill in default_parm_prompts having_condition', NULL, 'Fail', 'QUERY', 'multi-column', 'Accuracy', 'Data Drift', 'Expected count of value combinations with not exceeding aggregate measure', NULL, 'N'), - ('1003', 'Aggregate_Match', 'Aggregate Match', 'Aggregate values per group match reference', 'Tests for exact match in aggregate values for each set of column values vs. 
reference dataset', 'Aggregate measure per set of column values does not exactly match reference dataset.', 'Mismatched measures', NULL, NULL, 'subset_condition,groupby_names,having_condition,match_column_names,match_schema_name,match_table_name,match_subset_condition,match_groupby_names,match_having_condition', NULL, 'TODO Fill in default_parm_prompts', NULL, 'Fail', 'QUERY', 'multi-column', 'Consistency', 'Data Drift', 'Expected count of value combinations with non-matching aggregate measure', NULL, 'N'), - ('1004', 'Alpha_Trunc', 'Alpha Truncation', 'Maximum character count consistent', 'Tests that the maximum count of characters in a column value has not dropped vs. baseline data', 'Maximum length of values has dropped from prior expected length.', 'Values over max', NULL, 'general_type =''A'' AND max_length > 0 AND ( (min_length = avg_length AND max_length = avg_length) OR (numeric_ct <> value_ct ) ) AND functional_table_type NOT LIKE ''%window%'' /* The conditions below are to eliminate overlap with : LOV_Match (excluded selection criteria for this test_type), Pattern_Match (excluded selection criteria for this test_type), Constant (excluded functional_data_type Constant and Boolean) */ AND ( (distinct_value_ct NOT BETWEEN 2 AND 10 AND functional_data_type NOT IN ( ''Constant'', ''Boolean'') ) AND NOT ( fn_charcount(top_patterns, E'' \| '' ) = 1 AND fn_charcount(top_patterns, E'' \| '' ) IS NOT NULL AND REPLACE(SPLIT_PART(top_patterns, ''|'' , 2), ''N'' , '''' ) > ''''))', 'threshold_value', 'max_length', 'Maximum String Length at Baseline', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Maximum length expected', 'Alpha Truncation tests that the longest text value in a column hasn''t become shorter than the longest value at baseline. This could indicate a problem in a cumulative dataset, where prior values should still exist unchanged. 
A failure here would suggest that some process changed data that you would still expect to be present and matching its value when the column was profiled. This test would not be appropriate for an incremental or windowed dataset.', 'Y'), - ('1005', 'Avg_Shift', 'Average Shift', 'Column mean is consistent with reference', 'Tests for statistically-significant shift in mean value for column from average calculated at baseline.', 'Standardized difference between averages is over the selected threshold level.', 'Difference Measure', 'Cohen''s D Difference (0.20 small, 0.5 mod, 0.8 large, 1.2 very large, 2.0 huge)', 'general_type=''N'' AND distinct_value_ct > 10 AND functional_data_type ilike ''Measure%'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%''', 'baseline_value_ct,baseline_avg,baseline_sd,threshold_value', 'value_ct,avg_value,stdev_value,0.5::VARCHAR', 'Value Ct at Baseline,Mean at Baseline,Std Deviation at Baseline,Threshold Difference Measure ', NULL, 'Warning', 'CAT', 'column', 'Consistency', 'Data Drift', 'Standardized Difference Measure', 'Average Shift tests that the average of a numeric column has not significantly changed since baseline, when profiling was done. A significant shift may indicate errors in processing, differences in source data, or valid changes that may nevertheless impact assumptions in downstream data products. The test uses Cohen''s D, a statistical technique to identify significant shifts in a value. Cohen''s D measures the difference between the two averages, reporting results on a standardized scale, which can be interpreted via a rule-of-thumb from small to huge. Depending on your data, some difference may be expected, so it''s reasonable to adjust the threshold value that triggers test failure. This test works well for measures, or even for identifiers if you expect them to increment consistently. 
You may want to periodically adjust the expected threshold, or even the expected average value if you expect shifting over time. Consider this test along with Variability Increase. If variability rises too, process or measurement flaws could be at work. If variability remains consistent, the issue is more likely to be with the source data itself. ', 'Y'), - ('1006', 'Condition_Flag', 'Custom Condition', 'Column values match pre-defined condition', 'Tests that each record in the table matches a pre-defined, custom condition', 'Value(s) found not matching defined condition.', 'Values Failing', NULL, NULL, 'threshold_value,custom_query', NULL, 'Threshold Error Count,Custom SQL Expression', 'The number of errors that are acceptable before test fails.|Expression should evaluate to TRUE to register an error or FALSE if no error. An expression can reference only columns in the selected table.', 'Fail', 'CAT', 'custom', 'Validity', 'Schema Drift', 'Count of records that don''t meet test condition', 'Custom Condition is a business-rule test for a user-defined error condition based on the value of one or more columns. The condition is applied to each record within the table, and the count of records failing the condition is added up. If that count exceeds a threshold of errors, the test as a whole is failed. This test is ideal for error conditions that TestGen cannot automatically infer, and any condition that involves the values of more than one column in the same record. Performance of this test is fast, since it is performed together with other aggregate tests. 
Interpretation is based on the user-defined meaning of the test.', 'Y'), - ('1007', 'Constant', 'Constant Match', 'All column values match constant value', 'Tests that all values in the column match the constant value identified in baseline data', 'A constant value is expected for this column.', 'Mismatched values', NULL, NULL, 'baseline_value,threshold_value', NULL, 'Constant Value at Baseline,Threshold Error Count', 'The single, unchanging value of the column, per baseline|The number of errors that are acceptable before test fails.', 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Count of records with unexpected values', 'Constant Match tests that a single value determined to be a constant in baseline profiling is still the only value for the column that appears in subsequent versions of the dataset. Sometimes new data or business knowledge may reveal that the value is not a constant at all, even though only one value was present at profiling. In this case, you will want to disable this test. Alternatively, you can use the Value Match test to provide a limited number of valid values for the column.', 'Y'), - ('1008', 'CUSTOM', 'Custom Test', 'Custom-defined business rule', 'Custom SQL Test', 'Errors were detected according to test definition.', 'Errors found', 'Count of errors identified by query', NULL, 'custom_query', NULL, 'Custom SQL Query Returning Error Records', 'Query should return records indicating one or more errors. The test passes if no records are returned. Results of the query will be shown when you click `Review Source Data` for a failed test, so be sure to include enough data in your results to follow-up. 
+ (id, test_type, test_name_short, test_name_long, test_description, except_message, measure_uom, measure_uom_description, selection_criteria, column_name_prompt, column_name_help, default_parm_columns, default_parm_values, default_parm_prompts, default_parm_help, default_severity, run_type, test_scope, dq_dimension, health_dimension, threshold_description, usage_notes, active) +VALUES ('1004', 'Alpha_Trunc', 'Alpha Truncation', 'Maximum character count consistent', 'Tests that the maximum count of characters in a column value has not dropped vs. baseline data', 'Maximum length of values has dropped from prior expected length.', 'Values over max', NULL, 'general_type =''A'' AND max_length > 0 AND ( (min_length = avg_length AND max_length = avg_length) OR (numeric_ct <> value_ct ) ) AND functional_table_type NOT LIKE ''%window%'' /* The conditions below are to eliminate overlap with : LOV_Match (excluded selection criteria for this test_type), Pattern_Match (excluded selection criteria for this test_type), Constant (excluded functional_data_type Constant and Boolean) */ AND ( (distinct_value_ct NOT BETWEEN 2 AND 10 AND functional_data_type NOT IN ( ''Constant'', ''Boolean'') ) AND NOT ( fn_charcount(top_patterns, E'' \| '' ) = 1 AND fn_charcount(top_patterns, E'' \| '' ) IS NOT NULL AND REPLACE(SPLIT_PART(top_patterns, ''|'' , 2), ''N'' , '''' ) > ''''))', NULL, NULL, 'threshold_value', 'max_length', 'Maximum String Length at Baseline', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Maximum length expected', 'Alpha Truncation tests that the longest text value in a column hasn''t become shorter than the longest value at baseline. This could indicate a problem in a cumulative dataset, where prior values should still exist unchanged. A failure here would suggest that some process changed data that you would still expect to be present and matching its value when the column was profiled. 
This test would not be appropriate for an incremental or windowed dataset.', 'Y'), + ('1005', 'Avg_Shift', 'Average Shift', 'Column mean is consistent with reference', 'Tests for statistically-significant shift in mean value for column from average calculated at baseline.', 'Standardized difference between averages is over the selected threshold level.', 'Difference Measure', 'Cohen''s D Difference (0.20 small, 0.5 mod, 0.8 large, 1.2 very large, 2.0 huge)', 'general_type=''N'' AND distinct_value_ct > 10 AND functional_data_type ilike ''Measure%'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%''', NULL, NULL, 'baseline_value_ct,baseline_avg,baseline_sd,threshold_value', 'value_ct,avg_value,stdev_value,0.5::VARCHAR', 'Value Ct at Baseline,Mean at Baseline,Std Deviation at Baseline,Threshold Difference Measure ', NULL, 'Warning', 'CAT', 'column', 'Consistency', 'Data Drift', 'Standardized Difference Measure', 'Average Shift tests that the average of a numeric column has not significantly changed since baseline, when profiling was done. A significant shift may indicate errors in processing, differences in source data, or valid changes that may nevertheless impact assumptions in downstream data products. The test uses Cohen''s D, a statistical technique to identify significant shifts in a value. Cohen''s D measures the difference between the two averages, reporting results on a standardized scale, which can be interpreted via a rule-of-thumb from small to huge. Depending on your data, some difference may be expected, so it''s reasonable to adjust the threshold value that triggers test failure. This test works well for measures, or even for identifiers if you expect them to increment consistently. You may want to periodically adjust the expected threshold, or even the expected average value if you expect shifting over time. Consider this test along with Variability Increase. 
If variability rises too, process or measurement flaws could be at work. If variability remains consistent, the issue is more likely to be with the source data itself. ', 'Y'), + ('1007', 'Constant', 'Constant Match', 'All column values match constant value', 'Tests that all values in the column match the constant value identified in baseline data', 'A constant value is expected for this column.', 'Mismatched values', NULL, 'TEMPLATE', NULL, NULL, 'baseline_value,threshold_value', NULL, 'Constant Value at Baseline,Threshold Error Count', 'The single, unchanging value of the column, per baseline|The number of errors that are acceptable before test fails.', 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Count of records with unexpected values', 'Constant Match tests that a single value determined to be a constant in baseline profiling is still the only value for the column that appears in subsequent versions of the dataset. Sometimes new data or business knowledge may reveal that the value is not a constant at all, even though only one value was present at profiling. In this case, you will want to disable this test. 
Alternatively, you can use the Value Match test to provide a limited number of valid values for the column.', 'Y'), + ('1009', 'Daily_Record_Ct', 'Daily Records', 'All dates present within date range', 'Tests for presence of every calendar date within min/max date range, per baseline data', 'Not every date value between min and max dates is present, unlike at baseline.', 'Missing dates', NULL, 'general_type= ''D'' AND date_days_present > 21 AND date_days_present - (DATEDIFF(''day'', ''1800-01-05''::DATE, max_date) - DATEDIFF(''day'', ''1800-01-05''::DATE, min_date) + 1) = 0 AND future_date_ct::FLOAT / NULLIF(value_ct, 0) <= 0.75', NULL, NULL, 'threshold_value', '0', 'Threshold Missing Calendar Days', NULL, 'Warning', 'CAT', 'column', 'Completeness', 'Volume', 'Missing calendar days within min/max range', 'Daily Records tests that at least one record is present for every day within the minimum and maximum date range for the column. The test is relevant for transactional data, where you would expect at least one transaction to be recorded each day. A failure here would suggest missing records for the number of days identified without data. You can adjust the threshold to accept a number of days that you know legitimately have no records. 
', 'Y'), + ('1011', 'Dec_Trunc', 'Decimal Truncation', 'Sum of fractional values at or above reference', 'Tests for decimal truncation by confirming that the sum of fractional values in data is no less than the sum at baseline', 'The sum of fractional values is under baseline, which may indicate decimal truncation', 'Fractional sum', 'The sum of all decimal values from all data for this column', 'fractional_sum IS NOT NULL AND functional_table_type LIKE''%cumulative%''', NULL, NULL, 'threshold_value', 'ROUND(fractional_sum, 0)', 'Sum of Fractional Values at Baseline', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Minimum expected sum of all fractional values', 'Decimal Truncation tests that the fractional (decimal) part of a numeric column has not been truncated since Baseline. This works by summing all the fractional values after the decimal point and confirming that the total is at least equal to the fractional total at baseline. This could indicate a problem in a cumulative dataset, where prior values should still exist unchanged. A failure here would suggest that some process changed data that you would still expect to be present and matching its value when the column was profiled. This test would not be appropriate for an incremental or windowed dataset.', 'Y'), + ('1012', 'Distinct_Date_Ct', 'Date Count', 'Count of distinct dates at or above reference', 'Tests that the count of distinct dates referenced in the column has not dropped vs. 
baseline data', 'Drop in count of unique dates recorded in column.', 'Unique dates', 'Count of unique dates in transactional date column', 'general_type=''D'' and date_days_present IS NOT NULL AND functional_table_type NOT LIKE ''%window%''', NULL, NULL, 'baseline_value,threshold_value', 'date_days_present,date_days_present', 'Distinct Date Count at Baseline,Min Expected Date Count', NULL, 'Fail', 'CAT', 'column', 'Timeliness', 'Recency', 'Minimum distinct date count expected', 'Date Count tests that the count of distinct dates present in the column has not dropped since baseline. The test is relevant for cumulative datasets, where old records are retained. A failure here would indicate missing records, which could be caused by a processing error or changed upstream data sources.', 'Y'), + ('1013', 'Distinct_Value_Ct', 'Value Count', 'Count of distinct values has not dropped', 'Tests that the count of unique values in the column has not changed from baseline.', 'Count of unique values in column has changed from baseline.', 'Unique Values', NULL, 'distinct_value_ct between 2 and 10 AND value_ct > 0 AND NOT (coalesce(top_freq_values,'''') > '''' AND distinct_value_ct BETWEEN 2 and 10) AND NOT (lower(functional_data_type) LIKE ''%sequence%'' OR lower(functional_data_type) LIKE ''%measurement%'' OR functional_data_type LIKE ''%date%'' OR general_type = ''D'')', NULL, NULL, 'baseline_value_ct,threshold_value', 'distinct_value_ct,distinct_value_ct', 'Distinct Value Count at Baseline,Min Expected Value Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected distinct value count', 'Value Count tests that the count of unique values present in the column has not dropped since baseline. The test is relevant for cumulative datasets, where old records are retained, or for any dataset where you would expect a set number of distinct values should be present. 
A failure here would indicate missing records or a change in categories or value assignment.', 'Y'), + ('1014', 'Email_Format', 'Email Format', 'Email is correctly formatted', 'Tests that non-blank, non-empty email addresses match the standard format', 'Invalid email address formats found.', 'Invalid emails', 'Number of emails that do not match standard format', 'std_pattern_match=''EMAIL''', NULL, NULL, 'threshold_value', '0', 'Maximum Invalid Email Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected count of invalid email addresses', NULL, 'Y'), + ('1015', 'Future_Date', 'Past Dates', 'Latest date is prior to test run date', 'Tests that the maximum date referenced in the column is no greater than the test date, consistent with baseline data', 'Future date found when absent in baseline data.', 'Future dates', NULL, 'general_type=''D''AND future_date_ct = 0', NULL, NULL, 'threshold_value', '0', 'Maximum Future Date Count', NULL, 'Fail', 'CAT', 'column', 'Timeliness', 'Recency', 'Expected count of future dates', NULL, 'Y'), + ('1016', 'Future_Date_1Y', 'Future Year', 'Future dates within year of test run date', 'Tests that the maximum date referenced in the column is no greater than one year beyond the test date, consistent with baseline data', 'Future date beyond one-year found when absent in baseline.', 'Future dates post 1 year', NULL, 'general_type=''D''AND future_date_ct > 0 AND max_date <=''{AS_OF_DATE}''::DATE + INTERVAL''365 DAYS''', NULL, NULL, 'threshold_value', '0', 'Maximum Post 1-Year Future Date Count', NULL, 'Fail', 'CAT', 'column', 'Timeliness', 'Recency', 'Expected count of future dates beyond one year', 'Future Year looks for date values in the column that extend beyond one year after the test date. This would be appropriate for transactional dates where you would expect to find dates in the near future, but not beyond one year ahead. 
Errors could indicate invalid entries or possibly dummy dates representing blank values.', 'Y'), + ('1017', 'Incr_Avg_Shift', 'New Shift', 'New record mean is consistent with reference', 'Tests for statistically-significant shift in mean of new values for column compared to average calculated at baseline.', 'Significant shift in average of new values vs. baseline avg', 'Z-score of mean shift', 'Absolute Z-score (number of SD''s outside mean) of prior avg - incremental avg', 'general_type=''N'' AND distinct_value_ct > 10 AND functional_data_type ilike ''Measure%'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%''', NULL, NULL, 'baseline_value_ct,baseline_sum,baseline_avg,baseline_sd,threshold_value', 'value_ct,(avg_value * value_ct)::FLOAT,avg_value,stdev_value,2', 'Value Count at Baseline,Sum at Baseline,Mean Value at Baseline,Std Deviation at Baseline,Threshold Max Z-Score', NULL, 'Warning', 'CAT', 'column', 'Accuracy', 'Data Drift', 'Maximum Z-Score (number of SD''s beyond mean) expected', 'This is a more sensitive test than Average Shift, because it calculates an incremental difference in the average of new values compared to the average of values at baseline. This is appropriate for a cumulative dataset only, because it calculates the average of new entries based on the assumption that the count and average of records present at baseline are still present at the time of the test. This test compares the mean of new values with the standard deviation of the baseline average to calculate a Z-score. If the new mean falls outside the Z-score threshold, a shift is detected. Potential Z-score thresholds may range from 0 to 3, depending on the sensitivity you prefer. A failed test could indicate a quality issue or a legitimate shift in new data that should be noted and assessed by business users. Consider this test along with Variability Increase. If variability rises too, process, methodology or measurement flaws could be at issue. 
If variability remains consistent, the problem is more likely to be with the source data itself.', 'Y'), + ('1018', 'LOV_All', 'All Values', 'List of expected values all present in column', 'Tests that all values match a pipe-delimited list of expected values and that all expected values are present', 'Column values found don''t exactly match the expected list of values', 'Values found', NULL, NULL, NULL, NULL, 'threshold_value', NULL, 'List of Expected Values', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'List of values expected, in form (''Val1'',''Val2)', 'This is a more restrictive form of Value Match, testing that all values in the dataset match the list provided, and also that all values present in the list appear at least once in the dataset. This would be appropriate for tables where all category values in the column are represented at least once.', 'Y'), + ('1019', 'LOV_Match', 'Value Match', 'All column values present in expected list', 'Tests that all values in the column match the list-of-values identified in baseline data.', 'Values not matching expected List-of-Values from baseline.', 'Non-matching records', NULL, 'top_freq_values > '''' AND distinct_value_ct BETWEEN 2 and 10 AND NOT (functional_data_type LIKE ''%date%'' OR lower(datatype_suggestion) LIKE ''%date%'' OR general_type = ''D'' OR lower(column_name) IN (''file_name'', ''filename''))', NULL, NULL, 'baseline_value,threshold_value', '''('' || SUBSTRING( CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 2) > '''' THEN '','''''' || TRIM( REPLACE ( SPLIT_PART(top_freq_values, ''|'' , 2), '''''''' , '''''''''''' ) ) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 4) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 4), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 6) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 6), '''''''' , '''''''''''' )) || 
'''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 8) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 8), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 10) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 10), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 12) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 12), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 14) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 14), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 16) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 16), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 18) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 18), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 20) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 20), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END, 2, 999) || '')'',0', 'List of Expected Values,Threshold Error Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'List of values expected, in form (''Val1'',''Val2)', 'This tests that all values in the column match the hard-coded list provided. This is relevant when the list of allowable values is small and not expected to change often. 
Even if new values might occasionally be added, this test is useful for downstream data products to provide warning that assumptions and logic may need to change.', 'Y'), + ('1020', 'Min_Date', 'Minimum Date', 'All dates on or after set minimum', 'Tests that the earliest date referenced in the column is no earlier than baseline data', 'The earliest date value found is before the earliest value at baseline.', 'Dates prior to limit', NULL, 'general_type=''D''and min_date IS NOT NULL AND distinct_value_ct > 1', NULL, NULL, 'baseline_value,threshold_value', 'min_date,0', 'Minimum Date at Baseline,Threshold Error Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected count of dates prior to minimum', 'This test is appropriate for a cumulative dataset only, because it assumes all prior values are still present. It''s appropriate where new records are added with more recent dates, but old dates do not change.', 'Y'), + ('1021', 'Min_Val', 'Minimum Value', 'All values at or above set minimum', 'Tests that the minimum value present in the column is no lower than the minimum value in baseline data', 'Minimum column value less than baseline.', 'Values under limit', NULL, 'general_type=''N''and min_value IS NOT NULL AND (distinct_value_ct >= 2 OR (distinct_value_ct=2 and min_value<>0 and max_value<>1))', NULL, NULL, 'baseline_value,threshold_value', 'min_value,0', 'Minimum Value at Baseline,Threshold Error Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected count of values under limit', 'This test is appropriate for a cumulative dataset only, assuming all prior values are still present. It is also appropriate for any measure that has an absolute, definable minimum value, or a heuristic that makes sense for valid data.', 'Y'), + ('1022', 'Missing_Pct', 'Percent Missing', 'Consistent ratio of missing values', 'Tests for statistically-significant shift in percentage of missing values in column vs. 
baseline data', 'Significant shift in percent of missing values vs. baseline.', 'Difference measure', 'Cohen''s H Difference (0.20 small, 0.5 mod, 0.8 large, 1.2 very large, 2.0 huge)', 'record_ct <> value_ct', NULL, NULL, 'baseline_ct,baseline_value_ct,threshold_value', 'record_ct,value_ct,2::VARCHAR(10)', 'Baseline Record Count,Baseline Value Count,Standardized Difference Measure', NULL, 'Warning', 'CAT', 'column', 'Completeness', 'Data Drift', 'Expected maximum Cohen''s H Difference', 'This test uses Cohen''s H, a statistical test to identify a significant difference between two ratios. Results are reported on a standardized scale, which can be interpreted via a rule-of-thumb from small to huge. An uptick in missing data may indicate a collection issue at the source. A larger change may indicate a processing failure. A drop in missing data may also be significant, if it affects assumptions built into analytic products downstream. You can refine the expected threshold value as you view legitimate results of the measure over time.', 'Y'), + ('1023', 'Monthly_Rec_Ct', 'Monthly Records', 'At least one date per month present within date range', 'Tests for presence of at least one date per calendar month within min/max date range, per baseline data', 'At least one date per month expected in min/max date range.', 'Missing months', 'Calendar months without date values present', 'general_type= ''D'' AND date_days_present > 1 AND date_months_present > 2 AND date_months_present - (datediff( ''MON'' , min_date, max_date) + 1) = 0 AND future_date_ct::FLOAT / NULLIF(value_ct, 0) <= 0.75', NULL, NULL, 'threshold_value', '0', 'Threshold Count of Months without Dates', NULL, 'Fail', 'CAT', 'column', 'Completeness', 'Volume', 'Expected maximum count of calendar months without dates present', 'Monthly Records tests that at least one record is present for every calendar month within the minimum and maximum date range for the column. 
The test is relevant for transactional data, where you would expect at least one transaction to be recorded each month. A failure here would suggest missing records for the number of months identified without data. You can adjust the threshold to accept a number of month that you know legitimately have no records.', 'Y'), + ('1024', 'Outlier_Pct_Above', 'Outliers Above', 'Consistent outlier counts over 2 SD above mean', 'Tests that percent of outliers over 2 SD above Mean doesn''t exceed threshold', 'Percent of outliers exceeding 2 SD above the mean is greater than expected threshold.', 'Pct records over limit', NULL, 'general_type = ''N'' AND functional_data_type ilike ''Measure%'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%'' AND value_ct <> distinct_value_ct AND distinct_value_ct > 10 AND stdev_value > 0 AND avg_value IS NOT NULL AND NOT (distinct_value_ct = max_value - min_value + 1 AND distinct_value_ct > 2)', NULL, NULL, 'baseline_avg,baseline_sd,threshold_value', 'avg_value,stdev_value,0.05', 'Baseline Mean, Baseline Std Deviation, Pct Records over 2 SD', NULL, 'Warning', 'CAT', 'column', 'Accuracy', 'Data Drift', 'Expected maximum pct records over upper 2 SD limit', 'This test counts the number of data points that may be considered as outliers, determined by whether their value exceeds 2 standard deviations above the mean at baseline. Assuming a normal distribution, a small percentage (defaulted to 5%) of outliers is expected. The actual number may vary for different distributions. The expected threshold reflects the maximum percentage of outliers you expect to see. This test uses the baseline mean rather than the mean for the latest dataset to capture systemic shift as well as individual outliers. 
', 'Y'), + ('1025', 'Outlier_Pct_Below', 'Outliers Below', 'Consistent outlier counts under 2 SD below mean', 'Tests that percent of outliers over 2 SD below Mean doesn''t exceed threshold', 'Percent of outliers exceeding 2 SD below the mean is greater than expected threshold.', 'Pct records under limit', NULL, 'general_type = ''N'' AND functional_data_type ilike ''Measure%'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%'' AND value_ct <> distinct_value_ct AND distinct_value_ct > 10 AND stdev_value > 0 AND avg_value IS NOT NULL AND NOT (distinct_value_ct = max_value - min_value + 1 AND distinct_value_ct > 2)', NULL, NULL, 'baseline_avg,baseline_sd,threshold_value', 'avg_value,stdev_value,0.05', 'Baseline Mean, Baseline Std Deviation, Pct Records over 2 SD', NULL, 'Warning', 'CAT', 'column', 'Accuracy', 'Data Drift', 'Expected maximum pct records over lower 2 SD limit', 'This test counts the number of data points that may be considered as outliers, determined by whether their value exceeds 2 standard deviations below the mean at baseline. Assuming a normal distribution, a small percentage (defaulted to 5%) of outliers is expected. The actual number may vary for different distributions. The expected threshold reflects the maximum percentage of outliers you expect to see. This test uses the baseline mean rather than the mean for the latest dataset to capture systemic shift as well as individual outliers. 
', 'Y'), + ('1026', 'Pattern_Match', 'Pattern Match', 'Column values match alpha-numeric pattern', 'Tests that all values in the column match the same alpha-numeric pattern identified in baseline data', 'Alpha values do not match consistent pattern in baseline.', 'Pattern Mismatches', NULL, 'fn_charcount(top_patterns, E'' \| '' ) = 1 AND REPLACE(SPLIT_PART(top_patterns, ''|'' , 2), ''N'' , '''' ) > '''' AND distinct_value_ct > 10', NULL, NULL, 'baseline_value,threshold_value', 'trim(REPLACE(REPLACE(REPLACE(SPLIT_PART(top_patterns, '' | '', 2), ''A'', ''[A-Z]''), ''N'', ''[0-9]''), ''a'', ''[a-z]'')),0', 'Pattern at Baseline,Threshold Error Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected count of pattern mismatches', 'This test is appropriate for character fields that are expected to appear in a consistent format. It uses pattern matching syntax as appropriate for your database: REGEX matching if available, otherwise LIKE expressions. The expected threshold is the number of records that fail to match the defined pattern.', 'Y'), + ('1028', 'Recency', 'Recency', 'Latest date within expected range of test date', 'Tests that the latest date in column is within a set number of days of the test date', 'Most recent date value not within expected days of test date.', 'Days before test', 'Number of days that most recent date precedes the date of test', 'general_type= ''D'' AND max_date <= run_date AND NOT column_name IN ( ''filedate'' , ''file_date'' ) AND NOT functional_data_type IN (''Future Date'', ''Schedule Date'') AND DATEDIFF( ''DAY'' , max_date, run_date) <= 62', NULL, NULL, 'threshold_value', 'CASE WHEN DATEDIFF( ''DAY'' , max_date, run_date) <= 3 THEN DATEDIFF(''DAY'', max_date, run_date) + 3 WHEN DATEDIFF(''DAY'', max_date, run_date) <= 7 then DATEDIFF(''DAY'', max_date, run_date) + 7 WHEN DATEDIFF( ''DAY'' , max_date, run_date) <= 31 THEN CEILING( DATEDIFF( ''DAY'' , max_date, run_date)::FLOAT / 7.0) * 7 WHEN DATEDIFF( ''DAY'' , 
max_date, run_date) > 31 THEN CEILING( DATEDIFF( ''DAY'' , max_date, run_date)::FLOAT / 30.0) * 30 END', 'Threshold Maximum Days before Test', NULL, 'Warning', 'CAT', 'column', 'Timeliness', 'Recency', 'Expected maximum count of days preceding test date', 'This test evaluates recency based on the latest referenced dates in the column. The test is appropriate for transactional dates and timestamps. The test can be especially valuable because timely data deliveries themselves may not assure that the most recent data is present. You can adjust the expected threshold to the maximum number of days that you expect the data to age before the dataset is refreshed. ', 'Y'), + ('1030', 'Required', 'Required Entry', 'Required non-null value present', 'Tests that a non-null value is present in each record for the column, consistent with baseline data', 'Every record for this column is expected to be filled, but some are missing.', 'Missing values', NULL, 'record_ct = value_ct', NULL, NULL, 'threshold_value', '0', 'Threshold Missing Value Count', NULL, 'Fail', 'CAT', 'column', 'Completeness', 'Schema Drift', 'Expected count of missing values', NULL, 'Y'), + ('1033', 'Street_Addr_Pattern', 'Street Address', 'Enough street address entries match defined pattern', 'Tests for percent of records matching standard street address pattern.', 'Percent of values matching standard street address format is under expected threshold.', 'Percent matches', 'Percent of records that match street address pattern', '(std_pattern_match=''STREET_ADDR'') AND (avg_length <> round(avg_length)) AND (avg_embedded_spaces BETWEEN 2 AND 6) AND (avg_length < 35)', NULL, NULL, 'threshold_value', '75', 'Threshold Pct that Match Address Pattern', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected percent of records that match standard street address pattern', 'The street address pattern used in this test should match the vast majority of USA addresses. 
You can adjust the threshold percent of matches based on the results you are getting -- you may well want to tighten it to make the test more sensitive to invalid entries.', 'Y'), + ('1034', 'Unique', 'Unique Values', 'Each column value is unique', 'Tests that no values for the column are repeated in multiple records.', 'Column values should be unique per row.', 'Duplicate values', 'Count of non-unique values', 'record_ct > 500 and record_ct = distinct_value_ct and value_ct > 0', NULL, NULL, 'threshold_value', '0', 'Threshold Duplicate Value Count', NULL, 'Fail', 'CAT', 'column', 'Uniqueness', 'Schema Drift', 'Expected count of duplicate values', 'This test is ideal when the database itself does not enforce a primary key constraint on the table. It serves as an independent check on uniqueness. It''s also useful when there are a small number of exceptions to uniqueness, which can be reflected in the expected threshold count of duplicates.', 'Y'), + ('1035', 'Unique_Pct', 'Percent Unique', 'Consistent ratio of unique values', 'Tests for statistically-significant shift in percentage of unique values vs. baseline data.', 'Significant shift in percent of unique values vs. baseline.', 'Difference measure', 'Cohen''s H Difference (0.20 small, 0.5 mod, 0.8 large, 1.2 very large, 2.0 huge)', 'distinct_value_ct > 10', NULL, NULL, 'baseline_value_ct,baseline_unique_ct,threshold_value', 'value_ct,distinct_value_ct,0.5', 'Value Count at Baseline,Distinct Value Count at Baseline,Standardized Difference Measure (0 to 1)', NULL, 'Warning', 'CAT', 'column', 'Uniqueness', 'Data Drift', 'Expected maximum Cohen''s H Difference', 'You can think of this as a test of similarity that measures whether the percentage of unique values is consistent with the percentage at baseline. A significant change might indicate duplication or a telling shift in cardinality between entities. The test uses Cohen''s H, a statistical test to identify a significant difference between two ratios. 
Results are reported on a standardized scale, which can be interpreted via a rule-of-thumb from small to huge. You can refine the expected threshold value as you view legitimate results of the measure over time.', 'Y'), + ('1036', 'US_State', 'US State', 'Column value is two-letter US state code', 'Tests that the recorded column value is a valid US state.', 'Column Value is not a valid US state.', 'Not US States', 'Values that do not match 2-character US state abbreviations.', 'general_type= ''A'' AND column_name ILIKE ''%state%'' AND distinct_value_ct < 70 AND max_length = 2', NULL, NULL, 'threshold_value', '0', 'Threshold Count not Matching State Abbreviations', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected count of values that are not US state abbreviations', 'This test validates entries against a fixed list of two-character US state codes and related Armed Forces codes.', 'Y'), + ('1037', 'Weekly_Rec_Ct', 'Weekly Records', 'At least one date per week present within date range', 'Tests for presence of at least one date per calendar week within min/max date range, per baseline data', 'At least one date per week expected in min/max date range.', 'Missing weeks', 'Calendar weeks without date values present', 'general_type= ''D'' AND date_days_present > 1 AND date_weeks_present > 3 AND date_weeks_present - (DATEDIFF(''week'', ''1800-01-05''::DATE, max_date) - DATEDIFF(''week'', ''1800-01-05''::DATE, min_date) + 1) = 0 AND future_date_ct::FLOAT / NULLIF(value_ct, 0) <= 0.75', NULL, NULL, 'threshold_value', '0', 'Threshold Weeks without Dates', NULL, 'Fail', 'CAT', 'column', 'Completeness', 'Volume', 'Expected maximum count of calendar weeks without dates present', 'Weekly Records tests that at least one record is present for every calendar week within the minimum and maximum date range for the column. The test is relevant for transactional data, where you would expect at least one transaction to be recorded each week. 
A failure here would suggest missing records for the number of weeks identified without data. You can adjust the threshold to accept a number of weeks that you know legitimately have no records.', 'Y'), + ('1040', 'Variability_Increase', 'Variability Increase', 'Variability has increased above threshold', 'Tests that the spread or dispersion of column values has increased significantly over baseline, indicating a drop in stability of the measure.', 'The Standard Deviation of the measure has increased beyond the defined threshold. This could signal a change in a process or a data quality issue.', 'Pct SD shift', 'Percent of baseline Standard Deviation', 'general_type = ''N'' AND functional_data_type ilike ''Measure%'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%'' AND value_ct <> distinct_value_ct AND distinct_value_ct > 10 AND stdev_value > 0 AND avg_value IS NOT NULL AND NOT (distinct_value_ct = max_value - min_value + 1 AND distinct_value_ct > 2)', NULL, NULL, 'baseline_sd,threshold_value', 'stdev_value,120', 'Std Deviation at Baseline,Expected Maximum Percent', NULL, 'Warning', 'CAT', 'column', 'Accuracy', 'Data Drift', 'Expected maximum pct of baseline Standard Deviation (SD)', 'This test looks for percent shifts in standard deviation as a measure of the stability of a measure over time. A significant change could indicate that new values are erroneous, or that the cohort being evaluated is significantly different from baseline. An increase in particular could mark new problems in measurement, a more heterogeneous cohort, or that significant outliers have been introduced. Consider this test along with Average Shift and New Shift. If the average shifts as well, there may be a fundamental shift in the dataset or process used to collect the data point. This might suggest a data shift that should be noted and assessed by business users. If the average does not shift, this may point to a data quality or data collection problem. 
', 'Y'), + ('1041', 'Variability_Decrease', 'Variability Decrease', 'Variability has decreased below threshold', 'Tests that the spread or dispersion of column values has decreased significantly over baseline, indicating a shift in stability of the measure. This could signal a change in a process or a data quality issue.', 'The Standard Deviation of the measure has decreased below the defined threshold. This could signal a change in a process or a data quality issue.', 'Pct SD shift', 'Percent of baseline Standard Deviation', 'general_type = ''N'' AND functional_data_type ilike ''Measure%'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%'' AND value_ct <> distinct_value_ct AND distinct_value_ct > 10 AND stdev_value > 0 AND avg_value IS NOT NULL AND NOT (distinct_value_ct = max_value - min_value + 1 AND distinct_value_ct > 2)', NULL, NULL, 'baseline_sd,threshold_value', 'stdev_value, 80', 'Std Deviation at Baseline,Expected Minimum Percent', NULL, 'Warning', 'CAT', 'column', 'Accuracy', 'Data Drift', 'Expected minimum pct of baseline Standard Deviation (SD)', 'This test looks for percent shifts in standard deviation as a measure of the stability of a measure over time. A significant change could indicate that new values are erroneous, or that the cohort being evaluated is significantly different from baseline. A decrease in particular could indicate an improved process, better precision in measurement, the elimination of outliers, or a more homogeneous cohort. 
', 'Y'), + ('1042', 'Valid_Month', 'Valid Month', 'Valid calendar month in expected format', 'Tests for the presence of a valid representation of a calendar month consistent with the format at baseline.', 'Column values are not a valid representation of a calendar month consistent with the format at baseline.', 'Invalid months', NULL, 'functional_data_type = ''Period Month''', NULL, NULL, 'threshold_value,baseline_value', '0,CASE WHEN max_length > 3 AND initcap(min_text) = min_text THEN ''''''January'''',''''February'''',''''March'''',''''April'''',''''May'''',''''June'''',''''July'''',''''August'''',''''September'''',''''October'''',''''November'''',''''December'''''' WHEN max_length > 3 AND upper(min_text) = min_text THEN ''''''JANUARY'''',''''FEBRUARY'''',''''MARCH'''',''''APRIL'''',''''MAY'''',''''JUNE'''',''''JULY'''',''''AUGUST'''',''''SEPTEMBER'''',''''OCTOBER'''',''''NOVEMBER'''',''''DECEMBER'''''' WHEN max_length > 3 AND lower(min_text) = min_text THEN ''''''january'''',''''february'''',''''march'''',''''april'''',''''may'''',''''june'''',''''july'''',''''august'''',''''september'''',''''october'''',''''november'''',''''december'''''' WHEN max_length = 3 AND initcap(min_text) = min_text THEN ''''''Jan'''',''''Feb'''',''''Mar'''',''''Apr'''',''''May'''',''''Jun'''',''''Jul'''',''''Aug'''',''''Sep'''',''''Oct'''',''''Nov'''',''''Dec'''''' WHEN max_length = 3 AND upper(min_text) = min_text THEN ''''''JAN'''',''''FEB'''',''''MAR'''',''''APR'''',''''MAY'''',''''JUN'''',''''JUL'''',''''AUG'''',''''SEP'''',''''OCT'''',''''NOV'''',''''DEC'''''' WHEN max_length = 3 AND lower(min_text) = min_text THEN ''''''jan'''',''''feb'''',''''mar'''',''''apr'''',''''may'''',''''jun'''',''''jul'''',''''aug'''',''''sep'''',''''oct'''',''''nov'''',''''dec'''''' WHEN max_length = 2 AND min_text = ''01'' THEN ''''''01'''',''''02'''',''''03'''',''''04'''',''''05'''',''''06'''',''''07'''',''''08'''',''''09'''',''''10'''',''''11'''',''''12'''''' WHEN max_length = 2 AND min_text = ''1'' 
THEN ''''''1'''',''''2'''',''''3'''',''''4'''',''''5'''',''''6'''',''''7'''',''''8'''',''''9'''',''''10'''',''''11'''',''''12'''''' WHEN min_value = 1 THEN ''1,2,3,4,5,6,7,8,9,10,11,12'' ELSE ''NULL'' END', 'Threshold Invalid Months,Valid Month List', 'The acceptable number of records with invalid months present.|List of valid month values for this field, in quotes if field is numeric, separated by commas.', 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected count of invalid months', NULL, 'N'), + ('1043', 'Valid_Characters', 'Valid Characters', 'Column contains no invalid characters', 'Tests for the presence of non-printing characters, leading spaces, or surrounding quotes.', 'Invalid characters, such as non-printing characters, leading spaces, or surrounding quotes, were found.', 'Invalid records', 'Expected count of values with invalid characters', 'general_type = ''A''', NULL, NULL, 'threshold_value', '0', NULL, 'The acceptable number of records with invalid character values present.', 'Warning', 'CAT', 'column', 'Validity', 'Schema Drift', 'Threshold Invalid Value Count', 'This test looks for the presence of non-printing ASCII characters that are considered non-standard in basic text processing. It also identifies leading spaces and values enclosed in quotes. 
Values that fail this test may be artifacts of data conversion, or just more difficult to process or analyze downstream.', 'N'), + ('1044', 'Valid_US_Zip', 'Valid US Zip', 'Valid USA Postal Codes', 'Tests that postal codes match the 5 or 9 digit standard US format', 'Invalid US Zip Code formats found.', 'Invalid Zip Codes', 'Expected count of values with invalid Zip Codes', 'functional_data_type = ''Zip''', NULL, NULL, 'threshold_value', '0', NULL, NULL, 'Warning', 'CAT', 'column', 'Validity', 'Schema Drift', 'Threshold Invalid Value Count', NULL, 'Y'), + ('1045', 'Valid_US_Zip3', 'Valid US Zip-3 ', 'Valid USA Zip-3 Prefix', 'Tests that postal codes match the 3 digit format of a regional prefix.', 'Invalid 3-digit US Zip Code regional prefix formats found.', 'Invalid Zip-3 Prefix', 'Expected count of values with invalid Zip-3 Prefix Codes', 'functional_data_type = ''Zip3''', NULL, NULL, 'threshold_value', '0', NULL, NULL, 'Warning', 'CAT', 'column', 'Validity', 'Schema Drift', 'Threshold Invalid Zip3 Count', 'This test looks for the presence of values that fail to match the three-digit numeric code expected for US Zip Code regional prefixes. These prefixes are often used to roll up Zip Code data to a regional level, and may be critical to anonymize detailed data and protect PID. Depending on your needs and regulatory requirements, longer zip codes could place PID at risk.', 'Y'), + ('1006', 'Condition_Flag', 'Custom Condition', 'Column values match pre-defined condition', 'Tests that each record in the table matches a pre-defined, custom condition', 'Value(s) found not matching defined condition.', 'Values Failing', NULL, NULL, NULL, NULL, 'threshold_value,custom_query', NULL, 'Threshold Error Count,Custom SQL Expression', 'The number of errors that are acceptable before test fails.|Expression should evaluate to TRUE to register an error or FALSE if no error. 
An expression can reference only columns in the selected table.', 'Fail', 'CAT', 'custom', 'Validity', 'Schema Drift', 'Count of records that don''t meet test condition', 'Custom Condition is a business-rule test for a user-defined error condition based on the value of one or more columns. The condition is applied to each record within the table, and the count of records failing the condition is added up. If that count exceeds a threshold of errors, the test as a whole is failed. This test is ideal for error conditions that TestGen cannot automatically infer, and any condition that involves the values of more than one column in the same record. Performance of this test is fast, since it is performed together with other aggregate tests. Interpretation is based on the user-defined meaning of the test.', 'Y'), + + ('1031', 'Row_Ct', 'Row Count', 'Number of rows is at or above threshold', 'Tests that the count of records has not decreased from the baseline count.', 'Row count less than baseline count.', 'Row count', NULL, 'TEMPLATE', NULL, NULL, 'threshold_value', NULL, 'Threshold Minimum Record Count', NULL, 'Fail', 'CAT', 'table', 'Completeness', 'Volume', 'Expected minimum row count', 'Because this tests the row count against a constant minimum threshold, it''s appropriate for any dataset, as long as the number of rows doesn''t radically change from refresh to refresh. But it''s not responsive to change over time. 
You may want to adjust the threshold periodically if you are dealing with a cumulative dataset.', 'Y'), + ('1032', 'Row_Ct_Pct', 'Row Range', 'Number of rows within percent range of threshold', 'Tests that the count of records is within a percentage above or below the baseline count.', 'Row Count is outside of threshold percent of baseline count.', 'Percent of baseline', 'Row count percent above or below baseline', 'TEMPLATE', NULL, NULL, 'baseline_ct,threshold_value', NULL, 'Baseline Record Count,Threshold Pct Above or Below Baseline', NULL, 'Fail', 'CAT', 'table', 'Completeness', 'Volume', 'Expected percent window below or above baseline', 'This test is better than Row Count for an incremental or windowed dataset where you would expect the row count to range within a percentage of baseline.', 'Y'), + + ('1008', 'CUSTOM', 'Custom Test', 'Custom-defined business rule', 'Custom SQL Query Test', 'Errors were detected according to test definition.', 'Errors found', 'Count of errors identified by query', NULL, 'Test Focus', 'Specify a brief descriptor of the focus of this test that is unique within this Test Suite for the Table and Test Type. This distinguishes this test from others of the same type on the same table. \n\nExample: if you are testing whether product_code is found in the related table called dim_products', 'custom_query', NULL, 'Custom SQL Query Returning Error Records', 'Query should return records indicating one or more errors. The test passes if no records are returned. Results of the query will be shown when you click `Review Source Data` for a failed test, so be sure to include enough data in your results to follow-up. \n\nA query can refer to any tables in the database. 
You must hard-code the schema or use `{DATA_SCHEMA}` to represent the schema defined for the Table Group.', 'Fail', 'QUERY', 'custom', 'Accuracy', 'Data Drift', 'Expected count of errors found by custom query', 'This business-rule test is highly flexible, covering any error state that can be expressed by a SQL query against one or more tables in the database. In operation, the user-defined query is embedded within a parent query returning the count of error rows identified. Any row returned by the query is interpreted as a single error condition in the test. Note that this query is run independently of other tests, and that performance will be slower, depending in large part on the efficiency of the query you write. Interpretation is based on the user-defined meaning of the test. Your query might be written to return errors in individual rows identified by joining tables. Or it might return an error based on a multi-column aggregate condition returning a single row if an error is found. This query is run separately when you click `Review Source Data` from Test Results, so be sure to include enough data in your results to follow-up. Interpretation is based on the user-defined meaning of the test.', 'Y'), + + ('1500', 'Aggregate_Balance', 'Aggregate Balance', 'Aggregate values per group match reference', 'Tests for exact match in aggregate values for each set of column values vs. 
reference dataset', 'Aggregate measure per set of column values does not exactly match reference dataset.', 'Mismatched measures', NULL, NULL, 'Aggregate Expression', 'Specify an aggregate column expression: one of `SUM([column_name])` or `COUNT([column_name])`', 'subset_condition,groupby_names,having_condition,match_schema_name,match_table_name,match_column_names,match_subset_condition,match_groupby_names,match_having_condition', NULL, 'Record Subset Condition,Grouping Columns,Group Subset Condition,Matching Schema Name,Matching Table Name,Matching Aggregate Expression,Matching Record Subset Condition,Matching Grouping Columns,Matching Group Subset Condition', 'Condition defining a subset of records in main table (e.g. WHERE clause) - OPTIONAL|Category columns in main table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in main table (e.g. HAVING clause) - OPTIONAL|Schema location of matching table|Matching table name|Aggregate column expression in matching table: one of `SUM([column_name])` or `COUNT([column_name])`|Condition defining a subset of records in matching table (e.g. WHERE clause) - OPTIONAL|Category columns in matching table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in matching table (e.g. HAVING clause) - OPTIONAL', 'Fail', 'QUERY', 'referential', 'Consistency', 'Data Drift', 'Expected count of group totals not matching aggregate value', 'This test compares sums or counts of a column rolled up to one or more category combinations. It''s ideal for confirming that two datasets exactly match -- that the sum of a measure or count of a value hasn''t changed or shifted between categories. Use this test to compare a raw and processed version of the same dataset, or to confirm that an aggregated table exactly matches the detail table that it''s built from. An error here means that one or more value combinations fail to match. 
New categories or combinations will cause failure.', 'Y'), + ('1501', 'Aggregate_Minimum', 'Aggregate Minimum', 'Aggregate values per group are at or above reference', 'Tests that aggregate values for each set of column values are at least the same as reference dataset', 'Aggregate measure per set of column values is not at least the same as reference dataset.', 'Mismatched measures', NULL, NULL, 'Aggregate Expression', 'Specify an aggregate column expression: one of `SUM([column_name])` or `COUNT([column_name])`', 'subset_condition,groupby_names,having_condition,match_schema_name,match_table_name,match_column_names,match_subset_condition,match_groupby_names,match_having_condition', NULL, 'Record Subset Condition,Grouping Columns,Group Subset Condition,Matching Schema Name,Matching Table Name,Matching Aggregate Expression,Matching Record Subset Condition,Matching Grouping Columns,Matching Group Subset Condition', 'Condition defining a subset of records in main table (e.g. WHERE clause) - OPTIONAL,Category columns in main table separated by commas (e.g. GROUP BY columns),Condition defining a subset of aggregate records in main table (e.g. HAVING clause) - OPTIONAL,Schema location of matching table,Matching table name,Aggregate column expression in matching table (e.g. `SUM(sales)`),Condition defining a subset of records in matching table (e.g. WHERE clause) - OPTIONAL,Category columns in matching table separated by commas (e.g. GROUP BY columns),Condition defining a subset of aggregate records in matching table (e.g. HAVING clause) - OPTIONAL', 'Fail', 'QUERY', 'referential', 'Accuracy', 'Data Drift', 'Expected count of group totals below aggregate value', 'This test compares sums or counts of a column rolled up to one or more category combinations, but requires a match or increase in the aggregate value, rather than an exact match. Use it to confirm that aggregate values have not dropped for any set of categories, even if some values may rise. 
This test is useful to compare an older and newer version of a cumulative dataset. An error here means that one or more values per category set fail to match or exceed the prior dataset. New categories or combinations are allowed (but can be restricted independently with a Combo_Match test). Both tables must be present to run this test.', 'Y'), + ('1502', 'Combo_Match', 'Combo Match', 'Column values or combinations found in reference', 'Tests for the presence of one or a set of column values in a reference table', 'Column value combinations are not found in reference table values.', 'Missing values', NULL, NULL, 'Categorical Column List', 'Specify one or more Categorical columns, separated by commas. \n\nDo not use continuous mesurements here. Do not use numeric values unless they represent discrete categories.', 'subset_condition,having_condition,match_schema_name,match_table_name,match_groupby_names,match_subset_condition,match_having_condition', NULL, 'Record Subset Condition,Group Subset Condition,Reference Schema Name,Reference Table Name,Matching Columns,Matching Record Subset Condition,Matching Group Subset Condition', 'Condition defining a subset of records in main table to evaluate - OPTIONAL|Condition based on aggregate expression used to exclude value combinations in source table (e.g. `SUM(sales) < 100`) - OPTIONAL|Schema location of matching table|Matching table name|Column Names in reference table used to validate source table values (separated by commas)|Condition defining a subset of records in reference table to match against - OPTIONAL|Condition based on aggregate expression used to exclude value combinations in reference table (e.g. `SUM(sales) < 100`) - OPTIONAL', 'Fail', 'QUERY', 'referential', 'Validity', 'Schema Drift', 'Expected count of non-matching value combinations', 'This test verifies that values, or combinations of values, that are present in the main table are also found in a reference table. 
This is a useful test for referential integrity between fact and dimension tables. You can also use it to confirm the validity of combinations of values found together within each record, such as bottle / pint / milk and carton / dozen / eggs. An error here means that one or more value combinations in the main table are not found in the reference table. Both tables must be present to run this test.', 'Y'), + ('1503', 'Distribution_Shift', 'Distribution Shift', 'Probability distribution consistent with reference', 'Tests the closeness of match between two distributions of aggregate measures across combinations of column values, using Jensen-Shannon Divergence test', 'Divergence between two distributions exceeds specified threshold.', 'Divergence level (0-1)', 'Jensen-Shannon Divergence, from 0 (identical distributions), to 1.0 (max divergence)', NULL, 'Categorical Column List', 'Specify one or more Categorical columns, separated by commas. \n\nDo not use continuous mesurements here. Do not use numeric values unless they represent discrete categories.', 'subset_condition,match_schema_name,match_table_name,match_groupby_names,match_subset_condition', NULL, 'Record Subset Condition,Reference Schema Name,Reference Table Name,Matching Columns to Compare,Matching Record Subset Condition', 'Condition defining a subset of records in main table to evaluate - OPTIONAL|Schema location of matching table|Matching table name|Column Names in reference table used to compare counts with source table values (separated by commas)|Condition defining a subset of records in reference table to match against - OPTIONAL', 'Warning', 'QUERY', 'referential', 'Consistency', 'Data Drift', 'Expected maximum divergence level between 0 and 1', 'This test measures the similarity of two sets of counts per categories, by using their proportional counts as probability distributions. 
Using Jensen-Shannon divergence, a measure of relative entropy or difference between two distributions, the test assigns a score ranging from 0, meaning that the distributions are identical, to 1, meaning that the distributions are completely unrelated. This test can be used to compare datasets that may not match exactly, but should have similar distributions. For example, it is a useful sanity check for data from different sources that you would expect to have a consistent spread, such as shipment of building materials per state and construction projects by state. Scores can be compared over time even if the distributions are not identical -- a dataset can be expected to maintain a comparable divergence score with a reference dataset over time. Both tables must be present to run this test.', 'Y'), + ('1508', 'Timeframe_Combo_Gain', 'Timeframe Keep', 'Latest timeframe has at least all value combinations from prior period', 'Tests that column values in most recent time-window include at least same as prior time window', 'Column values in most recent time-window don''t include all values in prior window.', 'Mismatched values', NULL, NULL, 'Categorical Column List', 'Specify one or more Categorical columns, separated by commas. Make sure not to use continuous mesurements here. Do not use numeric values unless they represent discrete categories.', 'window_date_column,window_days,subset_condition', NULL, 'Date Column for Time Windows,Time Window in Days,Record Subset Condition', 'The date column used to define the time windows. This must be a DATE or DATETIME type.|Length in days of the time window. 
The test will compare the most recent period of days to the prior period of the same duration.|Condition defining a subset of records in main table to evaluate - OPTIONAL', 'Fail', 'QUERY', 'referential', 'Consistency', 'Data Drift', 'Expected count of missing value combinations', NULL, 'Y'), + ('1509', 'Timeframe_Combo_Match', 'Timeframe Match', 'Column value combinations from latest timeframe same as prior period', 'Tests for presence of same column values in most recent time-window vs. prior time window', 'Column values don''t match in most recent time-windows.', 'Mismatched values', NULL, NULL, 'Categorical Column List', 'Specify one or more Categorical columns, separated by commas. \n\nDo not use continuous mesurements here. Do not use numeric values unless they represent discrete categories.', 'window_date_column,window_days,subset_condition', NULL, 'Date Column for Time Windows,Time Window in Days,Record Subset Condition', NULL, 'Fail', 'QUERY', 'referential', 'Consistency', 'Data Drift', 'Expected count of non-matching value combinations', NULL, 'Y'), + + ('1504', 'Aggregate_Pct_Above', 'Aggregate Pct Above', 'Aggregate values per group exceed reference', 'Tests that aggregate values for each set of column values exceed values for reference dataset', 'Aggregate measure per set of column values fails to exceed the reference dataset.', 'Mismatched measures', NULL, NULL, 'Aggregate Expression', 'Specify an aggregate column expression: one of `SUM([column_name])` or `COUNT([column_name])`', 'subset_condition,groupby_names,having_condition,match_column_names,match_schema_name,match_table_name,match_subset_condition,match_groupby_names,match_having_condition', NULL, 'TODO Fill in default_parm_prompts match_schema_name,TODO Fill in default_parm_prompts match_table_name,TODO Fill in default_parm_prompts match_column_names,TODO Fill in default_parm_prompts match_subset_condition,TODO Fill in default_parm_prompts match_groupby_names,TODO Fill in default_parm_prompts 
match_having_condition,TODO Fill in default_parm_prompts subset_condition,TODO Fill in default_parm_prompts groupby_names,TODO Fill in default_parm_prompts having_condition', NULL, 'Fail', 'QUERY', 'referential', 'Accuracy', 'Data Drift', 'Expected count of group totals with not exceeding aggregate measure', NULL, 'N'), + ('1505', 'Aggregate_Pct_Within', 'Aggregate Pct Within', 'Aggregate values per group exceed reference', 'Tests that aggregate values for each set of column values exceed values for reference dataset', 'Aggregate measure per set of column values fails to exceed the reference dataset.', 'Mismatched measures', NULL, NULL, 'Aggregate Expression', 'Specify an aggregate column expression: one of `SUM([column_name])` or `COUNT([column_name])`', 'subset_condition,groupby_names,having_condition,match_column_names,match_schema_name,match_table_name,match_subset_condition,match_groupby_names,match_having_condition', NULL, 'TODO Fill in default_parm_prompts match_schema_name,TODO Fill in default_parm_prompts match_table_name,TODO Fill in default_parm_prompts match_column_names,TODO Fill in default_parm_prompts match_subset_condition,TODO Fill in default_parm_prompts match_groupby_names,TODO Fill in default_parm_prompts match_having_condition,TODO Fill in default_parm_prompts subset_condition,TODO Fill in default_parm_prompts groupby_names,TODO Fill in default_parm_prompts having_condition', NULL, 'Fail', 'QUERY', 'referential', 'Accuracy', 'Data Drift', 'Expected count of group totals with not exceeding aggregate measure', NULL, 'N'), + ('1506', 'Aggregate_Increase', 'Aggregate Increase', 'Aggregate values per group exceed reference', 'Tests that aggregate values for each set of column values exceed values for reference dataset', 'Aggregate measure per set of column values fails to exceed the reference dataset.', 'Mismatched measures', NULL, NULL, 'Aggregate Expression', 'Specify an aggregate column expression: one of `SUM([column_name])` or 
`COUNT([column_name])`', 'subset_condition,groupby_names,having_condition,match_column_names,match_schema_name,match_table_name,match_subset_condition,match_groupby_names,match_having_condition', NULL, 'TODO Fill in default_parm_prompts match_schema_name,TODO Fill in default_parm_prompts match_table_name,TODO Fill in default_parm_prompts match_column_names,TODO Fill in default_parm_prompts match_subset_condition,TODO Fill in default_parm_prompts match_groupby_names,TODO Fill in default_parm_prompts match_having_condition,TODO Fill in default_parm_prompts subset_condition,TODO Fill in default_parm_prompts groupby_names,TODO Fill in default_parm_prompts having_condition', NULL, 'Fail', 'QUERY', 'referential', 'Accuracy', 'Data Drift', 'Expected count of group totals below reference value', NULL, 'N') +; -A query can refer to any tables in the database. You must hard-code the schema or use `{DATA_SCHEMA}` to represent the schema defined for the Table Group.', 'Fail', 'QUERY', 'custom', 'Accuracy', 'Data Drift', 'Expected count of errors found by custom query', 'This business-rule test is highly flexible, covering any error state that can be expressed by a SQL query against one or more tables in the database. In operation, the user-defined query is embedded within a parent query returning the count of error rows identified. Any row returned by the query is interpreted as a single error condition in the test. Note that this query is run independently of other tests, and that performance will be slower, depending in large part on the efficiency of the query you write. Interpretation is based on the user-defined meaning of the test. Your query might be written to return errors in individual rows identified by joining tables. Or it might return an error based on a multi-column aggregate condition returning a single row if an error is found. 
This query is run separately when you click `Review Source Data` from Test Results, so be sure to include enough data in your results to follow-up. Interpretation is based on the user-defined meaning of the test.', 'Y'), - ('1009', 'Daily_Record_Ct', 'Daily Records', 'All dates present within date range', 'Tests for presence of every calendar date within min/max date range, per baseline data', 'Not every date value between min and max dates is present, unlike at baseline.', 'Missing dates', NULL, 'general_type= ''D'' AND date_days_present > 21 AND date_days_present - (DATEDIFF(''day'', ''1800-01-05''::DATE, max_date) - DATEDIFF(''day'', ''1800-01-05''::DATE, min_date) + 1) = 0 AND future_date_ct::FLOAT / NULLIF(value_ct, 0) <= 0.75', 'threshold_value', '0', 'Threshold Missing Calendar Days', NULL, 'Warning', 'CAT', 'column', 'Completeness', 'Volume', 'Missing calendar days within min/max range', 'Daily Records tests that at least one record is present for every day within the minimum and maximum date range for the column. The test is relevant for transactional data, where you would expect at least one transaction to be recorded each day. A failure here would suggest missing records for the number of days identified without data. You can adjust the threshold to accept a number of days that you know legitimately have no records. 
', 'Y'), - ('1010', 'DATA MATCH', 'Combo Match', 'Column value combinations match reference', 'Tests for the presence of the same set of column values in a reference table', 'Column values don''t match reference table values.', 'Mismatched values', NULL, NULL, 'subset_condition,match_column_names,match_schema_name,match_table_name,match_subset_condition', NULL, 'Record Subset Condition,Column Names to Match,Schema Name,Table Name,Match Schema Name,Match Table Name,Match Table Record Subset Condition', NULL, 'Fail', 'QUERY', 'multi-column', 'Validity', 'Schema Drift', 'Expected count of non-matching value combinations', NULL, 'N'), - ('1011', 'Dec_Trunc', 'Decimal Truncation', 'Sum of fractional values at or above reference', 'Tests for decimal truncation by confirming that the sum of fractional values in data is no less than the sum at baseline', 'The sum of fractional values is under baseline, which may indicate decimal truncation', 'Fractional sum', 'The sum of all decimal values from all data for this column', 'fractional_sum IS NOT NULL AND functional_table_type LIKE''%cumulative%''', 'threshold_value', 'ROUND(fractional_sum, 0)', 'Sum of Fractional Values at Baseline', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Minimum expected sum of all fractional values', 'Decimal Truncation tests that the fractional (decimal) part of a numeric column has not been truncated since Baseline. This works by summing all the fractional values after the decimal point and confirming that the total is at least equal to the fractional total at baseline. This could indicate a problem in a cumulative dataset, where prior values should still exist unchanged. A failure here would suggest that some process changed data that you would still expect to be present and matching its value when the column was profiled. 
This test would not be appropriate for an incremental or windowed dataset.', 'Y'), - ('1012', 'Distinct_Date_Ct', 'Date Count', 'Count of distinct dates at or above reference', 'Tests that the count of distinct dates referenced in the column has not dropped vs. baseline data', 'Drop in count of unique dates recorded in column.', 'Unique dates', 'Count of unique dates in transactional date column', 'general_type=''D'' and date_days_present IS NOT NULL AND functional_table_type NOT LIKE ''%window%''', 'baseline_value,threshold_value', 'date_days_present,date_days_present', 'Distinct Date Count at Baseline,Min Expected Date Count', NULL, 'Fail', 'CAT', 'column', 'Timeliness', 'Recency', 'Minimum distinct date count expected', 'Date Count tests that the count of distinct dates present in the column has not dropped since baseline. The test is relevant for cumulative datasets, where old records are retained. A failure here would indicate missing records, which could be caused by a processing error or changed upstream data sources.', 'Y'), - ('1013', 'Distinct_Value_Ct', 'Value Count', 'Count of distinct values has not dropped', 'Tests that the count of unique values in the column has not changed from baseline.', 'Count of unique values in column has changed from baseline.', 'Unique Values', NULL, 'distinct_value_ct between 2 and 10 AND value_ct > 0 AND NOT (coalesce(top_freq_values,'''') > '''' AND distinct_value_ct BETWEEN 2 and 10) AND NOT (lower(functional_data_type) LIKE ''%sequence%'' OR lower(functional_data_type) LIKE ''%measurement%'' OR functional_data_type LIKE ''%date%'' OR general_type = ''D'')', 'baseline_value_ct,threshold_value', 'distinct_value_ct,distinct_value_ct', 'Distinct Value Count at Baseline,Min Expected Value Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected distinct value count', 'Value Count tests that the count of unique values present in the column has not dropped since baseline. 
The test is relevant for cumulative datasets, where old records are retained, or for any dataset where you would expect a set number of distinct values should be present. A failure here would indicate missing records or a change in categories or value assignment.', 'Y'), - ('1014', 'Email_Format', 'Email Format', 'Email is correctly formatted', 'Tests that non-blank, non-empty email addresses match the standard format', 'Invalid email address formats found.', 'Invalid emails', 'Number of emails that do not match standard format', 'std_pattern_match=''EMAIL''', 'threshold_value', '0', 'Maximum Invalid Email Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected count of invalid email addresses', NULL, 'Y'), - ('1015', 'Future_Date', 'Past Dates', 'Latest date is prior to test run date', 'Tests that the maximum date referenced in the column is no greater than the test date, consistent with baseline data', 'Future date found when absent in baseline data.', 'Future dates', NULL, 'general_type=''D''AND future_date_ct = 0', 'threshold_value', '0', 'Maximum Future Date Count', NULL, 'Fail', 'CAT', 'column', 'Timeliness', 'Recency', 'Expected count of future dates', NULL, 'Y'), - ('1016', 'Future_Date_1Y', 'Future Year', 'Future dates within year of test run date', 'Tests that the maximum date referenced in the column is no greater than one year beyond the test date, consistent with baseline data', 'Future date beyond one-year found when absent in baseline.', 'Future dates post 1 year', NULL, 'general_type=''D''AND future_date_ct > 0 AND max_date <=''{AS_OF_DATE}''::DATE + INTERVAL''365 DAYS''', 'threshold_value', '0', 'Maximum Post 1-Year Future Date Count', NULL, 'Fail', 'CAT', 'column', 'Timeliness', 'Recency', 'Expected count of future dates beyond one year', 'Future Year looks for date values in the column that extend beyond one year after the test date. 
This would be appropriate for transactional dates where you would expect to find dates in the near future, but not beyond one year ahead. Errors could indicate invalid entries or possibly dummy dates representing blank values.', 'Y'), - ('1017', 'Incr_Avg_Shift', 'New Shift', 'New record mean is consistent with reference', 'Tests for statistically-significant shift in mean of new values for column compared to average calculated at baseline.', 'Significant shift in average of new values vs. baseline avg', 'Z-score of mean shift', 'Absolute Z-score (number of SD''s outside mean) of prior avg - incremental avg', 'general_type=''N'' AND distinct_value_ct > 10 AND functional_data_type ilike ''Measure%'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%''', 'baseline_value_ct,baseline_sum,baseline_avg,baseline_sd,threshold_value', 'value_ct,(avg_value * value_ct)::FLOAT,avg_value,stdev_value,2', 'Value Count at Baseline,Sum at Baseline,Mean Value at Baseline,Std Deviation at Baseline,Threshold Max Z-Score', NULL, 'Warning', 'CAT', 'column', 'Accuracy', 'Data Drift', 'Maximum Z-Score (number of SD''s beyond mean) expected', 'This is a more sensitive test than Average Shift, because it calculates an incremental difference in the average of new values compared to the average of values at baseline. This is appropriate for a cumulative dataset only, because it calculates the average of new entries based on the assumption that the count and average of records present at baseline are still present at the time of the test. This test compares the mean of new values with the standard deviation of the baseline average to calculate a Z-score. If the new mean falls outside the Z-score threshold, a shift is detected. Potential Z-score thresholds may range from 0 to 3, depending on the sensitivity you prefer. A failed test could indicate a quality issue or a legitimate shift in new data that should be noted and assessed by business users. 
Consider this test along with Variability Increase. If variability rises too, process, methodology or measurement flaws could be at issue. If variability remains consistent, the problem is more likely to be with the source data itself.', 'Y'), - ('1018', 'LOV_All', 'All Values', 'List of expected values all present in column', 'Tests that all values match a pipe-delimited list of expected values and that all expected values are present', 'Column values found don''t exactly match the expected list of values', 'Values found', NULL, NULL, 'threshold_value', NULL, 'List of Expected Values', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'List of values expected, in form (''Val1'',''Val2)', 'This is a more restrictive form of Value Match, testing that all values in the dataset match the list provided, and also that all values present in the list appear at least once in the dataset. This would be appropriate for tables where all category values in the column are represented at least once.', 'Y'), - ('1019', 'LOV_Match', 'Value Match', 'All column values present in expected list', 'Tests that all values in the column match the list-of-values identified in baseline data.', 'Values not matching expected List-of-Values from baseline.', 'Non-matching records', NULL, 'top_freq_values > '''' AND distinct_value_ct BETWEEN 2 and 10 AND NOT (functional_data_type LIKE ''%date%'' OR lower(datatype_suggestion) LIKE ''%date%'' OR general_type = ''D'' OR lower(column_name) IN (''file_name'', ''filename''))', 'baseline_value,threshold_value', '''('' || SUBSTRING( CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 2) > '''' THEN '','''''' || TRIM( REPLACE ( SPLIT_PART(top_freq_values, ''|'' , 2), '''''''' , '''''''''''' ) ) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 4) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 4), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 6) 
> '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 6), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 8) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 8), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 10) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 10), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 12) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 12), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 14) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 14), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 16) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 16), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 18) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 18), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 20) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 20), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END, 2, 999) || '')'',0', 'List of Expected Values,Threshold Error Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'List of values expected, in form (''Val1'',''Val2)', 'This tests that all values in the column match the hard-coded list provided. This is relevant when the list of allowable values is small and not expected to change often. 
Even if new values might occasionally be added, this test is useful for downstream data products to provide warning that assumptions and logic may need to change.', 'Y'), - ('1020', 'Min_Date', 'Minimum Date', 'All dates on or after set minimum', 'Tests that the earliest date referenced in the column is no earlier than baseline data', 'The earliest date value found is before the earliest value at baseline.', 'Dates prior to limit', NULL, 'general_type=''D''and min_date IS NOT NULL AND distinct_value_ct > 1', 'baseline_value,threshold_value', 'min_date,0', 'Minimum Date at Baseline,Threshold Error Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected count of dates prior to minimum', 'This test is appropriate for a cumulative dataset only, because it assumes all prior values are still present. It''s appropriate where new records are added with more recent dates, but old dates dates do not change.', 'Y'), - ('1021', 'Min_Val', 'Minimum Value', 'All values at or above set minimum', 'Tests that the minimum value present in the column is no lower than the minimum value in baseline data', 'Minimum column value less than baseline.', 'Values under limit', NULL, 'general_type=''N''and min_value IS NOT NULL AND (distinct_value_ct >= 2 OR (distinct_value_ct=2 and min_value<>0 and max_value<>1))', 'baseline_value,threshold_value', 'min_value,0', 'Minimum Value at Baseline,Threshold Error Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected count of values under limit', 'This test is appropriate for a cumulative dataset only, assuming all prior values are still present. It is also appropriate for any measure that has an absolute, definable minimum value, or a heuristic that makes senes for valid data.', 'Y'), - ('1022', 'Missing_Pct', 'Percent Missing', 'Consistent ratio of missing values', 'Tests for statistically-significant shift in percentage of missing values in column vs. 
baseline data', 'Significant shift in percent of missing values vs. baseline.', 'Difference measure', 'Cohen''s H Difference (0.20 small, 0.5 mod, 0.8 large, 1.2 very large, 2.0 huge)', 'record_ct <> value_ct', 'baseline_ct,baseline_value_ct,threshold_value', 'record_ct,value_ct,2::VARCHAR(10)', 'Baseline Record Count,Baseline Value Count,Standardized Difference Measure', NULL, 'Warning', 'CAT', 'column', 'Completeness', 'Data Drift', 'Expected maximum Cohen''s H Difference', 'This test uses Cohen''s H, a statistical test to identify a significant difference between two ratios. Results are reported on a standardized scale, which can be interpreted via a rule-of-thumb from small to huge. An uptick in missing data may indicate a collection issue at the source. A larger change may indicate a processing failure. A drop in missing data may also be significant, if it affects assumptions built into analytic products downstream. You can refine the expected threshold value as you view legitimate results of the measure over time.', 'Y'), - ('1023', 'Monthly_Rec_Ct', 'Monthly Records', 'At least one date per month present within date range', 'Tests for presence of at least one date per calendar month within min/max date range, per baseline data', 'At least one date per month expected in min/max date range.', 'Missing months', 'Calendar months without date values present', 'general_type= ''D'' AND date_days_present > 1 AND date_months_present > 2 AND date_months_present - (datediff( ''MON'' , min_date, max_date) + 1) = 0 AND future_date_ct::FLOAT / NULLIF(value_ct, 0) <= 0.75', 'threshold_value', '0', 'Threshold Count of Months without Dates', NULL, 'Fail', 'CAT', 'column', 'Completeness', 'Volume', 'Expected maximum count of calendar months without dates present', 'Monthly Records tests that at least one record is present for every calendar month within the minimum and maximum date range for the column. 
The test is relevant for transactional data, where you would expect at least one transaction to be recorded each month. A failure here would suggest missing records for the number of months identified without data. You can adjust the threshold to accept a number of month that you know legitimately have no records.', 'Y'), - ('1024', 'Outlier_Pct_Above', 'Outliers Above', 'Consistent outlier counts over 2 SD above mean', 'Tests that percent of outliers over 2 SD above Mean doesn''t exceed threshold', 'Percent of outliers exceeding 2 SD above the mean is greater than expected threshold.', 'Pct records over limit', NULL, 'general_type = ''N'' AND functional_data_type ilike ''Measure%'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%'' AND value_ct <> distinct_value_ct AND distinct_value_ct > 10 AND stdev_value > 0 AND avg_value IS NOT NULL AND NOT (distinct_value_ct = max_value - min_value + 1 AND distinct_value_ct > 2)', 'baseline_avg,baseline_sd,threshold_value', 'avg_value,stdev_value,0.05', 'Baseline Mean, Baseline Std Deviation, Pct Records over 2 SD', NULL, 'Warning', 'CAT', 'column', 'Accuracy', 'Data Drift', 'Expected maximum pct records over upper 2 SD limit', 'This test counts the number of data points that may be considered as outliers, determined by whether their value exceeds 2 standard deviations above the mean at baseline. Assuming a normal distribution, a small percentage (defaulted to 5%) of outliers is expected. The actual number may vary for different distributions. The expected threshold reflects the maximum percentage of outliers you expect to see. This test uses the baseline mean rather than the mean for the latest dataset to capture systemic shift as well as individual outliers. 
', 'Y'), - ('1025', 'Outlier_Pct_Below', 'Outliers Below', 'Consistent outlier counts under 2 SD below mean', 'Tests that percent of outliers over 2 SD below Mean doesn''t exceed threshold', 'Percent of outliers exceeding 2 SD below the mean is greater than expected threshold.', 'Pct records under limit', NULL, 'general_type = ''N'' AND functional_data_type ilike ''Measure%'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%'' AND value_ct <> distinct_value_ct AND distinct_value_ct > 10 AND stdev_value > 0 AND avg_value IS NOT NULL AND NOT (distinct_value_ct = max_value - min_value + 1 AND distinct_value_ct > 2)', 'baseline_avg,baseline_sd,threshold_value', 'avg_value,stdev_value,0.05', 'Baseline Mean, Baseline Std Deviation, Pct Records over 2 SD', NULL, 'Warning', 'CAT', 'column', 'Accuracy', 'Data Drift', 'Expected maximum pct records over lower 2 SD limit', 'This test counts the number of data points that may be considered as outliers, determined by whether their value exceeds 2 standard deviations below the mean at baseline. Assuming a normal distribution, a small percentage (defaulted to 5%) of outliers is expected. The actual number may vary for different distributions. The expected threshold reflects the maximum percentage of outliers you expect to see. This test uses the baseline mean rather than the mean for the latest dataset to capture systemic shift as well as individual outliers. 
', 'Y'), - ('1026', 'Pattern_Match', 'Pattern Match', 'Column values match alpha-numeric pattern', 'Tests that all values in the column match the same alpha-numeric pattern identified in baseline data', 'Alpha values do not match consistent pattern in baseline.', 'Pattern Mismatches', NULL, 'fn_charcount(top_patterns, E'' \| '' ) = 1 AND REPLACE(SPLIT_PART(top_patterns, ''|'' , 2), ''N'' , '''' ) > '''' AND distinct_value_ct > 10', 'baseline_value,threshold_value', 'trim(REPLACE(REPLACE(REPLACE(SPLIT_PART(top_patterns, '' | '', 2), ''A'', ''[A-Z]''), ''N'', ''[0-9]''), ''a'', ''[a-z]'')),0', 'Pattern at Baseline,Threshold Error Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected count of pattern mismatches', 'This test is appropriate for character fields that are expected to appear in a consistent format. It uses pattern matching syntax as appropriate for your database: REGEX matching if available, otherwise LIKE expressions. The expected threshold is the number of records that fail to match the defined pattern.', 'Y'), - ('1027', 'PRIOR MATCH', 'Prior Match', 'Column value combinations match prior reference', 'Tests that the same set of column values are present in the current dataset as a different, prior schema.', 'Column values don''t match prior schema values.', 'Mismatched values', NULL, NULL, 'subset_condition,match_schema_name', NULL, 'TODO Fill in default_parm_prompts', NULL, 'Fail', 'QUERY', 'multi-column', 'Consistency', 'Data Drift', 'Expected count of non-matching value combinations', NULL, 'N'), - ('1028', 'Recency', 'Recency', 'Latest date within expected range of test date', 'Tests that the latest date in column is within a set number of days of the test date', 'Most recent date value not within expected days of test date.', 'Days before test', 'Number of days that most recent date precedes the date of test', 'general_type= ''D'' AND max_date <= run_date AND NOT column_name IN ( ''filedate'' , ''file_date'' ) AND NOT 
functional_data_type IN (''Future Date'', ''Schedule Date'') AND DATEDIFF( ''DAY'' , max_date, run_date) <= 62', 'threshold_value', 'CASE WHEN DATEDIFF( ''DAY'' , max_date, run_date) <= 3 THEN DATEDIFF(''DAY'', max_date, run_date) + 3 WHEN DATEDIFF(''DAY'', max_date, run_date) <= 7 then DATEDIFF(''DAY'', max_date, run_date) + 7 WHEN DATEDIFF( ''DAY'' , max_date, run_date) <= 31 THEN CEILING( DATEDIFF( ''DAY'' , max_date, run_date)::FLOAT / 7.0) * 7 WHEN DATEDIFF( ''DAY'' , max_date, run_date) > 31 THEN CEILING( DATEDIFF( ''DAY'' , max_date, run_date)::FLOAT / 30.0) * 30 END', 'Threshold Maximum Days before Test', NULL, 'Warning', 'CAT', 'column', 'Timeliness', 'Recency', 'Expected maximum count of days preceding test date', 'This test evaluates recency based on the latest referenced dates in the column. The test is appropriate for transactional dates and timestamps. The test can be especially valuable because timely data deliveries themselves may not assure that the most recent data is present. You can adjust the expected threshold to the maximum number of days that you expect the data to age before the dataset is refreshed. 
', 'Y'), - ('1029', 'RELATIVE ENTROPY', 'Distribution Shift', 'Probability distribution consistent with reference', 'Tests the closeness of match between two distributions of aggregate measures across combinations of column values, using Jensen-Shannon Divergence test', 'Divergence between two distributions exceeds specified threshold.', 'Divergence level (0-1)', 'Jensen-Shannon Divergence, from 0 (identical distributions), to 1.0 (max divergence)', NULL, 'subset_condition,groupby_names,match_schema_name,match_subset_condition,match_groupby_names,threshold_value', NULL, 'Standardized Divergence Measure (0 to 1)', NULL, 'Warning', 'QUERY', 'multi-column', 'Consistency', 'Data Drift', 'Expected maximum divergence level between 0 and 1', NULL, 'N'), - ('1030', 'Required', 'Required Entry', 'Required non-null value present', 'Tests that a non-null value is present in each record for the column, consistent with baseline data', 'Every record for this column is expected to be filled, but some are missing.', 'Missing values', NULL, 'record_ct = value_ct', 'threshold_value', '0', 'Threshold Missing Value Count', NULL, 'Fail', 'CAT', 'column', 'Completeness', 'Schema Drift', 'Expected count of missing values', NULL, 'Y'), - ('1031', 'Row_Ct', 'Row Count', 'Number of rows is at or above threshold', 'Tests that the count of records has not decreased from the baseline count.', 'Row count less than baseline count.', 'Row count', NULL, NULL, 'threshold_value', NULL, 'Threshold Minimum Record Count', NULL, 'Fail', 'CAT', 'table', 'Completeness', 'Volume', 'Expected minimum row count', 'Because this tests the row count against a constant minimum threshold, it''s appropriate for any dataset, as long as the number of rows doesn''t radically change from refresh to refresh. But it''s not responsive to change over time. 
You may want to adjust the threshold periodically if you are dealing with a cumulative dataset.', 'Y'), - ('1032', 'Row_Ct_Pct', 'Row Range', 'Number of rows within percent range of threshold', 'Tests that the count of records is within a percentage above or below the baseline count.', 'Row Count is outside of threshold percent of baseline count.', 'Percent of baseline', 'Row count percent above or below baseline', NULL, 'baseline_ct,threshold_value', NULL, 'Baseline Record Count,Threshold Pct Above or Below Baseline', NULL, 'Fail', 'CAT', 'table', 'Completeness', 'Volume', 'Expected percent window below or above baseline', 'This test is better than Row Count for an incremental or windowed dataset where you would expect the row count to range within a percentage of baseline.', 'Y'), - ('1033', 'Street_Addr_Pattern', 'Street Address', 'Enough street address entries match defined pattern', 'Tests for percent of records matching standard street address pattern.', 'Percent of values matching standard street address format is under expected threshold.', 'Percent matches', 'Percent of records that match street address pattern', '(std_pattern_match=''STREET_ADDR'') AND (avg_length <> round(avg_length)) AND (avg_embedded_spaces BETWEEN 2 AND 6) AND (avg_length < 35)', 'threshold_value', '75', 'Threshold Pct that Match Address Pattern', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected percent of records that match standard street address pattern', 'The street address pattern used in this test should match the vast majority of USA addresses. 
You can adjust the threshold percent of matches based on the results you are getting -- you may well want to tighten it to make the test more sensitive to invalid entries.', 'Y'), - ('1034', 'Unique', 'Unique Values', 'Each column value is unique', 'Tests that no values for the column are repeated in multiple records.', 'Column values should be unique per row.', 'Duplicate values', 'Count of non-unique values', 'record_ct > 500 and record_ct = distinct_value_ct and value_ct > 0', 'threshold_value', '0', 'Threshold Duplicate Value Count', NULL, 'Fail', 'CAT', 'column', 'Uniqueness', 'Schema Drift', 'Expected count of duplicate values', 'This test is ideal when the database itself does not enforce a primary key constraint on the table. It serves as an independent check on uniqueness. If''s also useful when there are a small number of exceptions to uniqueness, which can be reflected in the expected threshold count of duplicates.', 'Y'), - ('1035', 'Unique_Pct', 'Percent Unique', 'Consistent ratio of unique values', 'Tests for statistically-significant shift in percentage of unique values vs. baseline data.', 'Significant shift in percent of unique values vs. baseline.', 'Difference measure', 'Cohen''s H Difference (0.20 small, 0.5 mod, 0.8 large, 1.2 very large, 2.0 huge)', 'distinct_value_ct > 10', 'baseline_value_ct,baseline_unique_ct,threshold_value', 'value_ct,distinct_value_ct,0.5', 'Value Count at Baseline,Distinct Value Count at Baseline,Standardized Difference Measure (0 to 1)', NULL, 'Warning', 'CAT', 'column', 'Uniqueness', 'Data Drift', 'Expected maximum Cohen''s H Difference', 'You can think of this as a test of similarity that measures whether the percentage of unique values is consistent with the percentage at baseline. A significant change might indicate duplication or a telling shift in cardinality between entities. The test uses Cohen''s H, a statistical test to identify a significant difference between two ratios. 
Results are reported on a standardized scale, which can be interpreted via a rule-of-thumb from small to huge. You can refine the expected threshold value as you view legitimate results of the measure over time.', 'Y'), - ('1036', 'US_State', 'US State', 'Column value is two-letter US state code', 'Tests that the recorded column value is a valid US state.', 'Column Value is not a valid US state.', 'Not US States', 'Values that doo not match 2-character US state abbreviations.', 'general_type= ''A'' AND column_name ILIKE ''%state%'' AND distinct_value_ct < 70 AND max_length = 2', 'threshold_value', '0', 'Threshold Count not Matching State Abbreviations', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected count of values that are not US state abbreviations', 'This test validates entries against a fixed list of two-character US state codes and related Armed Forces codes.', 'Y'), - ('1037', 'Weekly_Rec_Ct', 'Weekly Records', 'At least one date per week present within date range', 'Tests for presence of at least one date per calendar week within min/max date range, per baseline data', 'At least one date per week expected in min/max date range.', 'Missing weeks', 'Calendar weeks without date values present', 'general_type= ''D'' AND date_days_present > 1 AND date_weeks_present > 3 AND date_weeks_present - (DATEDIFF(''week'', ''1800-01-05''::DATE, max_date) - DATEDIFF(''week'', ''1800-01-05''::DATE, min_date) + 1) = 0 AND future_date_ct::FLOAT / NULLIF(value_ct, 0) <= 0.75', 'threshold_value', '0', 'Threshold Weeks without Dates', NULL, 'Fail', 'CAT', 'column', 'Completeness', 'Volume', 'Expected maximum count of calendar weeks without dates present', 'Weekly Records tests that at least one record is present for every calendar week within the minimum and maximum date range for the column. The test is relevant for transactional data, where you would expect at least one transaction to be recorded each week. 
A failure here would suggest missing records for the number of weeks identified without data. You can adjust the threshold to accept a number of weeks that you know legitimately have no records.', 'Y'), - ('1038', 'WINDOW MATCH NO DROPS', 'Timeframe Minimum', 'Latest timeframe includes all values in prior timeframe', 'Tests that column values in most recent time-window include at least same as prior time window', 'Column values in most recent time-window don''t include all values in prior window.', 'Mismatched values', NULL, NULL, 'window_date_column,window_days,subset_condition', NULL, 'Date Column for Time Windows,Time Window in Days,Record Subset Condition', NULL, 'Fail', 'QUERY', 'multi-column', 'Consistency', 'Data Drift', 'Expected count of missing value combinations', NULL, 'N'), - ('1039', 'WINDOW MATCH SAME', 'Timeframe Match', 'Values in latest timeframe all found in prior timeframe', 'Tests for presence of same column values in most recent time-window vs. prior time window', 'Column values don''t match in most recent time-windows.', 'Mismatched values', NULL, NULL, 'window_date_column,window_days,subset_condition', NULL, 'Date Column for Time Windows,Time Window in Days,Record Subset Condition', NULL, 'Fail', 'QUERY', 'multi-column', 'Consistency', 'Data Drift', 'Expected count of non-matching value combinations', NULL, 'N'), - ('1040', 'Variability_Increase', 'Variability Increase', 'Variability has increased above threshold', 'Tests that the spread or dispersion of column values has increased significantly over baseline, indicating a drop in stability of the measure.', 'The Standard Deviation of the measure has increased beyond the defined threshold. 
This could signal a change in a process or a data quality issue.', 'Pct SD shift', 'Percent of baseline Standard Deviation', 'general_type = ''N'' AND functional_data_type ilike ''Measure%'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%'' AND value_ct <> distinct_value_ct AND distinct_value_ct > 10 AND stdev_value > 0 AND avg_value IS NOT NULL AND NOT (distinct_value_ct = max_value - min_value + 1 AND distinct_value_ct > 2)', 'baseline_sd,threshold_value', 'stdev_value,120', 'Std Deviation at Baseline,Expected Maximum Percent', NULL, 'Warning', 'CAT', 'column', 'Accuracy', 'Data Drift', 'Expected maximum pct of baseline Standard Deviation (SD)', 'This test looks for percent shifts in standard deviation as a measure of the stability of a measure over time. A significant change could indicate that new values are erroneous, or that the cohort being evaluated is significantly different from baseline. An increase in particular could mark new problems in measurement, a more heterogeneous cohort, or that significant outliers have been introduced. Consider this test along with Average Shift and New Shift. If the average shifts as well, there may be a fundamental shift in the dataset or process used to collect the data point. This might suggest a data shift that should be noted and assessed by business users. If the average does not shift, this may point to a data quality or data collection problem. ', 'Y'), - ('1041', 'Variability_Decrease', 'Variability Decrease', 'Variability has decreased below threshold', 'Tests that the spread or dispersion of column values has decreased significantly over baseline, indicating a shift in stability of the measure. This could signal a change in a process or a data quality issue.', 'The Standard Deviation of the measure has decreased below the defined threshold. 
This could signal a change in a process or a data quality issue.', 'Pct SD shift', 'Percent of baseline Standard Deviation', 'general_type = ''N'' AND functional_data_type ilike ''Measure%'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%'' AND value_ct <> distinct_value_ct AND distinct_value_ct > 10 AND stdev_value > 0 AND avg_value IS NOT NULL AND NOT (distinct_value_ct = max_value - min_value + 1 AND distinct_value_ct > 2)', 'baseline_sd,threshold_value', 'stdev_value, 80', 'Std Deviation at Baseline,Expected Minimum Percent', NULL, 'Warning', 'CAT', 'column', 'Accuracy', 'Data Drift', 'Expected minimum pct of baseline Standard Deviation (SD)', 'This test looks for percent shifts in standard deviation as a measure of the stability of a measure over time. A significant change could indicate that new values are erroneous, or that the cohort being evaluated is significantly different from baseline. A decrease in particular could indicate an improved process, better precision in measurement, the elimination of outliers, or a more homogeneous cohort. 
', 'Y'), - ('1042', 'Valid_Month', 'Valid Month', 'Valid calendar month in expected format', 'Tests for the presence of a valid representation of a calendar month consistent with the format at baseline.', 'Column values are not a valid representation of a calendar month consistent with the format at baseline.', 'Invalid months', NULL, 'functional_data_type = ''Period Month''', 'threshold_value,baseline_value', '0,CASE WHEN max_length > 3 AND initcap(min_text) = min_text THEN ''''''January'''',''''February'''',''''March'''',''''April'''',''''May'''',''''June'''',''''July'''',''''August'''',''''September'''',''''October'''',''''November'''',''''December'''''' WHEN max_length > 3 AND upper(min_text) = min_text THEN ''''''JANUARY'''',''''FEBRUARY'''',''''MARCH'''',''''APRIL'''',''''MAY'''',''''JUNE'''',''''JULY'''',''''AUGUST'''',''''SEPTEMBER'''',''''OCTOBER'''',''''NOVEMBER'''',''''DECEMBER'''''' WHEN max_length > 3 AND lower(min_text) = min_text THEN ''''''january'''',''''february'''',''''march'''',''''april'''',''''may'''',''''june'''',''''july'''',''''august'''',''''september'''',''''october'''',''''november'''',''''december'''''' WHEN max_length = 3 AND initcap(min_text) = min_text THEN ''''''Jan'''',''''Feb'''',''''Mar'''',''''Apr'''',''''May'''',''''Jun'''',''''Jul'''',''''Aug'''',''''Sep'''',''''Oct'''',''''Nov'''',''''Dec'''''' WHEN max_length = 3 AND upper(min_text) = min_text THEN ''''''JAN'''',''''FEB'''',''''MAR'''',''''APR'''',''''MAY'''',''''JUN'''',''''JUL'''',''''AUG'''',''''SEP'''',''''OCT'''',''''NOV'''',''''DEC'''''' WHEN max_length = 3 AND lower(min_text) = min_text THEN ''''''jan'''',''''feb'''',''''mar'''',''''apr'''',''''may'''',''''jun'''',''''jul'''',''''aug'''',''''sep'''',''''oct'''',''''nov'''',''''dec'''''' WHEN max_length = 2 AND min_text = ''01'' THEN ''''''01'''',''''02'''',''''03'''',''''04'''',''''05'''',''''06'''',''''07'''',''''08'''',''''09'''',''''10'''',''''11'''',''''12'''''' WHEN max_length = 2 AND min_text = ''1'' THEN 
''''''1'''',''''2'''',''''3'''',''''4'''',''''5'''',''''6'''',''''7'''',''''8'''',''''9'''',''''10'''',''''11'''',''''12'''''' WHEN min_value = 1 THEN ''1,2,3,4,5,6,7,8,9,10,11,12'' ELSE ''NULL'' END', 'Threshold Invalid Months,Valid Month List', 'The acceptable number of records with invalid months present.|List of valid month values for this field, in quotes if field is numeric, separated by commas.', 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected count of invalid months', NULL, 'N'), - ('1043', 'Valid_Characters', 'Valid Characters', 'Column contains no invalid characters', 'Tests for the presence of non-printing characters, leading spaces, or surrounding quotes.', 'Invalid characters, such as non-printing characters, leading spaces, or surrounding quotes, were found.', 'Invalid records', 'Expected count of values with invalid characters', 'general_type = ''A''', 'threshold_value', '0', NULL, 'The acceptable number of records with invalid character values present.', 'Warning', 'CAT', 'column', 'Validity', 'Schema Drift', 'Threshold Invalid Value Count', 'This test looks for the presence of non-printing ASCII characters that are considered non-standard in basic text processing. It also identifies leading spaces and values enclosed in quotes. 
Values that fail this test may be artifacts of data conversion, or just more difficult to process or analyze downstream.', 'N'), - ('1044', 'Valid_US_Zip', 'Valid US Zip', 'Valid USA Postal Codes', 'Tests that postal codes match the 5 or 9 digit standard US format', 'Invalid US Zip Code formats found.', 'Invalid Zip Codes', 'Expected count of values with invalid Zip Codes', 'functional_data_type = ''Zip''', 'threshold_value', '0', NULL, NULL, 'Warning', 'CAT', 'column', 'Validity', 'Schema Drift', 'Threshold Invalid Value Count', NULL, 'Y'), - ('1045', 'Valid_US_Zip3', 'Valid US Zip-3 ', 'Valid USA Zip-3 Prefix', 'Tests that postal codes match the 3 digit format of a regional prefix.', 'Invalid 3-digit US Zip Code regional prefix formats found.', 'Invalid Zip-3 Prefix', 'Expected count of values with invalid Zip-3 Prefix Codes', 'functional_data_type = ''Zip3''', 'threshold_value', '0', NULL, NULL, 'Warning', 'CAT', 'column', 'Validity', 'Schema Drift', 'Threshold Invalid Zip3 Count', 'This test looks for the presence of values that fail to match the three-digit numeric code expected for US Zip Code regional prefixes. These prefixes are often used to roll up Zip Code data to a regional level, and may be critical to anonymize detailed data and protect PID. 
Depending on your needs and regulatory requirements, longer zip codes could place PID at risk.', 'Y'); TRUNCATE TABLE generation_sets; @@ -166,44 +168,40 @@ VALUES ('Monitor', 'Recency'), TRUNCATE TABLE test_templates; INSERT INTO test_templates (id, test_type, sql_flavor, template_name) -VALUES ('2001', 'DATA MATCH', 'redshift', 'ex_data_match_generic.sql'), - ('2002', 'Aggregate_No_Drops', 'redshift', 'ex_aggregate_match_no_drops_generic.sql'), - ('2003', 'RELATIVE ENTROPY', 'redshift', 'ex_relative_entropy_generic.sql'), +VALUES ('2001', 'Combo_Match', 'redshift', 'ex_data_match_generic.sql'), + ('2002', 'Aggregate_Minimum', 'redshift', 'ex_aggregate_match_no_drops_generic.sql'), + ('2003', 'Distribution_Shift', 'redshift', 'ex_relative_entropy_generic.sql'), ('2004', 'CUSTOM', 'redshift', 'ex_custom_query_generic.sql'), - ('2005', 'PRIOR MATCH', 'redshift', 'ex_prior_match_generic.sql'), - ('2006', 'Aggregate_Match', 'redshift', 'ex_aggregate_match_same_generic.sql'), - ('2007', 'WINDOW MATCH NO DROPS', 'redshift', 'ex_window_match_no_drops_generic.sql'), - ('2008', 'WINDOW MATCH SAME', 'redshift', 'ex_window_match_same_generic.sql'), + ('2006', 'Aggregate_Balance', 'redshift', 'ex_aggregate_match_same_generic.sql'), + ('2007', 'Timeframe_Combo_Gain', 'redshift', 'ex_window_match_no_drops_generic.sql'), + ('2008', 'Timeframe_Combo_Match', 'redshift', 'ex_window_match_same_generic.sql'), ('2009', 'Aggregate_Increase', 'redshift', 'ex_aggregate_match_num_incr_generic.sql'), - ('2101', 'DATA MATCH', 'snowflake', 'ex_data_match_generic.sql'), - ('2102', 'Aggregate_No_Drops', 'snowflake', 'ex_aggregate_match_no_drops_generic.sql'), - ('2103', 'RELATIVE ENTROPY', 'snowflake', 'ex_relative_entropy_generic.sql'), + ('2101', 'Combo_Match', 'snowflake', 'ex_data_match_generic.sql'), + ('2102', 'Aggregate_Minimum', 'snowflake', 'ex_aggregate_match_no_drops_generic.sql'), + ('2103', 'Distribution_Shift', 'snowflake', 'ex_relative_entropy_generic.sql'), ('2104', 'CUSTOM', 
'snowflake', 'ex_custom_query_generic.sql'), - ('2105', 'PRIOR MATCH', 'snowflake', 'ex_prior_match_generic.sql'), - ('2106', 'Aggregate_Match', 'snowflake', 'ex_aggregate_match_same_generic.sql'), - ('2107', 'WINDOW MATCH NO DROPS', 'snowflake', 'ex_window_match_no_drops_generic.sql'), - ('2108', 'WINDOW MATCH SAME', 'snowflake', 'ex_window_match_same_generic.sql'), + ('2106', 'Aggregate_Balance', 'snowflake', 'ex_aggregate_match_same_generic.sql'), + ('2107', 'Timeframe_Combo_Gain', 'snowflake', 'ex_window_match_no_drops_generic.sql'), + ('2108', 'Timeframe_Combo_Match', 'snowflake', 'ex_window_match_same_generic.sql'), ('2109', 'Aggregate_Increase', 'snowflake', 'ex_aggregate_match_num_incr_generic.sql'), - ('2201', 'DATA MATCH', 'mssql', 'ex_data_match__generic.sql'), - ('2202', 'Aggregate_No_Drops', 'mssql', 'ex_aggregate_match_no_drops_generic.sql'), - ('2203', 'RELATIVE ENTROPY', 'mssql', 'ex_relative_entropy_generic.sql'), + ('2201', 'Combo_Match', 'mssql', 'ex_data_match_generic.sql'), + ('2202', 'Aggregate_Minimum', 'mssql', 'ex_aggregate_match_no_drops_generic.sql'), + ('2203', 'Distribution_Shift', 'mssql', 'ex_relative_entropy_generic.sql'), ('2204', 'CUSTOM', 'mssql', 'ex_custom_query_generic.sql'), - ('2205', 'PRIOR MATCH', 'mssql', 'ex_prior_match__generic.sql'), - ('2206', 'Aggregate_Match', 'mssql', 'ex_aggregate_match_same_generic.sql'), - ('2207', 'WINDOW MATCH NO DROPS', 'mssql', 'ex_window_match_no_drops_generic.sql'), - ('2208', 'WINDOW MATCH SAME', 'mssql', 'ex_window_match_same_generic.sql'), + ('2206', 'Aggregate_Balance', 'mssql', 'ex_aggregate_match_same_generic.sql'), + ('2207', 'Timeframe_Combo_Gain', 'mssql', 'ex_window_match_no_drops_generic.sql'), + ('2208', 'Timeframe_Combo_Match', 'mssql', 'ex_window_match_same_generic.sql'), ('2209', 'Aggregate_Increase', 'mssql', 'ex_aggregate_match_num_incr_generic.sql'), - ('2301', 'DATA MATCH', 'postgresql', 'ex_data_match_generic.sql'), - ('2302', 'Aggregate_No_Drops', 'postgresql', 
'ex_aggregate_match_no_drops_generic.sql'), - ('2303', 'RELATIVE ENTROPY', 'postgresql', 'ex_relative_entropy_generic.sql'), + ('2301', 'Combo_Match', 'postgresql', 'ex_data_match_generic.sql'), + ('2302', 'Aggregate_Minimum', 'postgresql', 'ex_aggregate_match_no_drops_generic.sql'), + ('2303', 'Distribution_Shift', 'postgresql', 'ex_relative_entropy_generic.sql'), ('2304', 'CUSTOM', 'postgresql', 'ex_custom_query_generic.sql'), - ('2305', 'PRIOR MATCH', 'postgresql', 'ex_prior_match_generic.sql'), - ('2306', 'Aggregate_Match', 'postgresql', 'ex_aggregate_match_same_generic.sql'), - ('2307', 'WINDOW MATCH NO DROPS', 'postgresql', 'ex_window_match_no_drops_generic.sql'), - ('2308', 'WINDOW MATCH SAME', 'postgresql', 'ex_window_match_same_generic.sql'), + ('2306', 'Aggregate_Balance', 'postgresql', 'ex_aggregate_match_same_generic.sql'), + ('2307', 'Timeframe_Combo_Gain', 'postgresql', 'ex_window_match_no_drops_generic.sql'), + ('2308', 'Timeframe_Combo_Match', 'postgresql', 'ex_window_match_same_generic.sql'), ('2309', 'Aggregate_Increase', 'postgresql', 'ex_aggregate_match_num_incr_generic.sql'); TRUNCATE TABLE cat_test_conditions; @@ -234,7 +232,7 @@ VALUES ('1001', 'Alpha_Trunc', 'redshift', 'MAX(LENGTH({COLUMN_NAME}))', '<', ' ('1023', 'Required', 'redshift', 'COUNT(*) - COUNT( {COLUMN_NAME} )', '>', '{THRESHOLD_VALUE}'), ('1024', 'Row_Ct', 'redshift', 'COUNT(*)', '<', '{THRESHOLD_VALUE}'), ('1025', 'Row_Ct_Pct', 'redshift', 'ABS(ROUND(100.0 * (COUNT(*) - {BASELINE_CT})::FLOAT / {BASELINE_CT}::FLOAT, 2))', '>', '{THRESHOLD_VALUE}'), - ('1026', 'Street_Addr_Pattern', 'redshift', '100.0*SUM(({COLUMN_NAME} ~ ''^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$'')::BIGINT)::FLOAT / COUNT({COLUMN_NAME})::FLOAT', '<', '{THRESHOLD_VALUE}'), + ('1026', 'Street_Addr_Pattern', 'redshift', '100.0*SUM(({COLUMN_NAME} ~ 
''^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$'')::BIGINT)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT', '<', '{THRESHOLD_VALUE}'), ('1027', 'US_State', 'redshift', 'SUM(CASE WHEN {COLUMN_NAME} NOT IN ('''',''AL'',''AK'',''AS'',''AZ'',''AR'',''CA'',''CO'',''CT'',''DE'',''DC'',''FM'',''FL'',''GA'',''GU'',''HI'',''ID'',''IL'',''IN'',''IA'',''KS'',''KY'',''LA'',''ME'',''MH'',''MD'',''MA'',''MI'',''MN'',''MS'',''MO'',''MT'',''NE'',''NV'',''NH'',''NJ'',''NM'',''NY'',''NC'',''ND'',''MP'',''OH'',''OK'',''OR'',''PW'',''PA'',''PR'',''RI'',''SC'',''SD'',''TN'',''TX'',''UT'',''VT'',''VI'',''VA'',''WA'',''WV'',''WI'',''WY'',''AE'',''AP'',''AA'') THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), ('1028', 'Unique', 'redshift', 'COUNT(*) - COUNT(DISTINCT {COLUMN_NAME})', '>', '{THRESHOLD_VALUE}'), ('1029', 'Unique_Pct', 'redshift', 'ABS( 2.0 * ASIN( SQRT({BASELINE_UNIQUE_CT}::FLOAT / {BASELINE_VALUE_CT}::FLOAT ) ) - 2 * ASIN( SQRT( COUNT( DISTINCT {COLUMN_NAME} )::FLOAT / NULLIF(COUNT( {COLUMN_NAME} ), 0)::FLOAT )) )', '>=', '{THRESHOLD_VALUE}'), @@ -257,14 +255,14 @@ VALUES ('1001', 'Alpha_Trunc', 'redshift', 'MAX(LENGTH({COLUMN_NAME}))', '<', ' ('2016', 'Min_Val', 'snowflake', 'SUM(CASE WHEN {COLUMN_NAME} < {BASELINE_VALUE} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), ('2017', 'Missing_Pct', 'snowflake', 'ABS( 2.0 * ASIN( SQRT( {BASELINE_VALUE_CT}::FLOAT / {BASELINE_CT}::FLOAT ) ) - 2 * ASIN( SQRT( COUNT( {COLUMN_NAME} )::FLOAT / NULLIF(COUNT(*), 0)::FLOAT )) )', '>=', '{THRESHOLD_VALUE}'), ('2018', 'Monthly_Rec_Ct', 'snowflake', '(MAX(DATEDIFF(month, {COLUMN_NAME}, ''{RUN_DATE}''::DATE)) - MIN(DATEDIFF(month, {COLUMN_NAME}, ''{RUN_DATE}''::DATE)) + 1) - COUNT(DISTINCT DATEDIFF(month, {COLUMN_NAME}, ''{RUN_DATE}''::DATE))', '>', '{THRESHOLD_VALUE}'), - ('2019', 'Outlier_Pct_Above', 'snowflake', 'SUM(CASE WHEN {COLUMN_NAME}::FLOAT > {BASELINE_AVG}+(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END)::FLOAT / 
COUNT({COLUMN_NAME})::FLOAT', '>', '{THRESHOLD_VALUE}'), - ('2020', 'Outlier_Pct_Below', 'snowflake', 'SUM(CASE WHEN {COLUMN_NAME}::FLOAT < {BASELINE_AVG}-(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END)::FLOAT / COUNT({COLUMN_NAME})::FLOAT', '>', '{THRESHOLD_VALUE}'), + ('2019', 'Outlier_Pct_Above', 'snowflake', 'SUM(CASE WHEN {COLUMN_NAME}::FLOAT > {BASELINE_AVG}+(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT', '>', '{THRESHOLD_VALUE}'), + ('2020', 'Outlier_Pct_Below', 'snowflake', 'SUM(CASE WHEN {COLUMN_NAME}::FLOAT < {BASELINE_AVG}-(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT', '>', '{THRESHOLD_VALUE}'), ('2021', 'Pattern_Match', 'snowflake', 'COUNT(NULLIF({COLUMN_NAME}, '''')) - SUM(REGEXP_LIKE(NULLIF({COLUMN_NAME}::VARCHAR, ''''), ''{BASELINE_VALUE}'')::BIGINT)', '>', '{THRESHOLD_VALUE}'), ('2022', 'Recency', 'snowflake', 'DATEDIFF(''D'', MAX({COLUMN_NAME}), ''{RUN_DATE}''::DATE)', '>', '{THRESHOLD_VALUE}'), ('2023', 'Required', 'snowflake', 'COUNT(*) - COUNT( {COLUMN_NAME} )', '>', '{THRESHOLD_VALUE}'), ('2024', 'Row_Ct', 'snowflake', 'COUNT(*)', '<', '{THRESHOLD_VALUE}'), ('2025', 'Row_Ct_Pct', 'snowflake', 'ABS(ROUND(100.0 * (COUNT(*) - {BASELINE_CT})::FLOAT / {BASELINE_CT}::FLOAT, 2))', '>', '{THRESHOLD_VALUE}'), - ('2026', 'Street_Addr_Pattern', 'snowflake', '100.0*SUM((regexp_like({COLUMN_NAME}::VARCHAR, ''^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$''))::BIGINT)::FLOAT / COUNT({COLUMN_NAME})::FLOAT', '<', '{THRESHOLD_VALUE}'), + ('2026', 'Street_Addr_Pattern', 'snowflake', '100.0*SUM((regexp_like({COLUMN_NAME}::VARCHAR, ''^[0-9]{1,5}[a-zA-Z]?\\s\\w{1,5}\\.?\\s?\\w*\\s?\\w*\\s[a-zA-Z]{1,6}\\.?\\s?[0-9]{0,5}[A-Z]{0,1}$''))::BIGINT)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT', '<', '{THRESHOLD_VALUE}'), ('2027', 'US_State', 'snowflake', 'SUM(CASE WHEN {COLUMN_NAME} NOT IN 
('''',''AL'',''AK'',''AS'',''AZ'',''AR'',''CA'',''CO'',''CT'',''DE'',''DC'',''FM'',''FL'',''GA'',''GU'',''HI'',''ID'',''IL'',''IN'',''IA'',''KS'',''KY'',''LA'',''ME'',''MH'',''MD'',''MA'',''MI'',''MN'',''MS'',''MO'',''MT'',''NE'',''NV'',''NH'',''NJ'',''NM'',''NY'',''NC'',''ND'',''MP'',''OH'',''OK'',''OR'',''PW'',''PA'',''PR'',''RI'',''SC'',''SD'',''TN'',''TX'',''UT'',''VT'',''VI'',''VA'',''WA'',''WV'',''WI'',''WY'',''AE'',''AP'',''AA'') THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), ('2028', 'Unique', 'snowflake', 'COUNT(*) - COUNT(DISTINCT {COLUMN_NAME})', '>', '{THRESHOLD_VALUE}'), ('2029', 'Unique_Pct', 'snowflake', 'ABS( 2.0 * ASIN( SQRT({BASELINE_UNIQUE_CT}::FLOAT / {BASELINE_VALUE_CT}::FLOAT ) ) - 2 * ASIN( SQRT( COUNT( DISTINCT {COLUMN_NAME} )::FLOAT / NULLIF(COUNT( {COLUMN_NAME} ), 0)::FLOAT )) )', '>=', '{THRESHOLD_VALUE}'), @@ -317,14 +315,14 @@ VALUES ('1001', 'Alpha_Trunc', 'redshift', 'MAX(LENGTH({COLUMN_NAME}))', '<', ' ('4016', 'Min_Val', 'postgresql', 'SUM(CASE WHEN {COLUMN_NAME} < {BASELINE_VALUE} THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), ('4017', 'Missing_Pct', 'postgresql', 'ABS( 2.0 * ASIN( SQRT( {BASELINE_VALUE_CT}::FLOAT / {BASELINE_CT}::FLOAT ) ) - 2 * ASIN( SQRT( COUNT( {COLUMN_NAME} )::FLOAT / NULLIF(COUNT(*), 0)::FLOAT )) )', '>=', '{THRESHOLD_VALUE}'), ('4018', 'Monthly_Rec_Ct', 'postgresql', '(MAX({DATA_QC_SCHEMA}.DATEDIFF(''MON'', {COLUMN_NAME}, ''{RUN_DATE}''::DATE)) - MIN({DATA_QC_SCHEMA}.DATEDIFF(''MON'', {COLUMN_NAME}, ''{RUN_DATE}''::DATE)) + 1) - COUNT(DISTINCT {DATA_QC_SCHEMA}.DATEDIFF(''MON'', {COLUMN_NAME}, ''{RUN_DATE}''::DATE))', '>', '{THRESHOLD_VALUE}'), - ('4019', 'Outlier_Pct_Above', 'postgresql', 'SUM(CASE WHEN {COLUMN_NAME}::FLOAT > {BASELINE_AVG}+(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END)::FLOAT / COUNT({COLUMN_NAME})::FLOAT', '>', '{THRESHOLD_VALUE}'), - ('4020', 'Outlier_Pct_Below', 'postgresql', 'SUM(CASE WHEN {COLUMN_NAME}::FLOAT < {BASELINE_AVG}-(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END)::FLOAT / 
COUNT({COLUMN_NAME})::FLOAT', '>', '{THRESHOLD_VALUE}'), + ('4019', 'Outlier_Pct_Above', 'postgresql', 'SUM(CASE WHEN {COLUMN_NAME}::FLOAT > {BASELINE_AVG}+(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT', '>', '{THRESHOLD_VALUE}'), + ('4020', 'Outlier_Pct_Below', 'postgresql', 'SUM(CASE WHEN {COLUMN_NAME}::FLOAT < {BASELINE_AVG}-(2.0*{BASELINE_SD}) THEN 1 ELSE 0 END)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT', '>', '{THRESHOLD_VALUE}'), ('4021', 'Pattern_Match', 'postgresql', 'COUNT(NULLIF({COLUMN_NAME}, '''')) - SUM(CASE WHEN NULLIF({COLUMN_NAME}, '''') ~ ''{BASELINE_VALUE}'' THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), ('4022', 'Recency', 'postgresql', '{DATA_QC_SCHEMA}.DATEDIFF(''DAY'', MAX({COLUMN_NAME}), ''{RUN_DATE}''::DATE)', '>', '{THRESHOLD_VALUE}'), ('4023', 'Required', 'postgresql', 'COUNT(*) - COUNT({COLUMN_NAME})', '>', '{THRESHOLD_VALUE}'), ('4024', 'Row_Ct', 'postgresql', 'COUNT(*)', '<', '{THRESHOLD_VALUE}'), ('4025', 'Row_Ct_Pct', 'postgresql', 'ABS(ROUND(100.0 * (COUNT(*) - {BASELINE_CT})::DECIMAL(18,4) / {BASELINE_CT}::DECIMAL(18,4), 2))', '>', '{THRESHOLD_VALUE}'), - ('4026', 'Street_Addr_Pattern', 'postgresql', '100.0*SUM(CASE WHEN {COLUMN_NAME} ~ ''^[0-9]{1,5}[a-zA-Z]?\s\w{1,5}\.?\s?\w*\s?\w*\s[a-zA-Z]{1,6}\.?\s?[0-9]{0,5}[A-Z]{0,1}$'' THEN 1 ELSE 0 END)::FLOAT / COUNT({COLUMN_NAME})::FLOAT', '<', '{THRESHOLD_VALUE}'), + ('4026', 'Street_Addr_Pattern', 'postgresql', '100.0*SUM(CASE WHEN {COLUMN_NAME} ~ ''^[0-9]{1,5}[a-zA-Z]?\s\w{1,5}\.?\s?\w*\s?\w*\s[a-zA-Z]{1,6}\.?\s?[0-9]{0,5}[A-Z]{0,1}$'' THEN 1 ELSE 0 END)::FLOAT / NULLIF(COUNT({COLUMN_NAME}), 0)::FLOAT', '<', '{THRESHOLD_VALUE}'), ('4027', 'US_State', 'postgresql', 'SUM(CASE WHEN NULLIF({COLUMN_NAME}, '''') NOT IN 
(''AL'',''AK'',''AS'',''AZ'',''AR'',''CA'',''CO'',''CT'',''DE'',''DC'',''FM'',''FL'',''GA'',''GU'',''HI'',''ID'',''IL'',''IN'',''IA'',''KS'',''KY'',''LA'',''ME'',''MH'',''MD'',''MA'',''MI'',''MN'',''MS'',''MO'',''MT'',''NE'',''NV'',''NH'',''NJ'',''NM'',''NY'',''NC'',''ND'',''MP'',''OH'',''OK'',''OR'',''PW'',''PA'',''PR'',''RI'',''SC'',''SD'',''TN'',''TX'',''UT'',''VT'',''VI'',''VA'',''WA'',''WV'',''WI'',''WY'',''AE'',''AP'',''AA'') THEN 1 ELSE 0 END)', '>', '{THRESHOLD_VALUE}'), ('4028', 'Unique', 'postgresql', 'COUNT(*) - COUNT(DISTINCT {COLUMN_NAME})', '>', '{THRESHOLD_VALUE}'), ('4029', 'Unique_Pct', 'postgresql', 'ABS( 2.0 * ASIN( SQRT({BASELINE_UNIQUE_CT}::FLOAT / {BASELINE_VALUE_CT}::FLOAT ) ) - 2 * ASIN( SQRT( COUNT( DISTINCT {COLUMN_NAME} )::FLOAT / NULLIF(COUNT( {COLUMN_NAME} ), 0)::FLOAT )) )', '>=', '{THRESHOLD_VALUE}'), @@ -462,7 +460,7 @@ VALUES ('1058', '1001', 'Profile Anomaly' , 'Suggested_Type', 'postgresql', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT 20;'), ('1059', '1002', 'Profile Anomaly' , 'Non_Standard_Blanks', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE CASE WHEN "{COLUMN_NAME}" IN (''.'', ''?'', '' '') THEN 1 WHEN LOWER("{COLUMN_NAME}") SIMILAR TO ''(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)'' THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''blank'',''error'',''missing'',''tbd'', ''n/a'',''#na'',''none'',''null'',''unknown'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''(blank)'',''(error)'',''(missing)'',''(tbd)'', ''(n/a)'',''(#na)'',''(none)'',''(null)'',''(unknown)'') THEN 1 WHEN LOWER("{COLUMN_NAME}") IN (''[blank]'',''[error]'',''[missing]'',''[tbd]'', ''[n/a]'',''[#na]'',''[none]'',''[null]'',''[unknown]'') THEN 1 WHEN "{COLUMN_NAME}" = '''' THEN 1 WHEN "{COLUMN_NAME}" IS NULL THEN 1 ELSE 0 END = 1 GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";'), - ('1050', '1003', 'Profile 
Anomaly' , 'Invalid_Zip_USA', 'postgresql', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",''012345678'',''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'') GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500;'), + ('1060', '1003', 'Profile Anomaly' , 'Invalid_Zip_USA', 'postgresql', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",''012345678'',''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'') GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" LIMIT 500;'), ('1061', '1004', 'Profile Anomaly' , 'Multiple_Types_Minor', 'postgresql', NULL, 'SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type = ''timestamp without time zone'' THEN ''timestamp'' WHEN data_type = ''character varying'' THEN ''varchar('' || CAST(character_maximum_length AS VARCHAR) || '')'' WHEN data_type = ''character'' THEN ''char('' || CAST(character_maximum_length AS VARCHAR) || '')'' WHEN data_type = ''numeric'' THEN ''numeric('' || CAST(numeric_precision AS VARCHAR) || '','' || CAST(numeric_scale AS VARCHAR) || '')'' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = ''{TARGET_SCHEMA}'' AND columns.column_name = ''{COLUMN_NAME}'' AND UPPER(tables.table_type) = ''BASE TABLE'' ORDER BY data_type, table_name;'), ('1062', '1005', 'Profile Anomaly' , 'Multiple_Types_Major', 'postgresql', NULL, 'SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type = ''timestamp without time zone'' THEN ''timestamp'' WHEN data_type = ''character varying'' THEN ''varchar('' || CAST(character_maximum_length AS VARCHAR) || '')'' WHEN data_type = ''character'' THEN ''char('' || CAST(character_maximum_length AS VARCHAR) || '')'' WHEN data_type = ''numeric'' THEN 
''numeric('' || CAST(numeric_precision AS VARCHAR) || '','' || CAST(numeric_scale AS VARCHAR) || '')'' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = ''{TARGET_SCHEMA}'' AND columns.column_name = ''{COLUMN_NAME}'' AND UPPER(tables.table_type) = ''BASE TABLE'' ORDER BY data_type, table_name;'), ('1063', '1006', 'Profile Anomaly' , 'No_Values', 'postgresql', NULL, 'SELECT DISTINCT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}";' ), @@ -775,7 +773,426 @@ ORDER BY check_period DESC;'), ('1241', '1045', 'Test Results', 'Valid_US_Zip3', 'redshift', NULL, 'SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",''012345678'',''999999999'') <> ''999'' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;'), ('1242', '1045', 'Test Results', 'Valid_US_Zip3', 'postgresql', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",''012345678'',''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'') <> '''' GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC LIMIT 20;'), ('1243', '1045', 'Test Results', 'Valid_US_Zip3', 'mssql', NULL, 'SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",''012345678'',''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;'), - ('1244', '1045', 'Test Results', 'Valid_US_Zip3', 'snowflake', NULL, 'SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",''012345678'',''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;'); + ('1244', '1045', 
'Test Results', 'Valid_US_Zip3', 'snowflake', NULL, 'SELECT TOP 20 "{COLUMN_NAME}", COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE("{COLUMN_NAME}",''012345678'',''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'') GROUP BY "{COLUMN_NAME}" ORDER BY record_ct DESC;'), + + ('1245', '1500', 'Test Results', 'Aggregate_Balance', 'redshift', NULL, 'SELECT * + FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL + FROM + ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total + FROM {TARGET_SCHEMA}.{TABLE_NAME} + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} ) a + GROUP BY {GROUPBY_NAMES} ) s + WHERE total <> match_total OR (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL) +ORDER BY {GROUPBY_NAMES};'), + ('1246', '1500', 'Test Results', 'Aggregate_Balance', 'snowflake', NULL, 'SELECT * + FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL + FROM + ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total + FROM {TARGET_SCHEMA}.{TABLE_NAME} + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} ) a + GROUP BY {GROUPBY_NAMES} ) s + WHERE total <> match_total OR (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL) +ORDER BY {GROUPBY_NAMES};'), + ('1247', '1500', 'Test Results', 'Aggregate_Balance', 'mssql', NULL, 'SELECT * + FROM ( SELECT 
{GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL + FROM + ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total + FROM {TARGET_SCHEMA}.{TABLE_NAME} + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} ) a + GROUP BY {GROUPBY_NAMES} ) s + WHERE total <> match_total OR (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL) +ORDER BY {GROUPBY_NAMES};'), + ('1248', '1500', 'Test Results', 'Aggregate_Balance', 'postgresql', NULL, 'SELECT * + FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) AS total, SUM(MATCH_TOTAL) AS MATCH_TOTAL + FROM + ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} AS total, NULL AS match_total + FROM {TARGET_SCHEMA}.{TABLE_NAME} + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL AS total, {MATCH_COLUMN_NAMES} AS match_total + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} ) a + GROUP BY {GROUPBY_NAMES} ) s + WHERE total <> match_total OR (total IS NOT NULL AND match_total IS NULL) OR (total IS NULL AND match_total IS NOT NULL) +ORDER BY {GROUPBY_NAMES};'), + ('1249', '1501', 'Test Results', 'Aggregate_Minimum', 'redshift', NULL, 'SELECT * +FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) as total, SUM(MATCH_TOTAL) as MATCH_TOTAL + FROM + ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} as total, NULL as match_total + FROM {TARGET_SCHEMA}.{TABLE_NAME} + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL as total, {MATCH_COLUMN_NAMES} as match_total + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + 
WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} ) a + GROUP BY {GROUPBY_NAMES} ) s + WHERE total < match_total OR (total IS NULL AND match_total IS NOT NULL) +ORDER BY {GROUPBY_NAMES};'), + ('1250', '1501', 'Test Results', 'Aggregate_Minimum', 'snowflake', NULL, 'SELECT * +FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) as total, SUM(MATCH_TOTAL) as MATCH_TOTAL + FROM + ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} as total, NULL as match_total + FROM {TARGET_SCHEMA}.{TABLE_NAME} + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL as total, {MATCH_COLUMN_NAMES} as match_total + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} ) a + GROUP BY {GROUPBY_NAMES} ) s + WHERE total < match_total OR (total IS NULL AND match_total IS NOT NULL) +ORDER BY {GROUPBY_NAMES};'), + ('1251', '1501', 'Test Results', 'Aggregate_Minimum', 'mssql', NULL, 'SELECT * +FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) as total, SUM(MATCH_TOTAL) as MATCH_TOTAL + FROM + ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} as total, NULL as match_total + FROM {TARGET_SCHEMA}.{TABLE_NAME} + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL as total, {MATCH_COLUMN_NAMES} as match_total + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} ) a + GROUP BY {GROUPBY_NAMES} ) s + WHERE total < match_total OR (total IS NULL AND match_total IS NOT NULL) +ORDER BY {GROUPBY_NAMES};'), + ('1252', '1501', 'Test Results', 'Aggregate_Minimum', 'postgresql', NULL, 'SELECT * +FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) as total, SUM(MATCH_TOTAL) as MATCH_TOTAL + FROM + ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} as total, NULL as match_total + FROM 
{TARGET_SCHEMA}.{TABLE_NAME} + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL as total, {MATCH_COLUMN_NAMES} as match_total + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} ) a + GROUP BY {GROUPBY_NAMES} ) s + WHERE total < match_total OR (total IS NULL AND match_total IS NOT NULL) +ORDER BY {GROUPBY_NAMES};'), + ('1253', '1502', 'Test Results', 'Combo_Match', 'redshift', NULL, 'SELECT * + FROM ( SELECT {COLUMN_NAME_NO_QUOTES} + FROM {TARGET_SCHEMA}.{TABLE_NAME} + WHERE {SUBSET_CONDITION} + GROUP BY {COLUMN_NAME_NO_QUOTES} + {HAVING_CONDITION} + EXCEPT + SELECT {MATCH_GROUPBY_NAMES} + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} + ) test +ORDER BY {COLUMN_NAME_NO_QUOTES};'), + ('1254', '1502', 'Test Results', 'Combo_Match', 'snowflake', NULL, 'SELECT * + FROM ( SELECT {COLUMN_NAME_NO_QUOTES} + FROM {TARGET_SCHEMA}.{TABLE_NAME} + WHERE {SUBSET_CONDITION} + GROUP BY {COLUMN_NAME_NO_QUOTES} + {HAVING_CONDITION} + EXCEPT + SELECT {MATCH_GROUPBY_NAMES} + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} + ) test +ORDER BY {COLUMN_NAME_NO_QUOTES};'), + ('1255', '1502', 'Test Results', 'Combo_Match', 'mssql', NULL, 'SELECT * + FROM ( SELECT {COLUMN_NAME_NO_QUOTES} + FROM {TARGET_SCHEMA}.{TABLE_NAME} + WHERE {SUBSET_CONDITION} + GROUP BY {COLUMN_NAME_NO_QUOTES} + {HAVING_CONDITION} + EXCEPT + SELECT {MATCH_GROUPBY_NAMES} + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} + ) test +ORDER BY {COLUMN_NAME_NO_QUOTES};'), + ('1256', '1502', 'Test Results', 'Combo_Match', 'postgresql', NULL, 'SELECT * + FROM ( SELECT {COLUMN_NAME_NO_QUOTES} + FROM 
{TARGET_SCHEMA}.{TABLE_NAME} + WHERE {SUBSET_CONDITION} + GROUP BY {COLUMN_NAME_NO_QUOTES} + {HAVING_CONDITION} + EXCEPT + SELECT {MATCH_GROUPBY_NAMES} + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} + ) test +ORDER BY {COLUMN_NAME_NO_QUOTES};'), + ('1257', '1503', 'Test Results', 'Distribution_Shift', 'redshift', NULL, 'WITH latest_ver + AS ( SELECT COALESCE({COLUMN_NAME_NO_QUOTES}, '''') as category, + COUNT(*)::FLOAT / SUM(COUNT(*)) OVER ()::FLOAT AS pct_of_total + FROM {TARGET_SCHEMA}.{TABLE_NAME} v1 + WHERE {SUBSET_CONDITION} + GROUP BY 1 ), +older_ver + AS ( SELECT COALESCE({MATCH_GROUPBY_NAMES}, '''') as category, + COUNT(*)::FLOAT / SUM(COUNT(*)) OVER ()::FLOAT AS pct_of_total + FROM {MATCH_SCHEMA_NAME}.{TABLE_NAME} v2 + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY 1 ) +SELECT COALESCE(l.category, o.category) AS category, + o.pct_of_total AS old_pct, + l.pct_of_total AS new_pct + FROM latest_ver l +FULL JOIN older_ver o + ON (l.category = o.category) +ORDER BY COALESCE(l.category, o.category);'), + ('1258', '1503', 'Test Results', 'Distribution_Shift', 'snowflake', NULL, 'WITH latest_ver + AS ( SELECT COALESCE({COLUMN_NAME_NO_QUOTES}, '''') as category, + COUNT(*)::FLOAT / SUM(COUNT(*)) OVER ()::FLOAT AS pct_of_total + FROM {TARGET_SCHEMA}.{TABLE_NAME} v1 + WHERE {SUBSET_CONDITION} + GROUP BY 1 ), +older_ver + AS ( SELECT COALESCE({MATCH_GROUPBY_NAMES}, '''') as category, + COUNT(*)::FLOAT / SUM(COUNT(*)) OVER ()::FLOAT AS pct_of_total + FROM {MATCH_SCHEMA_NAME}.{TABLE_NAME} v2 + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY 1 ) +SELECT COALESCE(l.category, o.category) AS category, + o.pct_of_total AS old_pct, + l.pct_of_total AS new_pct + FROM latest_ver l +FULL JOIN older_ver o + ON (l.category = o.category) +ORDER BY COALESCE(l.category, o.category);'), + ('1259', '1503', 'Test Results', 'Distribution_Shift', 'mssql', NULL, 'WITH latest_ver + AS ( SELECT 
COALESCE({COLUMN_NAME_NO_QUOTES}, '''') as category, + COUNT(*)::FLOAT / SUM(COUNT(*)) OVER ()::FLOAT AS pct_of_total + FROM {TARGET_SCHEMA}.{TABLE_NAME} v1 + WHERE {SUBSET_CONDITION} + GROUP BY 1 ), +older_ver + AS ( SELECT COALESCE({MATCH_GROUPBY_NAMES}, '''') as category, + COUNT(*)::FLOAT / SUM(COUNT(*)) OVER ()::FLOAT AS pct_of_total + FROM {MATCH_SCHEMA_NAME}.{TABLE_NAME} v2 + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY 1 ) +SELECT COALESCE(l.category, o.category) AS category, + o.pct_of_total AS old_pct, + l.pct_of_total AS new_pct + FROM latest_ver l +FULL JOIN older_ver o + ON (l.category = o.category) +ORDER BY COALESCE(l.category, o.category);'), + ('1260', '1503', 'Test Results', 'Distribution_Shift', 'postgresql', NULL, 'WITH latest_ver + AS ( SELECT COALESCE({COLUMN_NAME_NO_QUOTES}, '''') as category, + COUNT(*)::FLOAT / SUM(COUNT(*)) OVER ()::FLOAT AS pct_of_total + FROM {TARGET_SCHEMA}.{TABLE_NAME} v1 + WHERE {SUBSET_CONDITION} + GROUP BY 1 ), +older_ver + AS ( SELECT COALESCE({MATCH_GROUPBY_NAMES}, '''') as category, + COUNT(*)::FLOAT / SUM(COUNT(*)) OVER ()::FLOAT AS pct_of_total + FROM {MATCH_SCHEMA_NAME}.{TABLE_NAME} v2 + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY 1 ) +SELECT COALESCE(l.category, o.category) AS category, + o.pct_of_total AS old_pct, + l.pct_of_total AS new_pct + FROM latest_ver l +FULL JOIN older_ver o + ON (l.category = o.category) +ORDER BY COALESCE(l.category, o.category);'), + + ('1261', '1508', 'Test Results', 'Timeframe_Combo_Gain', 'redshift', NULL, 'SELECT {COLUMN_NAME_NO_QUOTES} + FROM {TARGET_SCHEMA}.{TABLE_NAME} + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS} + AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS} +GROUP BY {COLUMN_NAME_NO_QUOTES} + EXCEPT +SELECT {COLUMN_NAME_NO_QUOTES} + FROM {TARGET_SCHEMA}.{TABLE_NAME} + WHERE {SUBSET_CONDITION} + AND 
{WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS} +GROUP BY {COLUMN_NAME_NO_QUOTES}'), + ('1262', '1508', 'Test Results', 'Timeframe_Combo_Gain', 'snowflake', NULL, 'SELECT {COLUMN_NAME_NO_QUOTES} + FROM {TARGET_SCHEMA}.{TABLE_NAME} + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS} + AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS} +GROUP BY {COLUMN_NAME_NO_QUOTES} + EXCEPT +SELECT {COLUMN_NAME_NO_QUOTES} + FROM {TARGET_SCHEMA}.{TABLE_NAME} + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS} +GROUP BY {COLUMN_NAME_NO_QUOTES}'), + ('1263', '1508', 'Test Results', 'Timeframe_Combo_Gain', 'mssql', NULL, 'SELECT {COLUMN_NAME_NO_QUOTES} + FROM {TARGET_SCHEMA}.{TABLE_NAME} + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS} + AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS} +GROUP BY {COLUMN_NAME_NO_QUOTES} + EXCEPT +SELECT {COLUMN_NAME_NO_QUOTES} + FROM {TARGET_SCHEMA}.{TABLE_NAME} + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS} +GROUP BY {COLUMN_NAME_NO_QUOTES}'), + ('1264', '1508', 'Test Results', 'Timeframe_Combo_Gain', 'postgresql', NULL, 'SELECT {COLUMN_NAME_NO_QUOTES} + FROM {TARGET_SCHEMA}.{TABLE_NAME} + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS} + AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS} +GROUP BY {COLUMN_NAME_NO_QUOTES} + EXCEPT +SELECT 
{COLUMN_NAME_NO_QUOTES} + FROM {TARGET_SCHEMA}.{TABLE_NAME} + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS} +GROUP BY {COLUMN_NAME_NO_QUOTES}'), + ('1265', '1509', 'Test Results', 'Timeframe_Combo_Match', 'redshift', NULL, ' ( +SELECT ''Prior Timeframe'' as missing_from, {COLUMN_NAME} +FROM {TARGET_SCHEMA}.{TABLE_NAME} +WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS} +EXCEPT +SELECT ''Prior Timeframe'' as missing_from, {COLUMN_NAME} +FROM {TARGET_SCHEMA}.{TABLE_NAME} +WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS} + AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS} +) +UNION ALL +( +SELECT ''Latest Timeframe'' as missing_from, {COLUMN_NAME} +FROM {TARGET_SCHEMA}.{TABLE_NAME} +WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS} + AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS} + EXCEPT +SELECT ''Latest Timeframe'' as missing_from, {COLUMN_NAME} +FROM {TARGET_SCHEMA}.{TABLE_NAME} +WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS} +)'), + ('1266', '1509', 'Test Results', 'Timeframe_Combo_Match', 'snowflake', NULL, ' ( +SELECT ''Prior Timeframe'' as missing_from, {COLUMN_NAME} +FROM {TARGET_SCHEMA}.{TABLE_NAME} +WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS} +EXCEPT +SELECT ''Prior Timeframe'' as missing_from, {COLUMN_NAME} +FROM {TARGET_SCHEMA}.{TABLE_NAME} +WHERE {SUBSET_CONDITION} + AND 
{WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS} + AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS} +) +UNION ALL +( +SELECT ''Latest Timeframe'' as missing_from, {COLUMN_NAME} +FROM {TARGET_SCHEMA}.{TABLE_NAME} +WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS} + AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS} + EXCEPT +SELECT ''Latest Timeframe'' as missing_from, {COLUMN_NAME} +FROM {TARGET_SCHEMA}.{TABLE_NAME} +WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS} +)'), + ('1267', '1509', 'Test Results', 'Timeframe_Combo_Match', 'mssql', NULL, ' ( +SELECT ''Prior Timeframe'' as missing_from, {COLUMN_NAME} +FROM {TARGET_SCHEMA}.{TABLE_NAME} +WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS} +EXCEPT +SELECT ''Prior Timeframe'' as missing_from, {COLUMN_NAME} +FROM {TARGET_SCHEMA}.{TABLE_NAME} +WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS} + AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS} +) +UNION ALL +( +SELECT ''Latest Timeframe'' as missing_from, {COLUMN_NAME} +FROM {TARGET_SCHEMA}.{TABLE_NAME} +WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS} + AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS} + EXCEPT +SELECT ''Latest Timeframe'' as missing_from, {COLUMN_NAME} +FROM {TARGET_SCHEMA}.{TABLE_NAME} 
+WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS} +)'), + ('1268', '1509', 'Test Results', 'Timeframe_Combo_Match', 'postgresql', NULL, ' ( +SELECT ''Prior Timeframe'' as missing_from, {COLUMN_NAME} +FROM {TARGET_SCHEMA}.{TABLE_NAME} +WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS} +EXCEPT +SELECT ''Prior Timeframe'' as missing_from, {COLUMN_NAME} +FROM {TARGET_SCHEMA}.{TABLE_NAME} +WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS} + AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS} +) +UNION ALL +( +SELECT ''Latest Timeframe'' as missing_from, {COLUMN_NAME} +FROM {TARGET_SCHEMA}.{TABLE_NAME} +WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS} + AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS} + EXCEPT +SELECT ''Latest Timeframe'' as missing_from, {COLUMN_NAME} +FROM {TARGET_SCHEMA}.{TABLE_NAME} +WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS} +)') +; TRUNCATE TABLE variant_codings; diff --git a/testgen/template/dbupgrade/0103_incremental_upgrade.sql b/testgen/template/dbupgrade/0103_incremental_upgrade.sql new file mode 100644 index 0000000..3011aa9 --- /dev/null +++ b/testgen/template/dbupgrade/0103_incremental_upgrade.sql @@ -0,0 +1,22 @@ +SET SEARCH_PATH TO {SCHEMA_NAME}; + +ALTER TABLE test_types + ADD COLUMN column_name_prompt TEXT; + +ALTER TABLE test_types + ADD COLUMN column_name_help TEXT; + +ALTER TABLE test_results + ADD COLUMN auto_gen BOOLEAN; + +UPDATE test_results + SET 
auto_gen = TRUE + FROM test_results r +INNER JOIN test_definitions d + ON (r.project_code = d.project_code + AND r.test_suite = d.test_suite + AND r.table_name = d.table_name + AND r.column_names = COALESCE(d.column_name, 'N/A') + AND r.test_type = d.test_type) +WHERE d.last_auto_gen_date IS NOT NULL + AND test_results.id = r.id; \ No newline at end of file diff --git a/testgen/template/execution/ex_finalize_test_run_results.sql b/testgen/template/execution/ex_finalize_test_run_results.sql index 657e30c..7eb5454 100644 --- a/testgen/template/execution/ex_finalize_test_run_results.sql +++ b/testgen/template/execution/ex_finalize_test_run_results.sql @@ -22,7 +22,8 @@ UPDATE test_results WHEN r.skip_errors > 0 THEN 'Errors Ignored: ' || r.skip_errors::VARCHAR ELSE '' END), - table_groups_id = d.table_groups_id + table_groups_id = d.table_groups_id, + auto_gen = d.last_auto_gen_date IS NOT NULL FROM test_results r INNER JOIN test_suites s ON (r.project_code = s.project_code diff --git a/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_no_drops_generic.sql b/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_no_drops_generic.sql index 4c5b54d..0cfe53a 100644 --- a/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_no_drops_generic.sql +++ b/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_no_drops_generic.sql @@ -1,33 +1,48 @@ SELECT '{PROJECT_CODE}' as project_code, '{TEST_TYPE}' as test_type, '{TEST_DEFINITION_ID}' as test_definition_id, '{TEST_SUITE}' as test_suite, - '{RUN_DATE}' as test_time, '{START_TIME}' as starttime, CURRENT_TIMESTAMP as endtime, - '{SCHEMA_NAME}' as schema_name, '{TABLE_NAME}' as table_name, '{COLUMN_NAME}' as column_names, + '{TEST_RUN_ID}' as test_run_id, + '{RUN_DATE}' as test_time, + '{START_TIME}' as starttime, + CURRENT_TIMESTAMP as endtime, + '{SCHEMA_NAME}' as schema_name, + '{TABLE_NAME}' as table_name, + '{COLUMN_NAME_NO_QUOTES}' as column_names, + 
'{SKIP_ERRORS}' as threshold_value, {SKIP_ERRORS} as skip_errors, - 'match_schema_name = {MATCH_SCHEMA_NAME}, match_table_name = {MATCH_TABLE_NAME}, match_groupby_names = {MATCH_GROUPBY_NAMES}, match_column_names = {MATCH_COLUMN_NAMES}, match_subset_condition = {MATCH_SUBSET_CONDITION}, match_having_condition = {MATCH_HAVING_CONDITION}, mode = {MODE}' - as input_parameters, - CASE WHEN COUNT(*) > COALESCE(skip_errors, 0) THEN 0 ELSE 1 END as result_code, + '{INPUT_PARAMETERS}' as input_parameters, + CASE WHEN COUNT (*) > {SKIP_ERRORS} THEN 0 ELSE 1 END as result_code, + CASE + WHEN COUNT(*) > 0 THEN + CONCAT( + CONCAT( CAST(COUNT(*) AS VARCHAR), ' error(s) identified, ' ), CONCAT( - CONCAT( 'Mismatched measures: ', CAST( COALESCE(COUNT(*), 0) AS VARCHAR) ), - CONCAT( ', Threshold: ', - CONCAT( CAST(COALESCE(skip_errors, 0) AS VARCHAR), '.') + CASE + WHEN COUNT(*) > {SKIP_ERRORS} THEN 'exceeding limit of ' + ELSE 'within limit of ' + END, + '{SKIP_ERRORS}.' ) - ) AS result_message, + ) + ELSE 'No errors found.' 
+ END AS result_message, COUNT(*) as result_measure, - '{TEST_ACTION}' as test_action, - '{SUBSET_CONDITION}' as subset_condition, - NULL as result_query, - '{TEST_DESCRIPTION}' as test_description -FROM ( - SELECT {MATCH_GROUPBY_NAMES}, {MATCH_SUM_COLUMNS} - FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} - WHERE {MATCH_SUBSET_CONDITION} - GROUP BY {MATCH_GROUPBY_NAMES} - {MATCH_HAVING_CONDITION} - EXCEPT - SELECT {GROUPBY_NAMES}, {SUM_COLUMNS} + '{SUBSET_DISPLAY}' as subset_condition, + NULL as result_query +FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) as total, SUM(MATCH_TOTAL) as MATCH_TOTAL + FROM + ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} as total, NULL as match_total FROM {SCHEMA_NAME}.{TABLE_NAME} WHERE {SUBSET_CONDITION} GROUP BY {GROUPBY_NAMES} {HAVING_CONDITION} - ) a; + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL as total, {MATCH_COLUMN_NAMES} as match_total + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} ) a + GROUP BY {GROUPBY_NAMES} ) s + WHERE total < match_total +-- OR (total IS NOT NULL AND match_total IS NULL) -- New categories + OR (total IS NULL AND match_total IS NOT NULL); -- Dropped categories diff --git a/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_percent_above_generic.sql b/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_percent_above_generic.sql new file mode 100644 index 0000000..c36d130 --- /dev/null +++ b/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_percent_above_generic.sql @@ -0,0 +1,49 @@ +SELECT '{PROJECT_CODE}' as project_code, + '{TEST_TYPE}' as test_type, + '{TEST_DEFINITION_ID}' as test_definition_id, + '{TEST_SUITE}' as test_suite, + '{TEST_RUN_ID}' as test_run_id, + '{RUN_DATE}' as test_time, + '{START_TIME}' as starttime, + CURRENT_TIMESTAMP as endtime, + '{SCHEMA_NAME}' as schema_name, + '{TABLE_NAME}' as table_name, + '{COLUMN_NAME_NO_QUOTES}' as 
column_names, + '{THRESHOLD_VALUE}' as threshold_value, + {SKIP_ERRORS} as skip_errors, + '{INPUT_PARAMETERS}' as input_parameters, + CASE WHEN COUNT (*) > {SKIP_ERRORS} THEN 0 ELSE 1 END as result_code, + CASE + WHEN COUNT(*) > 0 THEN + CONCAT( + CONCAT( CAST(COUNT(*) AS VARCHAR), ' error(s) identified, ' ), + CONCAT( + CASE + WHEN COUNT(*) > {SKIP_ERRORS} THEN 'exceeding limit of ' + ELSE 'within limit of ' + END, + '{SKIP_ERRORS}.' + ) + ) + ELSE 'No errors found.' + END AS result_message, + COUNT(*) as result_measure, + '{SUBSET_DISPLAY}' as subset_condition, + NULL as result_query +FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) as total, SUM(MATCH_TOTAL) as MATCH_TOTAL + FROM + ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} as total, NULL as match_total + FROM {SCHEMA_NAME}.{TABLE_NAME} + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL as total, {MATCH_COLUMN_NAMES} as match_total + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} ) a + GROUP BY {GROUPBY_NAMES} ) s + WHERE NOT total BETWEEN match_total AND match_total * (1 + {BASELINE_VALUE}::FLOAT/100.0) + OR (total IS NOT NULL AND match_total IS NULL) + OR (total IS NULL AND match_total IS NOT NULL); diff --git a/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_percent_within_generic.sql b/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_percent_within_generic.sql new file mode 100644 index 0000000..a3aaf9b --- /dev/null +++ b/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_percent_within_generic.sql @@ -0,0 +1,49 @@ +SELECT '{PROJECT_CODE}' as project_code, + '{TEST_TYPE}' as test_type, + '{TEST_DEFINITION_ID}' as test_definition_id, + '{TEST_SUITE}' as test_suite, + '{TEST_RUN_ID}' as test_run_id, + '{RUN_DATE}' as test_time, + '{START_TIME}' as starttime, + CURRENT_TIMESTAMP as endtime, + 
'{SCHEMA_NAME}' as schema_name, + '{TABLE_NAME}' as table_name, + '{COLUMN_NAME_NO_QUOTES}' as column_names, + '{THRESHOLD_VALUE}' as threshold_value, + {SKIP_ERRORS} as skip_errors, + '{INPUT_PARAMETERS}' as input_parameters, + CASE WHEN COUNT (*) > {SKIP_ERRORS} THEN 0 ELSE 1 END as result_code, + CASE + WHEN COUNT(*) > 0 THEN + CONCAT( + CONCAT( CAST(COUNT(*) AS VARCHAR), ' error(s) identified, ' ), + CONCAT( + CASE + WHEN COUNT(*) > {SKIP_ERRORS} THEN 'exceeding limit of ' + ELSE 'within limit of ' + END, + '{SKIP_ERRORS}.' + ) + ) + ELSE 'No errors found.' + END AS result_message, + COUNT(*) as result_measure, + '{SUBSET_DISPLAY}' as subset_condition, + NULL as result_query +FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) as total, SUM(MATCH_TOTAL) as MATCH_TOTAL + FROM + ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} as total, NULL as match_total + FROM {SCHEMA_NAME}.{TABLE_NAME} + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL as total, {MATCH_COLUMN_NAMES} as match_total + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} ) a + GROUP BY {GROUPBY_NAMES} ) s + WHERE NOT total BETWEEN match_total * (1 - {BASELINE_VALUE}::FLOAT/100.0) AND match_total * (1 + {BASELINE_VALUE}::FLOAT/100.0) + OR (total IS NOT NULL AND match_total IS NULL) + OR (total IS NULL AND match_total IS NOT NULL); diff --git a/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_same_generic.sql b/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_same_generic.sql index df30859..09d5890 100644 --- a/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_same_generic.sql +++ b/testgen/template/flavors/generic/exec_query_tests/ex_aggregate_match_same_generic.sql @@ -8,12 +8,11 @@ SELECT '{PROJECT_CODE}' as project_code, CURRENT_TIMESTAMP as endtime, '{SCHEMA_NAME}' as schema_name, 
'{TABLE_NAME}' as table_name, - CASE WHEN '{COLUMN_NAME}' = '' OR '{COLUMN_NAME}' IS NULL THEN 'N/A' ELSE '{COLUMN_NAME}' END as column_names, - '{SKIP_ERRORS}' as threshold_value, - {SKIP_ERRORS} as skip_errors, - 'match_schema_name = {MATCH_SCHEMA_NAME}, match_table_name = {MATCH_TABLE_NAME}, match_groupby_names = {MATCH_GROUPBY_NAMES} ,match_column_names = {MATCH_COLUMN_NAMES}, match_subset_condition = {MATCH_SUBSET_CONDITION}, match_having_condition = {MATCH_HAVING_CONDITION}, mode = {MODE}' - as input_parameters, - CASE WHEN COUNT (*) > {SKIP_ERRORS} THEN 0 ELSE 1 END as result_code, + '{COLUMN_NAME_NO_QUOTES}' as column_names, + '{SKIP_ERRORS}' as threshold_value, + {SKIP_ERRORS} as skip_errors, + '{INPUT_PARAMETERS}' as input_parameters, + CASE WHEN COUNT (*) > {SKIP_ERRORS} THEN 0 ELSE 1 END as result_code, CASE WHEN COUNT(*) > 0 THEN CONCAT( @@ -29,31 +28,22 @@ SELECT '{PROJECT_CODE}' as project_code, ELSE 'No errors found.' END AS result_message, COUNT(*) as result_measure, - '{SUBSET_CONDITION}' as subset_condition, + '{SUBSET_DISPLAY}' as subset_condition, NULL as result_query -FROM ( - (SELECT {GROUPBY_NAMES}, {SUM_COLUMNS} +FROM ( SELECT {GROUPBY_NAMES}, SUM(TOTAL) as total, SUM(MATCH_TOTAL) as MATCH_TOTAL + FROM + ( SELECT {GROUPBY_NAMES}, {COLUMN_NAME_NO_QUOTES} as total, NULL as match_total FROM {SCHEMA_NAME}.{TABLE_NAME} WHERE {SUBSET_CONDITION} GROUP BY {GROUPBY_NAMES} {HAVING_CONDITION} - EXCEPT - SELECT {MATCH_GROUPBY_NAMES}, {MATCH_SUM_COLUMNS} + UNION ALL + SELECT {MATCH_GROUPBY_NAMES}, NULL as total, {MATCH_COLUMN_NAMES} as match_total FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} WHERE {MATCH_SUBSET_CONDITION} GROUP BY {MATCH_GROUPBY_NAMES} - {MATCH_HAVING_CONDITION} - ) - UNION - (SELECT {MATCH_GROUPBY_NAMES}, {MATCH_SUM_COLUMNS} - FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} - WHERE {MATCH_SUBSET_CONDITION} - GROUP BY {MATCH_GROUPBY_NAMES} - {MATCH_HAVING_CONDITION} - EXCEPT - SELECT {GROUPBY_NAMES}, {SUM_COLUMNS} - FROM 
{SCHEMA_NAME}.{TABLE_NAME} - WHERE {SUBSET_CONDITION} - GROUP BY {GROUPBY_NAMES} - {HAVING_CONDITION}) - ) a; + {MATCH_HAVING_CONDITION} ) a + GROUP BY {GROUPBY_NAMES} ) s + WHERE total <> match_total + OR (total IS NOT NULL AND match_total IS NULL) + OR (total IS NULL AND match_total IS NOT NULL); diff --git a/testgen/template/flavors/generic/exec_query_tests/ex_custom_query_generic.sql b/testgen/template/flavors/generic/exec_query_tests/ex_custom_query_generic.sql index 3a202c3..eedda15 100644 --- a/testgen/template/flavors/generic/exec_query_tests/ex_custom_query_generic.sql +++ b/testgen/template/flavors/generic/exec_query_tests/ex_custom_query_generic.sql @@ -9,8 +9,8 @@ SELECT '{PROJECT_CODE}' as project_code, '{SCHEMA_NAME}' as schema_name, '{TABLE_NAME}' as table_name, CASE - WHEN '{COLUMN_NAME_DISPLAY}' = '' OR '{COLUMN_NAME_DISPLAY}' IS NULL THEN 'N/A' - ELSE '{COLUMN_NAME_DISPLAY}' + WHEN '{COLUMN_NAME_NO_QUOTES}' = '' OR '{COLUMN_NAME_NO_QUOTES}' IS NULL THEN 'N/A' + ELSE '{COLUMN_NAME_NO_QUOTES}' END as column_names, '{SKIP_ERRORS}' as threshold_value, {SKIP_ERRORS} as skip_errors, diff --git a/testgen/template/flavors/generic/exec_query_tests/ex_data_match_2way_generic.sql b/testgen/template/flavors/generic/exec_query_tests/ex_data_match_2way_generic.sql new file mode 100644 index 0000000..9022693 --- /dev/null +++ b/testgen/template/flavors/generic/exec_query_tests/ex_data_match_2way_generic.sql @@ -0,0 +1,58 @@ +SELECT '{PROJECT_CODE}' as project_code, + '{TEST_TYPE}' as test_type, + '{TEST_DEFINITION_ID}' as test_definition_id, + '{TEST_SUITE}' as test_suite, + '{TEST_RUN_ID}' as test_run_id, + '{RUN_DATE}' as test_time, + '{START_TIME}' as starttime, + CURRENT_TIMESTAMP as endtime, + '{SCHEMA_NAME}' as schema_name, + '{TABLE_NAME}' as table_name, + '{COLUMN_NAME_NO_QUOTES}' as column_names, + '{SKIP_ERRORS}' as threshold_value, + {SKIP_ERRORS} as skip_errors, + '{INPUT_PARAMETERS}' as input_parameters, + CASE WHEN COUNT (*) > {SKIP_ERRORS} THEN 0 
ELSE 1 END as result_code, + CASE + WHEN COUNT(*) > 0 THEN + CONCAT( + CONCAT( CAST(COUNT(*) AS VARCHAR), ' error(s) identified, ' ), + CONCAT( + CASE + WHEN COUNT(*) > {SKIP_ERRORS} THEN 'exceeding limit of ' + ELSE 'within limit of ' + END, + '{SKIP_ERRORS}.' + ) + ) + ELSE 'No errors found.' + END AS result_message, + COUNT(*) as result_measure, + '{SUBSET_DISPLAY}' as subset_condition, + NULL as result_query + FROM ( + ( SELECT {GROUPBY_NAMES} + FROM {SCHEMA_NAME}.{TABLE_NAME} + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + EXCEPT + SELECT {MATCH_COLUMN_NAMES} + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} ) + UNION + ( SELECT {MATCH_COLUMN_NAMES} + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} + EXCEPT + SELECT {GROUPBY_NAMES} + FROM {SCHEMA_NAME}.{TABLE_NAME} + WHERE {SUBSET_CONDITION} + GROUP BY {GROUPBY_NAMES} + {HAVING_CONDITION} + ) + ) test; diff --git a/testgen/template/flavors/generic/exec_query_tests/ex_data_match_generic.sql b/testgen/template/flavors/generic/exec_query_tests/ex_data_match_generic.sql index 7c704c3..683e9d2 100644 --- a/testgen/template/flavors/generic/exec_query_tests/ex_data_match_generic.sql +++ b/testgen/template/flavors/generic/exec_query_tests/ex_data_match_generic.sql @@ -1,38 +1,44 @@ -SELECT '{PROJECT_CODE}' as project_code, '{TEST_TYPE}' as test_type, +SELECT '{PROJECT_CODE}' as project_code, + '{TEST_TYPE}' as test_type, '{TEST_DEFINITION_ID}' as test_definition_id, - '{TEST_SUITE}' as test_suite, - '{RUN_DATE}' as test_time, '{START_TIME}' as starttime, CURRENT_TIMESTAMP as endtime, - '{SCHEMA_NAME}' as schema_name, '{TABLE_NAME}' as table_name, '{COLUMN_NAME}' as column_names, - '{SKIP_ERRORS}' as skip_errors, - 'match_schema_name = {MATCH_SCHEMA_NAME}, match_table_name = {MATCH_TABLE_NAME}, match_column_names = 
{MATCH_COLUMN_NAMES}, match_subset_condition = {MATCH_SUBSET_CONDITION}, test_mode = {MODE}' - as input_parameters, - CASE WHEN COUNT(*) > COALESCE('{SKIP_ERRORS}', 0) THEN 0 ELSE 1 END as result_code, - CONCAT( - CONCAT( 'Mismatched values: ', CAST( COALESCE(COUNT(*), 0) AS VARCHAR) ), - CONCAT( ', Threshold: ', - CONCAT( CAST(COALESCE('{SKIP_ERRORS}', 0) AS VARCHAR), '.') - ) - ) AS result_message, + '{TEST_SUITE}' as test_suite, + '{TEST_RUN_ID}' as test_run_id, + '{RUN_DATE}' as test_time, + '{START_TIME}' as starttime, + CURRENT_TIMESTAMP as endtime, + '{SCHEMA_NAME}' as schema_name, + '{TABLE_NAME}' as table_name, + '{COLUMN_NAME_NO_QUOTES}' as column_names, + '{SKIP_ERRORS}' as threshold_value, + {SKIP_ERRORS} as skip_errors, + '{INPUT_PARAMETERS}' as input_parameters, + CASE WHEN COUNT (*) > {SKIP_ERRORS} THEN 0 ELSE 1 END as result_code, + CASE + WHEN COUNT(*) > 0 THEN + CONCAT( + CONCAT( CAST(COUNT(*) AS VARCHAR), ' error(s) identified, ' ), + CONCAT( + CASE + WHEN COUNT(*) > {SKIP_ERRORS} THEN 'exceeding limit of ' + ELSE 'within limit of ' + END, + '{SKIP_ERRORS}.' + ) + ) + ELSE 'No errors found.' 
+ END AS result_message, COUNT(*) as result_measure, - '{TEST_ACTION}' as test_action, - '{SUBSET_CONDITION}' as subset_condition, - NULL as result_query, - '{TEST_DESCRIPTION}' as test_description - FROM ( - ( SELECT {COLUMN_NAME} - FROM {SCHEMA_NAME}.{TABLE_NAME} - WHERE {SUBSET_CONDITION} - EXCEPT - SELECT {MATCH_COLUMN_NAMES} - FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} - WHERE {MATCH_SUBSET_CONDITION} - ) - UNION - ( SELECT {MATCH_COLUMN_NAMES} - FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} - WHERE {MATCH_SUBSET_CONDITION} - EXCEPT - SELECT {COLUMN_NAME} - FROM {SCHEMA_NAME}.{TABLE_NAME} - WHERE {SUBSET_CONDITION} ) + '{SUBSET_DISPLAY}' as subset_condition, + NULL as result_query + FROM ( SELECT {COLUMN_NAME_NO_QUOTES} + FROM {SCHEMA_NAME}.{TABLE_NAME} + WHERE {SUBSET_CONDITION} + GROUP BY {COLUMN_NAME_NO_QUOTES} + {HAVING_CONDITION} + EXCEPT + SELECT {MATCH_GROUPBY_NAMES} + FROM {MATCH_SCHEMA_NAME}.{MATCH_TABLE_NAME} + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} + {MATCH_HAVING_CONDITION} ) test; diff --git a/testgen/template/flavors/generic/exec_query_tests/ex_relative_entropy_generic.sql b/testgen/template/flavors/generic/exec_query_tests/ex_relative_entropy_generic.sql index 56d3d00..b14c654 100644 --- a/testgen/template/flavors/generic/exec_query_tests/ex_relative_entropy_generic.sql +++ b/testgen/template/flavors/generic/exec_query_tests/ex_relative_entropy_generic.sql @@ -5,7 +5,7 @@ -- 0.5 * kl_divergence(p, m) + 0.5 * kl_divergence(q, m) -- Log base 2 of x = LN(x)/LN(2) WITH latest_ver - AS ( SELECT COALESCE({GROUPBY_NAMES}, '') as category, + AS ( SELECT COALESCE({COLUMN_NAME_NO_QUOTES}, '') as category, COUNT(*)::FLOAT / SUM(COUNT(*)) OVER ()::FLOAT AS pct_of_total FROM {SCHEMA_NAME}.{TABLE_NAME} v1 WHERE {SUBSET_CONDITION} @@ -25,23 +25,28 @@ dataset FROM latest_ver l FULL JOIN older_ver o ON (l.category = o.category) ) -SELECT '{PROJECT_CODE}' as project_code, '{TEST_TYPE}' as test_type, +SELECT '{PROJECT_CODE}' as 
project_code, + '{TEST_TYPE}' as test_type, '{TEST_DEFINITION_ID}' as test_definition_id, '{TEST_SUITE}' as test_suite, - '{RUN_DATE}' as test_time, '{START_TIME}' as starttime, GETDATE() as endtime, - '{SCHEMA_NAME}' as schema_name, '{TABLE_NAME}' as table_name, '{GROUPBY_NAMES}' as column_names, + '{TEST_RUN_ID}' as test_run_id, + '{RUN_DATE}' as test_time, + '{START_TIME}' as starttime, + CURRENT_TIMESTAMP as endtime, + '{SCHEMA_NAME}' as schema_name, + '{TABLE_NAME}' as table_name, + '{COLUMN_NAME_NO_QUOTES}' as column_names, +-- '{GROUPBY_NAMES}' as column_names, + '{THRESHOLD_VALUE}' as threshold_value, NULL as skip_errors, - 'schema_name = {SCHEMA_NAME}, matching_schema = {MATCH_SCHEMA_NAME}, table_name = {TABLE_NAME}, column_names = {GROUPBY_NAMES}, subset_condition = {SUBSET_CONDITION}' - as input_parameters, - CASE WHEN js_divergence <= {THRESHOLD_VALUE} THEN 0 ELSE 1 END as result_code, + '{INPUT_PARAMETERS}' as input_parameters, + CASE WHEN js_divergence > {THRESHOLD_VALUE} THEN 0 ELSE 1 END as result_code, CONCAT('Divergence Level: ', CONCAT(CAST(js_divergence AS VARCHAR), ', Threshold: {THRESHOLD_VALUE}.')) as result_message, - COUNT(*) as result_measure, - '{TEST_ACTION}' as test_action, - '{SUBSET_CONDITION}' as subset_condition, - NULL as result_query, - '{TEST_DESCRIPTION}' as test_description + js_divergence as result_measure, + '{SUBSET_DISPLAY}' as subset_condition, + NULL as result_query FROM ( SELECT 0.5 * ABS(SUM(new_pct * LN(new_pct/avg_pct)/LN(2))) + 0.5 * ABS(SUM(old_pct * LN(old_pct/avg_pct)/LN(2))) as js_divergence diff --git a/testgen/template/flavors/generic/exec_query_tests/ex_window_match_no_drops_generic.sql b/testgen/template/flavors/generic/exec_query_tests/ex_window_match_no_drops_generic.sql index 6abfc92..92d9554 100644 --- a/testgen/template/flavors/generic/exec_query_tests/ex_window_match_no_drops_generic.sql +++ b/testgen/template/flavors/generic/exec_query_tests/ex_window_match_no_drops_generic.sql @@ -1,33 +1,46 @@ 
-SELECT '{PROJECT_CODE}' as project_code, '{TEST_TYPE}' as test_type, +SELECT '{PROJECT_CODE}' as project_code, + '{TEST_TYPE}' as test_type, '{TEST_DEFINITION_ID}' as test_definition_id, - '{TEST_SUITE}' as test_suite, - '{RUN_DATE}' as test_time, '{START_TIME}' as starttime, CURRENT_TIMESTAMP as endtime, - '{SCHEMA_NAME}' as schema_name, '{TABLE_NAME}' as table_name, '{COLUMN_NAME}' as column_names, + '{TEST_SUITE}' as test_suite, + '{TEST_RUN_ID}' as test_run_id, + '{RUN_DATE}' as test_time, + '{START_TIME}' as starttime, + CURRENT_TIMESTAMP as endtime, + '{SCHEMA_NAME}' as schema_name, + '{TABLE_NAME}' as table_name, + '{COLUMN_NAME_NO_QUOTES}' as column_names, + '{SKIP_ERRORS}' as threshold_value, {SKIP_ERRORS} as skip_errors, - 'schema_name = {SCHEMA_NAME}, table_name = {TABLE_NAME}, column_name = {COLUMN_NAME}, subset_condition = {SUBSET_CONDITION}, window_date_column = {WINDOW_DATE_COLUMN}, window_days = {WINDOW_DAYS}, mode = {MODE}' - as input_parameters, - CASE WHEN COUNT(*) > COALESCE(skip_errors, 0) THEN 0 ELSE 1 END as result_code, - CONCAT( - CONCAT( 'Mismatched values: ', CAST( COALESCE(COUNT(*), 0) AS VARCHAR) ), - CONCAT( ', Threshold: ', - CONCAT( CAST(COALESCE(skip_errors, 0) AS VARCHAR), '.') - ) - ) AS result_message, + '{INPUT_PARAMETERS}' as input_parameters, + CASE WHEN COUNT (*) > {SKIP_ERRORS} THEN 0 ELSE 1 END as result_code, + CASE + WHEN COUNT(*) > 0 THEN + CONCAT( + CONCAT( CAST(COUNT(*) AS VARCHAR), ' error(s) identified, ' ), + CONCAT( + CASE + WHEN COUNT(*) > {SKIP_ERRORS} THEN 'exceeding limit of ' + ELSE 'within limit of ' + END, + '{SKIP_ERRORS}.' + ) + ) + ELSE 'No errors found.' 
+ END AS result_message, COUNT(*) as result_measure, - '{TEST_ACTION}' as test_action, - '{SUBSET_CONDITION}' as subset_condition, - NULL as result_query, - '{TEST_DESCRIPTION}' as test_description - FROM ( - SELECT {COLUMN_NAME} - FROM {SCHEMA_NAME}.{TABLE_NAME} - WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} BETWEEN (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}) - 2 * {WINDOW_DAYS} - AND (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}) - {WINDOW_DAYS} -EXCEPT -SELECT {COLUMN_NAME} - FROM {SCHEMA_NAME}.{TABLE_NAME} - WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} BETWEEN (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}) - {WINDOW_DAYS} - AND (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}) - ) TEST; + '{SUBSET_DISPLAY}' as subset_condition, + NULL as result_query +FROM ( + SELECT {COLUMN_NAME_NO_QUOTES} + FROM {SCHEMA_NAME}.{TABLE_NAME} + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}) - 2 * {WINDOW_DAYS} + AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}) - {WINDOW_DAYS} + GROUP BY {COLUMN_NAME_NO_QUOTES} + EXCEPT + SELECT {COLUMN_NAME_NO_QUOTES} + FROM {SCHEMA_NAME}.{TABLE_NAME} + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}) - {WINDOW_DAYS} + GROUP BY {COLUMN_NAME_NO_QUOTES} + ) test; diff --git a/testgen/template/flavors/generic/exec_query_tests/ex_window_match_same_generic.sql b/testgen/template/flavors/generic/exec_query_tests/ex_window_match_same_generic.sql index dca387c..202ebba 100644 --- a/testgen/template/flavors/generic/exec_query_tests/ex_window_match_same_generic.sql +++ b/testgen/template/flavors/generic/exec_query_tests/ex_window_match_same_generic.sql @@ -1,49 +1,59 @@ -SELECT '{PROJECT_CODE}' as project_code, '{TEST_TYPE}' as test_type, +SELECT '{PROJECT_CODE}' as 
project_code, + '{TEST_TYPE}' as test_type, '{TEST_DEFINITION_ID}' as test_definition_id, - '{TEST_SUITE}' as test_suite, - '{RUN_DATE}' as test_time, '{START_TIME}' as starttime, CURRENT_TIMESTAMP as endtime, - '{SCHEMA_NAME}' as schema_name, '{TABLE_NAME}' as table_name, '{COLUMN_NAME}' as column_names, + '{TEST_SUITE}' as test_suite, + '{TEST_RUN_ID}' as test_run_id, + '{RUN_DATE}' as test_time, + '{START_TIME}' as starttime, + CURRENT_TIMESTAMP as endtime, + '{SCHEMA_NAME}' as schema_name, + '{TABLE_NAME}' as table_name, + '{COLUMN_NAME_NO_QUOTES}' as column_names, + '{SKIP_ERRORS}' as threshold_value, {SKIP_ERRORS} as skip_errors, - 'schema_name = {SCHEMA_NAME}, table_name = {TABLE_NAME}, column_name = {COLUMN_NAME}, subset_condition = {SUBSET_CONDITION}, window_date_column = {WINDOW_DATE_COLUMN},window_days = {WINDOW_DAYS}, mode = {MODE}' - as input_parameters, - CASE WHEN COUNT(*) > COALESCE(skip_errors, 0) THEN 0 ELSE 1 END as result_code, - CONCAT( - CONCAT( 'Mismatched values: ', CAST( COALESCE(COUNT(*), 0) AS VARCHAR) ), - CONCAT( ', Threshold: ', - CONCAT( CAST(COALESCE(skip_errors, 0) AS VARCHAR), '.') - ) - ) AS result_message, + '{INPUT_PARAMETERS}' as input_parameters, + CASE WHEN COUNT (*) > {SKIP_ERRORS} THEN 0 ELSE 1 END as result_code, + CASE + WHEN COUNT(*) > 0 THEN + CONCAT( + CONCAT( CAST(COUNT(*) AS VARCHAR), ' error(s) identified, ' ), + CONCAT( + CASE + WHEN COUNT(*) > {SKIP_ERRORS} THEN 'exceeding limit of ' + ELSE 'within limit of ' + END, + '{SKIP_ERRORS}.' + ) + ) + ELSE 'No errors found.' 
+ END AS result_message, COUNT(*) as result_measure, - '{TEST_ACTION}' as test_action, - '{SUBSET_CONDITION}' as subset_condition, - NULL as result_query, - '{TEST_DESCRIPTION}' as test_description + '{SUBSET_DISPLAY}' as subset_condition, + NULL as result_query FROM ( ( -SELECT {COLUMN_NAME} +SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME} FROM {SCHEMA_NAME}.{TABLE_NAME} WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} BETWEEN (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}) - {WINDOW_DAYS} - AND (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}) + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}) - {WINDOW_DAYS} EXCEPT -SELECT {COLUMN_NAME} +SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME} FROM {SCHEMA_NAME}.{TABLE_NAME} WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} BETWEEN (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}) - 2 * {WINDOW_DAYS} - AND (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}) - {WINDOW_DAYS} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}) - 2 * {WINDOW_DAYS} + AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}) - {WINDOW_DAYS} ) -UNION +UNION ALL ( -SELECT {COLUMN_NAME} +SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME} FROM {SCHEMA_NAME}.{TABLE_NAME} WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} BETWEEN (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}) - 2 * {WINDOW_DAYS} - AND (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}) - {WINDOW_DAYS} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}) - 2 * {WINDOW_DAYS} + AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}) - {WINDOW_DAYS} EXCEPT -SELECT {COLUMN_NAME} +SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME} FROM 
{SCHEMA_NAME}.{TABLE_NAME} WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} BETWEEN (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}) - {WINDOW_DAYS} - AND (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}) + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}) - {WINDOW_DAYS} ) ) test; diff --git a/testgen/template/generation/gen_delete_old_tests.sql b/testgen/template/generation/gen_delete_old_tests.sql index a0e9bdf..67b38b4 100644 --- a/testgen/template/generation/gen_delete_old_tests.sql +++ b/testgen/template/generation/gen_delete_old_tests.sql @@ -1,11 +1,5 @@ DELETE FROM test_definitions - WHERE id IN ( - SELECT d.id - FROM test_definitions d - INNER JOIN test_types t - ON (d.test_type = t.test_type - AND 'CAT' = t.run_type) - WHERE d.table_groups_id = '{TABLE_GROUPS_ID}'::UUID - AND d.test_suite = '{TEST_SUITE}' - AND t.selection_criteria IS NOT NULL - AND COALESCE(d.lock_refresh, 'N') <> 'Y' ); + WHERE table_groups_id = '{TABLE_GROUPS_ID}'::UUID + AND test_suite = '{TEST_SUITE}' + AND last_auto_gen_date IS NOT NULL + AND COALESCE(lock_refresh, 'N') <> 'Y'; diff --git a/testgen/template/generation/gen_standard_test_type_list.sql b/testgen/template/generation/gen_standard_test_type_list.sql index 307b500..11b7d17 100644 --- a/testgen/template/generation/gen_standard_test_type_list.sql +++ b/testgen/template/generation/gen_standard_test_type_list.sql @@ -7,7 +7,7 @@ LEFT JOIN generation_sets s ON (t.test_type = s.test_type AND '{GENERATION_SET}' = s.generation_set) WHERE t.active = 'Y' - AND t.selection_criteria IS NOT NULL + AND t.selection_criteria <> 'TEMPLATE' -- Also excludes NULL AND (s.generation_set IS NOT NULL OR '{GENERATION_SET}' = '') ORDER BY test_type; diff --git a/testgen/template/profiling/functional_datatype.sql b/testgen/template/profiling/functional_datatype.sql index 08adb66..fd4775e 100644 --- a/testgen/template/profiling/functional_datatype.sql +++ 
b/testgen/template/profiling/functional_datatype.sql @@ -225,7 +225,7 @@ UPDATE profile_results SET functional_data_type = 'Person Name' WHERE profile_run_id = '{PROFILE_RUN_ID}' AND functional_data_type IS NULL - AND column_name ~ '^(approver|first|last|full|contact|emp|employee|hcp|manager|mgr_|middle|nick|person|preferred|rep|reviewer|salesperson|spouse)(_| |)name$'; + AND column_name ~ '^(approver|first|last|full|contact|emp|employee|hcp|manager|mgr_|middle|nick|party|person|preferred|rep|reviewer|salesperson|spouse)(_| |)name$'; UPDATE profile_results SET functional_data_type = 'Entity Name' @@ -356,6 +356,24 @@ SET functional_data_type = WHERE profile_run_id = '{PROFILE_RUN_ID}' AND functional_data_type IS NULL; +-- Assign City +UPDATE profile_results + SET functional_data_type = 'City' + FROM ( SELECT p.id + FROM profile_results p + LEFT JOIN profile_results pn + ON p.profile_run_id = pn.profile_run_id + AND p.table_name = pn.table_name + AND p.position = pn.position - 1 + WHERE p.profile_run_id = '{PROFILE_RUN_ID}' + AND p.includes_digit_ct::FLOAT/NULLIF(p.value_ct,0)::FLOAT < 0.05 + AND p.numeric_ct::FLOAT/NULLIF(p.value_ct,0)::FLOAT < 0.05 + AND p.date_ct::FLOAT/NULLIF(p.value_ct,0)::FLOAT < 0.05 + AND pn.functional_data_type = 'State' + AND p.avg_length BETWEEN 7 AND 12 + AND p.avg_embedded_spaces < 1 + AND p.distinct_value_ct BETWEEN 15 AND 40000 ) c +WHERE profile_results.id = c.id; -- 7. 
Assign 'ID-Unique' functional data type to the columns that are identity columns diff --git a/testgen/ui/bootstrap.py b/testgen/ui/bootstrap.py index 89dea60..bb755e3 100644 --- a/testgen/ui/bootstrap.py +++ b/testgen/ui/bootstrap.py @@ -3,6 +3,8 @@ import inspect import logging +import streamlit + from testgen import settings from testgen.commands.run_upgrade_db_config import get_schema_revision from testgen.common import configure_logging, docker_service @@ -53,7 +55,7 @@ def __init__(self, router: Router, menu: Menu, logger: logging.Logger) -> None: def get_version(self) -> Version: return Version( current=settings.VERSION, - latest=docker_service.check_for_new_docker_release(), + latest=check_for_upgrade(), schema=_get_schema_rev(), ) @@ -97,8 +99,15 @@ def run(log_level: int = logging.INFO) -> Application: ) +@streamlit.cache_resource(show_spinner=False) def _get_schema_rev() -> str: revision = session.sb_schema_rev if not revision: revision = session.sb_schema_rev = get_schema_revision() return revision + + +@streamlit.cache_resource(show_spinner=False) +def check_for_upgrade(): + return docker_service.check_for_new_docker_release() + diff --git a/testgen/ui/navigation/router.py b/testgen/ui/navigation/router.py index 6a2e240..7df43be 100644 --- a/testgen/ui/navigation/router.py +++ b/testgen/ui/navigation/router.py @@ -38,6 +38,8 @@ def navigate(self, /, to: str, with_args: dict | None = None) -> None: try: route = self._routes[to] + bc_source = route(self).path + for guard in route.can_activate or []: can_activate = guard() if type(can_activate) == str: @@ -51,11 +53,11 @@ def navigate(self, /, to: str, with_args: dict | None = None) -> None: self.active.render(**(with_args or {})) except KeyError as k: - error_message = f"Navigation Exception: {k!s}" + error_message = f"{bc_source}: {k!s}" st.error(error_message) logger.exception(error_message) return self.navigate(to=self._default.path, with_args=with_args) except Exception as e: - error_message = 
f"Navigation Exception: {e!s}" + error_message = f"{bc_source}: {e!s}" st.error(error_message) logger.exception(error_message) diff --git a/testgen/ui/queries/test_definition_queries.py b/testgen/ui/queries/test_definition_queries.py index b56b0d9..7e68d5f 100644 --- a/testgen/ui/queries/test_definition_queries.py +++ b/testgen/ui/queries/test_definition_queries.py @@ -142,7 +142,7 @@ def update(schema, test_definition): subset_condition = NULLIF($${test_definition["subset_condition"]}$$, ''), groupby_names = NULLIF($${test_definition["groupby_names"]}$$, ''), having_condition = NULLIF($${test_definition["having_condition"]}$$, ''), - window_date_column = NULLIF($${test_definition["window_date_column"]}$$, ''), + window_date_column = NULLIF('{test_definition["window_date_column"]}', ''), match_schema_name = NULLIF('{test_definition["match_schema_name"]}', ''), match_table_name = NULLIF('{test_definition["match_table_name"]}', ''), match_column_names = NULLIF($${test_definition["match_column_names"]}$$, ''), @@ -232,22 +232,22 @@ def add(schema, test_definition): NULLIF('{test_definition["check_result"]}', '') as check_result, NULLIF('{test_definition["baseline_ct"]}', '') as baseline_ct, NULLIF('{test_definition["baseline_unique_ct"]}', '') as baseline_unique_ct, - NULLIF('{test_definition["baseline_value"]}', '') as baseline_value, - NULLIF('{test_definition["baseline_value_ct"]}', '') as baseline_value_ct, - NULLIF('{test_definition["threshold_value"]}', '') as threshold_value, - NULLIF('{test_definition["baseline_sum"]}', '') as baseline_sum, + NULLIF($${test_definition["baseline_value"]}$$, '') as baseline_value, + NULLIF($${test_definition["baseline_value_ct"]}$$, '') as baseline_value_ct, + NULLIF($${test_definition["threshold_value"]}$$, '') as threshold_value, + NULLIF($${test_definition["baseline_sum"]}$$, '') as baseline_sum, NULLIF('{test_definition["baseline_avg"]}', '') as baseline_avg, NULLIF('{test_definition["baseline_sd"]}', '') as baseline_sd, - 
NULLIF('{test_definition["subset_condition"]}', '') as subset_condition, - NULLIF('{test_definition["groupby_names"]}', '') as groupby_names, - NULLIF('{test_definition["having_condition"]}', '') as having_condition, + NULLIF($${test_definition["subset_condition"]}$$, '') as subset_condition, + NULLIF($${test_definition["groupby_names"]}$$, '') as groupby_names, + NULLIF($${test_definition["having_condition"]}$$, '') as having_condition, NULLIF('{test_definition["window_date_column"]}', '') as window_date_column, NULLIF('{test_definition["match_schema_name"]}', '') as match_schema_name, NULLIF('{test_definition["match_table_name"]}', '') as match_table_name, - NULLIF('{test_definition["match_column_names"]}', '') as match_column_names, - NULLIF('{test_definition["match_subset_condition"]}', '') as match_subset_condition, - NULLIF('{test_definition["match_groupby_names"]}', '') as match_groupby_names, - NULLIF('{test_definition["match_having_condition"]}', '') as match_having_condition, + NULLIF($${test_definition["match_column_names"]}$$, '') as match_column_names, + NULLIF($${test_definition["match_subset_condition"]}$$, '') as match_subset_condition, + NULLIF($${test_definition["match_groupby_names"]}$$, '') as match_groupby_names, + NULLIF($${test_definition["match_having_condition"]}$$, '') as match_having_condition, COALESCE({test_definition["window_days"]}, 0) as window_days ; """ diff --git a/testgen/ui/services/query_service.py b/testgen/ui/services/query_service.py index 46650a3..0fecadd 100644 --- a/testgen/ui/services/query_service.py +++ b/testgen/ui/services/query_service.py @@ -39,21 +39,37 @@ def get_project_by_code(schema: str, project_code: str): return results.iloc[0] -def run_test_type_lookup_query(str_schema, str_test_type=None): +def run_test_type_lookup_query(str_schema, str_test_type=None, boo_show_referential=True, boo_show_table=True, + boo_show_column=True, boo_show_custom=True): if str_test_type: str_criteria = f" AND tt.test_type = 
'{str_test_type}'" else: str_criteria = "" + + if (boo_show_referential and boo_show_table and boo_show_column and boo_show_custom) == False: + str_scopes = "" + str_scopes += "'referential'," if boo_show_referential else "" + str_scopes += "'table'," if boo_show_table else "" + str_scopes += "'column'," if boo_show_column else "" + str_scopes += "'custom'," if boo_show_custom else "" + if str_scopes > "": + str_criteria += f"AND tt.test_scope in ({str_scopes[:-1]})" + str_sql = f""" SELECT tt.id, tt.test_type, tt.id as cat_test_id, tt.test_name_short, tt.test_name_long, tt.test_description, tt.measure_uom, COALESCE(tt.measure_uom_description, '') as measure_uom_description, tt.default_parm_columns, tt.default_severity, - tt.run_type, tt.test_scope, tt.dq_dimension, tt.threshold_description, tt.default_parm_prompts, - tt.default_parm_help, tt.usage_notes + tt.run_type, tt.test_scope, tt.dq_dimension, tt.threshold_description, + tt.column_name_prompt, tt.column_name_help, + tt.default_parm_prompts, tt.default_parm_help, tt.usage_notes, + CASE tt.test_scope WHEN 'referential' THEN '⧉ ' WHEN 'custom' THEN '⛭ ' WHEN 'table' THEN '⊞ ' WHEN 'column' THEN '≣ ' ELSE '? 
' END + || tt.test_name_short || ': ' || lower(tt.test_name_long) + || CASE WHEN tt.selection_criteria > '' THEN ' [auto-generated]' ELSE '' END as select_name FROM {str_schema}.test_types tt WHERE tt.active = 'Y' {str_criteria} - ORDER BY tt.test_name_short; + ORDER BY CASE tt.test_scope WHEN 'referential' THEN 1 WHEN 'custom' THEN 2 WHEN 'table' THEN 3 WHEN 'column' THEN 4 ELSE 5 END, + tt.test_name_short; """ return db.retrieve_data(str_sql) diff --git a/testgen/ui/views/app_log_modal.py b/testgen/ui/views/app_log_modal.py new file mode 100644 index 0000000..a6893e5 --- /dev/null +++ b/testgen/ui/views/app_log_modal.py @@ -0,0 +1,89 @@ +import logging +import re +from datetime import date, datetime + +import streamlit as st + +import testgen.ui.services.form_service as fm +from testgen.common import display_service +from testgen.ui.components import widgets as testgen + +logger = logging.getLogger("testgen.ui") + + +# Read the log file +@st.cache_data +def _read_log(file_path): + try: + with open(file_path) as file: + log_data = file.readlines() + return log_data # NOQA TRY300 + + except Exception: + st.warning(f"Log file is unavailable: {file_path}") + logger.debug(f"Log viewer can't read log file {file_path}") + + +# Function to filter log data by date +def _filter_by_date(log_data, start_date, end_date): + filtered_data = [] + for line in log_data: + # Assuming the log line starts with a date in the format 'YYYY-MM-DD' + match = re.match(r"^(\d{4}-\d{2}-\d{2})", line) + if match: + log_date = datetime.strptime(match.group(1), "%Y-%m-%d") + if start_date <= log_date <= end_date: + filtered_data.append(line) + return filtered_data + +# Function to search text in log data +def _search_text(log_data, search_query): + return [line for line in log_data if search_query in line] + + +def view_log_file(button_container): + log_file_modal = testgen.Modal(title=None, key="dk-view-log-modal", max_width=1100) + + with button_container: + if st.button( + "Troubleshooting 
→", help="Open and review TestGen Log files", use_container_width=True + ): + log_file_modal.open() + + if log_file_modal.is_open(): + with log_file_modal.container(): + fm.render_modal_header("TestGen App Log", None, "Review/Troubleshoot daily log files") + + _, file_out_path = display_service.get_in_out_paths() + + col1, col2, col3 = st.columns([33, 33, 33]) + log_date = col1.date_input("Log Date", value=datetime.today()) + + if log_date == date.today(): + file_name = "app.log" + else: + file_name = f"app.log.{log_date.strftime('%Y-%m-%d')}" + + # log_file_location = os.path.join(file_out_path, file_name) + log_file_location = f"/var/log/testgen/{file_name}" + log_data = _read_log(log_file_location) + + search_query = col2.text_input("Filter by Text") + if search_query: + show_data = _search_text(log_data, search_query) + else: + show_data = log_data + + # Refresh button + col3.write(" \n ") + if col3.button("Refresh"): + # Clear cache to refresh the log data + st.cache_data.clear() + + if log_data: + st.markdown(f"**Log File:** {log_file_location}") + # TOO SLOW: st.code(body=''.join(show_data), language="log", line_numbers=True) + st.text_area("Log Data", value="".join(show_data), height=400) + + # Download button + st.download_button("Download", data="".join(show_data), file_name=file_name) diff --git a/testgen/ui/views/project_settings.py b/testgen/ui/views/project_settings.py index fac9971..38ba7e2 100644 --- a/testgen/ui/views/project_settings.py +++ b/testgen/ui/views/project_settings.py @@ -7,6 +7,7 @@ from testgen.ui.navigation.page import Page from testgen.ui.services import form_service, query_service from testgen.ui.session import session +from testgen.ui.views.app_log_modal import view_log_file class ProjectSettingsPage(Page): @@ -38,7 +39,7 @@ def render(self) -> None: form_unique_key="project-settings", ) - _, col2 = st.columns([70, 30]) + _, col2, col3 = st.columns([50, 25, 25]) if col2.button("Test Observability Connection", 
use_container_width=False): status = st.empty() status.info("Testing your connection to DataKitchen Observability...") @@ -55,6 +56,8 @@ def render(self) -> None: error_message = e.args[0] st.text_area("Error Details", value=error_message) + view_log_file(col3) + @st.cache_data(show_spinner=False) def get_current_project(code: str): diff --git a/testgen/ui/views/test_definitions.py b/testgen/ui/views/test_definitions.py index f49cdd5..ff866d4 100644 --- a/testgen/ui/views/test_definitions.py +++ b/testgen/ui/views/test_definitions.py @@ -60,7 +60,7 @@ def render(self, **_) -> None: # Prompt for Table Group with tool_bar.long_slots[1]: - str_table_groups_id, str_connection_id, _, table_group = prompt_for_table_group( + str_table_groups_id, str_connection_id, str_schema, table_group = prompt_for_table_group( session.project, table_group, str_connection_id ) @@ -268,7 +268,7 @@ def show_add_edit_modal( # run type run_type = selected_test_type_row["run_type"] # Can be "QUERY" or "CAT" - test_scope = selected_test_type_row["test_scope"] # Can be "column", "table", "multi-column", "custom" + test_scope = selected_test_type_row["test_scope"] # Can be "column", "table", "referential", "custom" # test_description test_description = empty_if_null(selected_test_def["test_description"]) if mode == "edit" else "" @@ -297,7 +297,6 @@ def show_add_edit_modal( profile_run_id = selected_test_def["profile_run_id"] if mode == "edit" else "" test_suite_name = selected_test_def["test_suite"] if mode == "edit" else test_suite["test_suite"] test_action = empty_if_null(selected_test_def["test_action"]) if mode == "edit" else "" - test_mode = empty_if_null(selected_test_def["test_mode"]) if mode == "edit" else "" schema_name = selected_test_def["schema_name"] if mode == "edit" else table_group["table_group_schema"] table_name = empty_if_null(selected_test_def["table_name"]) if mode == "edit" else empty_if_null(str_table_name) skip_errors = selected_test_def["skip_errors"] if mode == "edit" 
else 0 @@ -328,6 +327,7 @@ def show_add_edit_modal( match_groupby_names = empty_if_null(selected_test_def["match_groupby_names"]) if mode == "edit" else "" match_having_condition = empty_if_null(selected_test_def["match_having_condition"]) if mode == "edit" else "" window_days = selected_test_def["window_days"] if mode == "edit" and selected_test_def["window_days"] else 0 + test_mode = empty_if_null(selected_test_def["test_mode"]) if mode == "edit" else "" # export_to_observability test_suite_export_to_observability = test_suite["export_to_observability"] @@ -470,10 +470,21 @@ def show_add_edit_modal( ) # column_name + column_name_label = selected_test_type_row["column_name_prompt"] + column_name_help = selected_test_type_row["column_name_help"] + if test_scope == "table": test_definition["column_name"] = None - elif test_scope == "multi-column": - pass # TODO: this will have to be modified to accommodate aggregate match tests + column_name_label = None + elif test_scope == "referential": + column_name_disabled = False + test_definition["column_name"] = st.text_input( + label=column_name_label, + value=column_name, + max_chars=500, + help=column_name_help, + disabled=column_name_disabled, + ) elif test_scope == "custom": if str_column_name: if mode == "add": # query add present @@ -488,16 +499,10 @@ def show_add_edit_modal( else: # query edit not-present column_name_disabled = False - column_name_help = "Specify a brief indication of scope for this test " - column_name_help += "that is unique within this Test Suite for the Table and Test Type. " - column_name_help += "This distinguishes this test from others of the same type on the same table. \n\n" - column_name_help += "Example: if you are testing whether product_code is found " - column_name_help += "in the related table called dim_products, you might say `Ref Integrity: dim_products`." 
- test_definition["column_name"] = st.text_input( - label="Test Scope", + label=column_name_label, value=column_name, - max_chars=50, + max_chars=100, help=column_name_help, disabled=column_name_disabled, ) @@ -515,11 +520,12 @@ def show_add_edit_modal( else: pass # CAT edit not-present + column_name_label = "Column Name" column_name_options = get_column_names(table_groups_id, test_definition["table_name"]) column_name_help = "Select the column to test" column_name_index = column_name_options.index(column_name) if column_name else 0 test_definition["column_name"] = st.selectbox( - label="Column Name", + label=column_name_label, options=column_name_options, index=column_name_index, help=column_name_help, @@ -535,7 +541,7 @@ def show_add_edit_modal( current_column = mid_left_column show_custom_query = False dynamic_attributes_length = len(dynamic_attributes) - dynamic_attributes_half_length = max(round(dynamic_attributes_length / 2), 1) + dynamic_attributes_half_length = max(round((dynamic_attributes_length + 0.5) / 2), 1) for i, dynamic_attribute in enumerate(dynamic_attributes): if i >= dynamic_attributes_half_length: current_column = mid_right_column @@ -554,20 +560,12 @@ def show_add_edit_modal( else snake_case_to_title_case(dynamic_attribute) ) - if dynamic_attribute in ["match_column_names", "match_groupby_names", "groupby_names"]: - test_definition[dynamic_attribute] = st.text_area( - label=actual_dynamic_attributes_labels, - value=value, - height=1, - max_chars=4000, - help=actual_dynamic_attributes_help, - ) - elif dynamic_attribute in ["custom_query"]: + if dynamic_attribute in ["custom_query"]: show_custom_query = True else: test_definition[dynamic_attribute] = current_column.text_input( label=actual_dynamic_attributes_labels, - max_chars=1000, + max_chars=4000 if dynamic_attribute in ["match_column_names", "match_groupby_names", "groupby_names"] else 1000, value=value, help=actual_dynamic_attributes_help, ) @@ -593,7 +591,7 @@ def show_add_edit_modal( # 
skip_errors if run_type == "QUERY": - test_definition["skip_errors"] = left_column.number_input(label="Skip Errors", value=skip_errors) + test_definition["skip_errors"] = left_column.number_input(label="Threshold Error Count", value=skip_errors) else: test_definition["skip_errors"] = skip_errors @@ -615,7 +613,7 @@ def show_add_edit_modal( submit = bottom_left_column.button("Save", disabled=authentication_service.current_user_has_read_role()) if submit: - if validate_form(test_scope, test_type, test_definition): + if validate_form(test_scope, test_type, test_definition, column_name_label): if mode == "edit": test_definition_service.update(test_definition) test_definition_modal.close() @@ -628,15 +626,15 @@ def show_add_edit_modal( test_definition_modal.close() -def validate_form(test_scope, test_type, test_definition): +def validate_form(test_scope, test_type, test_definition, column_name_label): if test_type == "Condition_Flag" and not test_definition["threshold_value"]: st.error("Threshold Error Count is a required field.") return False if not test_definition["test_type"]: st.error("Test Type is a required field.") return False - if test_scope in ["column", "multi-column", "custom"] and not test_definition["column_name"]: - st.error("Test Scope is a required field.") + if test_scope in ["column", "referential", "custom"] and not test_definition["column_name"]: + st.error(f"{column_name_label} is a required field.") return False return True @@ -647,10 +645,10 @@ def validate_test_definition_uniqueness(test_definition, test_scope): match test_scope: case "column": message_bit = "and Column Name " - case "multi-column": + case "referential": message_bit = "and Column Names " case "custom": - message_bit = "and Test Scope" + message_bit = "and Test Focus " case "table": message_bit = "" case _: @@ -660,12 +658,22 @@ def validate_test_definition_uniqueness(test_definition, test_scope): def prompt_for_test_type(): - df = run_test_type_lookup_query() - lst_choices = 
["(Select a Test Type)", *df["test_name_short"].tolist()] + + col0, col1, col2, col3, col4, col5 = st.columns([0.1, 0.2, 0.2, 0.2, 0.2, 0.1]) + col0.write("Show Types") + boo_show_referential = col1.checkbox(":green[⧉] Referential", True) + boo_show_table = col2.checkbox(":green[⊞] Table", True) + boo_show_column = col3.checkbox(":green[≣] Column", True) + boo_show_custom = col4.checkbox(":green[⛭] Custom", True) + + df = run_test_type_lookup_query(str_test_type=None, boo_show_referential=boo_show_referential, + boo_show_table=boo_show_table, boo_show_column=boo_show_column, + boo_show_custom=boo_show_custom) + lst_choices = ["(Select a Test Type)", *df["select_name"].tolist()] str_selected = selectbox("Test Type", lst_choices) if str_selected: - row_selected = df[df["test_name_short"] == str_selected].iloc[0] + row_selected = df[df["test_name_short"] == str_selected.split(":", 1)[0][2:]].iloc[0] str_value = row_selected["test_type"] else: str_value = None @@ -705,19 +713,19 @@ def show_test_defs_grid( "last_manual_update", ] show_column_headers = [ - "schema_name", - "table_name", - "column_name", - "test_name_short", - "test_active", - "lock_refresh", - "urgency", - "export_to_observability", - "profiling_as_of_date", - "last_manual_update", + "Schema", + "Table", + "Columns / Focus", + "Test Name", + "Active", + "Locked", + "Urgency", + "Export to Observabilty", + "Based on Profiling", + "Last Manual Update", ] - show_column_headers = list(map(snake_case_to_title_case, show_column_headers)) + # show_column_headers = list(map(snake_case_to_title_case, show_column_headers)) dct_selected_row = fm.render_grid_select( df, @@ -746,7 +754,7 @@ def show_test_defs_grid( lst_export_headers = [ "Schema", "Table Name", - "Column/Test Scope", + "Column/Test Focus", "Test Type", "Description", "Test Threshold", @@ -856,9 +864,11 @@ def run_project_lookup_query(): @st.cache_data(show_spinner=False) -def run_test_type_lookup_query(str_test_type=None): +def 
run_test_type_lookup_query(str_test_type=None, boo_show_referential=True, boo_show_table=True, + boo_show_column=True, boo_show_custom=True): str_schema = st.session_state["dbschema"] - return dq.run_test_type_lookup_query(str_schema, str_test_type) + return dq.run_test_type_lookup_query(str_schema, str_test_type, boo_show_referential, boo_show_table, + boo_show_column, boo_show_custom) @st.cache_data(show_spinner=False) diff --git a/testgen/ui/views/test_results.py b/testgen/ui/views/test_results.py index c7b39f9..e495f0c 100644 --- a/testgen/ui/views/test_results.py +++ b/testgen/ui/views/test_results.py @@ -49,7 +49,7 @@ def render(self) -> None: str_sel_test_run = None if not str_project: - st.write("Select a Project from the menu.") + st.write("Choose a Project from the menu.") else: # Setup Toolbar tool_bar = tb.ToolBar(3, 1, 4, None) @@ -206,10 +206,13 @@ def get_test_results_uncached(str_schema, str_run_id, str_sel_test_status): r.id::VARCHAR as test_result_id, r.test_run_id::VARCHAR, c.id::VARCHAR as connection_id, ts.id::VARCHAR as test_suite_id, r.test_definition_id::VARCHAR as test_definition_id_runtime, - d.id::VARCHAR as test_definition_id_current + d.id::VARCHAR as test_definition_id_current, + r.auto_gen FROM run_results r INNER JOIN {str_schema}.test_types tt ON (r.test_type = tt.test_type) + LEFT JOIN {str_schema}.test_definitions rd + ON (r.test_definition_id = rd.id) LEFT JOIN {str_schema}.test_definitions d ON (r.project_code = d.project_code AND r.test_suite = d.test_suite @@ -277,19 +280,30 @@ def get_test_result_summary(str_run_id): @st.cache_data(show_spinner=ALWAYS_SPIN) -def get_test_result_history(str_test_type, str_table_groups_id, str_table_name, str_column_names): +def get_test_result_history(str_test_type, str_table_groups_id, str_table_name, str_column_names, + str_test_definition_id, auto_gen): str_schema = st.session_state["dbschema"] - str_sql = f""" - SELECT test_date, test_type, - test_name_short, test_name_long, measure_uom, 
test_operator, - threshold_value::NUMERIC, result_measure, result_status - FROM {str_schema}.v_test_results + + if auto_gen: + str_where = f""" WHERE table_groups_id = '{str_table_groups_id}' AND table_name = '{str_table_name}' AND column_names = '{str_column_names}' AND test_type = '{str_test_type}' + """ + else: + str_where = f""" + WHERE test_definition_id_runtime = '{str_test_definition_id}' + """ + + str_sql = f""" + SELECT test_date, test_type, + test_name_short, test_name_long, measure_uom, test_operator, + threshold_value::NUMERIC, result_measure, result_status + FROM {str_schema}.v_test_results {str_where} ORDER BY test_date DESC; """ + df = db.retrieve_data(str_sql) # Clean Up df["test_date"] = pd.to_datetime(df["test_date"]) @@ -307,7 +321,12 @@ def get_test_definition_uncached(str_schema, str_test_def_id): str_sql = f""" SELECT d.id::VARCHAR, tt.test_name_short as test_name, tt.test_name_long as full_name, tt.test_description as description, tt.usage_notes, + d.column_name, d.baseline_value, d.baseline_ct, d.baseline_avg, d.baseline_sd, d.threshold_value, + d.subset_condition, d.groupby_names, d.having_condition, d.match_schema_name, + d.match_table_name, d.match_column_names, d.match_subset_condition, + d.match_groupby_names, d.match_having_condition, + d.window_date_column, d.window_days::VARCHAR as window_days, d.custom_query, d.severity, tt.default_severity, d.test_active, d.lock_refresh, d.last_manual_update @@ -359,6 +378,27 @@ def replace_parms(df_test, str_query): str_query = str_query.replace("{BASELINE_SD}", empty_if_null(df_test.at[0, "baseline_sd"])) str_query = str_query.replace("{THRESHOLD_VALUE}", empty_if_null(df_test.at[0, "threshold_value"])) + str_substitute = empty_if_null(df_test.at[0, "subset_condition"]) + str_substitute = "1=1" if str_substitute == "" else str_substitute + str_query = str_query.replace("{SUBSET_CONDITION}", str_substitute) + + str_query = str_query.replace("{GROUPBY_NAMES}", empty_if_null(df_test.at[0, 
"groupby_names"])) + str_query = str_query.replace("{HAVING_CONDITION}", empty_if_null(df_test.at[0, "having_condition"])) + str_query = str_query.replace("{MATCH_SCHEMA_NAME}", empty_if_null(df_test.at[0, "match_schema_name"])) + str_query = str_query.replace("{MATCH_TABLE_NAME}", empty_if_null(df_test.at[0, "match_table_name"])) + str_query = str_query.replace("{MATCH_COLUMN_NAMES}", empty_if_null(df_test.at[0, "match_column_names"])) + + str_substitute = empty_if_null(df_test.at[0, "match_subset_condition"]) + str_substitute = "1=1" if str_substitute == "" else str_substitute + str_query = str_query.replace("{MATCH_SUBSET_CONDITION}", str_substitute) + + str_query = str_query.replace("{MATCH_GROUPBY_NAMES}", empty_if_null(df_test.at[0, "match_groupby_names"])) + str_query = str_query.replace("{MATCH_HAVING_CONDITION}", empty_if_null(df_test.at[0, "match_having_condition"])) + str_query = str_query.replace("{COLUMN_NAME_NO_QUOTES}", empty_if_null(selected_row["column_names"])) + + str_query = str_query.replace("{WINDOW_DATE_COLUMN}", empty_if_null(df_test.at[0, "window_date_column"])) + str_query = str_query.replace("{WINDOW_DAYS}", empty_if_null(df_test.at[0, "window_days"])) + if str_query is None or str_query == "": raise ValueError("Lookup query is not defined for this Test Type.") return str_query @@ -542,7 +582,7 @@ def show_result_detail(str_run_id, str_sel_test_status, do_multi_select, export_ lst_show_headers = [ "Table Name", - "Columns/Scope", + "Columns/Focus", "Test Type", "Result Measure", "UOM", @@ -575,7 +615,7 @@ def show_result_detail(str_run_id, str_sel_test_status, do_multi_select, export_ lst_export_headers = [ "Schema Name", "Table Name", - "Columns/Scope", + "Columns/Focus", "Test Type", "Test Description", "DQ Dimension", @@ -602,6 +642,8 @@ def show_result_detail(str_run_id, str_sel_test_status, do_multi_select, export_ selected_row["table_groups_id"], selected_row["table_name"], selected_row["column_names"], + 
selected_row["test_definition_id_runtime"], + selected_row["auto_gen"] ) show_hist_columns = ["test_date", "threshold_value", "result_measure", "result_status"] diff --git a/testgen/ui/views/test_suites.py b/testgen/ui/views/test_suites.py index b13f877..aa1b9f6 100644 --- a/testgen/ui/views/test_suites.py +++ b/testgen/ui/views/test_suites.py @@ -250,7 +250,7 @@ def show_run_test_generation(modal, selected): ) if test_ct: warning_msg = "" - counts_msg = f"\n\nTests: {test_ct}, Unlocked: {unlocked_test_ct}, Edited Unlocked: {unlocked_edits_ct}" + counts_msg = f"\n\nAuto-Generated Tests: {test_ct}, Unlocked: {unlocked_test_ct}, Edited Unlocked: {unlocked_edits_ct}" if unlocked_edits_ct > 0: if unlocked_edits_ct > 1: From 7f2a375b26454913e12a5fc7df5ef797fe655e92 Mon Sep 17 00:00:00 2001 From: "Chip.Bloche" Date: Tue, 11 Jun 2024 12:09:11 -0400 Subject: [PATCH 03/22] Fixed linter issues --- testgen/commands/queries/execute_tests_query.py | 3 ++- testgen/common/clean_sql.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/testgen/commands/queries/execute_tests_query.py b/testgen/commands/queries/execute_tests_query.py index bc9fb4a..8d43098 100644 --- a/testgen/commands/queries/execute_tests_query.py +++ b/testgen/commands/queries/execute_tests_query.py @@ -1,6 +1,7 @@ import typing -from testgen.common import CleanSQL, AddQuotesToIdentifierCSV, date_service, read_template_sql_file +from testgen.common import AddQuotesToIdentifierCSV, CleanSQL, date_service, read_template_sql_file + class CTestExecutionSQL: flavor = "" diff --git a/testgen/common/clean_sql.py b/testgen/common/clean_sql.py index 7443bd6..9cb9c30 100644 --- a/testgen/common/clean_sql.py +++ b/testgen/common/clean_sql.py @@ -1,4 +1,4 @@ -__all__ = ["CleanSQL", "AddQuotesToIdentifierCSV"] +__all__ = ["AddQuotesToIdentifierCSV", "CleanSQL"] import re From 1ef612d1c0e52e899f94223b70ea89f8ea044568 Mon Sep 17 00:00:00 2001 From: "Chip.Bloche" Date: Wed, 12 Jun 2024 12:02:23 -0400 Subject: 
[PATCH 04/22] fix(mssql): fixes to distribution shift test --- .../commands/queries/execute_tests_query.py | 23 +++++-- testgen/common/clean_sql.py | 16 ++++- .../050_populate_new_schema_metadata.sql | 68 +++++++++---------- .../ex_relative_entropy_generic.sql | 8 +-- .../ex_window_match_no_drops_generic.sql | 6 +- .../ex_window_match_same_generic.sql | 12 ++-- .../ex_relative_entropy_mssql.sql | 53 +++++++++++++++ .../ex_window_match_no_drops_postgresql.sql | 46 +++++++++++++ .../ex_window_match_same_postgresql.sql | 59 ++++++++++++++++ testgen/ui/views/test_results.py | 7 +- 10 files changed, 242 insertions(+), 56 deletions(-) create mode 100644 testgen/template/flavors/mssql/exec_query_tests/ex_relative_entropy_mssql.sql create mode 100644 testgen/template/flavors/postgresql/exec_query_tests/ex_window_match_no_drops_postgresql.sql create mode 100644 testgen/template/flavors/postgresql/exec_query_tests/ex_window_match_same_postgresql.sql diff --git a/testgen/commands/queries/execute_tests_query.py b/testgen/commands/queries/execute_tests_query.py index 8d43098..69bb3a4 100644 --- a/testgen/commands/queries/execute_tests_query.py +++ b/testgen/commands/queries/execute_tests_query.py @@ -1,6 +1,6 @@ import typing -from testgen.common import AddQuotesToIdentifierCSV, CleanSQL, date_service, read_template_sql_file +from testgen.common import AddQuotesToIdentifierCSV, CleanSQL, ConcatColumnList, date_service, read_template_sql_file class CTestExecutionSQL: @@ -12,7 +12,7 @@ class CTestExecutionSQL: exception_message = "" process_id = "" - # Test Set Parameters + # Test Group Parameters dctTestParms: typing.ClassVar = {} sum_columns = "" match_sum_columns = "" @@ -47,9 +47,9 @@ def _ReplaceParms(self, strInputString: str): strInputString = strInputString.replace("{INPUT_PARAMETERS}", self._AssembleDisplayParameters()) strInputString = strInputString.replace("{RUN_DATE}", self.run_date) - strInputString = strInputString.replace("{SUM_COLUMNS}", self.sum_columns) - 
strInputString = strInputString.replace("{MATCH_SUM_COLUMNS}", self.match_sum_columns) - strInputString = strInputString.replace("{MULTI_COLUMN_ERROR_CONDITION}", self.multi_column_error_condition) + # strInputString = strInputString.replace("{SUM_COLUMNS}", self.sum_columns) + # strInputString = strInputString.replace("{MATCH_SUM_COLUMNS}", self.match_sum_columns) + # strInputString = strInputString.replace("{MULTI_COLUMN_ERROR_CONDITION}", self.multi_column_error_condition) strInputString = strInputString.replace("{EXCEPTION_MESSAGE}", self.exception_message) strInputString = strInputString.replace("{START_TIME}", self.today) strInputString = strInputString.replace("{PROCESS_ID}", str(self.process_id)) @@ -78,9 +78,17 @@ def _ReplaceParms(self, strInputString: str): if parm == "column_name": # Shows contents without double-quotes for display and aggregate expressions strInputString = strInputString.replace("{COLUMN_NAME_NO_QUOTES}", value if value else "") + # Concatenates column list into single expression for relative entropy + str_value = ConcatColumnList(value, "") + strInputString = strInputString.replace("{CONCAT_COLUMNS}", str_value if str_value else "") + if parm == "match_groupby_names": + # Concatenates column list into single expression for relative entropy + str_value = ConcatColumnList(value, "") + strInputString = strInputString.replace("{CONCAT_MATCH_GROUPBY}", str_value if str_value else "") if parm == "subset_condition": strInputString = strInputString.replace("{SUBSET_DISPLAY}", value.replace("'", "''") if value else "") + # Adding escape character where ':' is referenced strInputString = strInputString.replace(":", "\\:") @@ -138,6 +146,7 @@ def _ConstructAggregateMatchParms(self): self.list_multi_column_error_condition = [i + " < 0" for i in cols] self.multi_column_error_condition = " or ".join(self.list_multi_column_error_condition) + def GetTestQuery(self, booClean: bool): strTestType = self.dctTestParms["test_type"] strTemplate = 
self.dctTestParms["template_name"] @@ -145,8 +154,8 @@ def GetTestQuery(self, booClean: bool): if strTemplate == "": raise ValueError(f"No query template assigned to test_type {strTestType}") - if strTestType in {"AGG MATCH NO DROPS", "AGG MATCH SAME", "AGG MATCH NUM INCR"}: - self._ConstructAggregateMatchParms() + # if strTestType in {"AGG MATCH NO DROPS", "AGG MATCH SAME", "AGG MATCH NUM INCR"}: + # self._ConstructAggregateMatchParms() strQ = self._GetTestQueryFromTemplate(strTemplate) # Final replace to cover parm within CUSTOM_QUERY parm strQ = strQ.replace("{DATA_SCHEMA}", self.dctTestParms["schema_name"]) diff --git a/testgen/common/clean_sql.py b/testgen/common/clean_sql.py index 9cb9c30..8c275ac 100644 --- a/testgen/common/clean_sql.py +++ b/testgen/common/clean_sql.py @@ -1,4 +1,4 @@ -__all__ = ["AddQuotesToIdentifierCSV", "CleanSQL"] +__all__ = ["AddQuotesToIdentifierCSV", "CleanSQL", "ConcatColumnList"] import re @@ -37,3 +37,17 @@ def AddQuotesToIdentifierCSV(strInput: str) -> str: else: quoted_values.append(value) return ", ".join(quoted_values) + + +def ConcatColumnList(str_column_list, str_null_value): + # Prepares SQL expression to concatenate comma-separated column list into single SQL expression + str_expression = "" + if str_column_list: + if "," in str_column_list: + # Split each comma separated column name into individual list items + cols = [s.strip() for s in str_column_list.split(",")] + str_each = [f"COALESCE({i}, '{str_null_value}')" for i in cols] + str_expression = "CONCAT(" + ", ".join(str_each) + ")" + else: + str_expression = str_column_list + return str_expression diff --git a/testgen/template/dbsetup/050_populate_new_schema_metadata.sql b/testgen/template/dbsetup/050_populate_new_schema_metadata.sql index 57278de..87a748a 100644 --- a/testgen/template/dbsetup/050_populate_new_schema_metadata.sql +++ b/testgen/template/dbsetup/050_populate_new_schema_metadata.sql @@ -188,7 +188,7 @@ VALUES ('2001', 'Combo_Match', 'redshift', 
'ex_data_match_generic.sql'), ('2201', 'Combo_Match', 'mssql', 'ex_data_match_generic.sql'), ('2202', 'Aggregate_Minimum', 'mssql', 'ex_aggregate_match_no_drops_generic.sql'), - ('2203', 'Distribution_Shift', 'mssql', 'ex_relative_entropy_generic.sql'), + ('2203', 'Distribution_Shift', 'mssql', 'ex_relative_entropy_mssql.sql'), ('2204', 'CUSTOM', 'mssql', 'ex_custom_query_generic.sql'), ('2206', 'Aggregate_Balance', 'mssql', 'ex_aggregate_match_same_generic.sql'), ('2207', 'Timeframe_Combo_Gain', 'mssql', 'ex_window_match_no_drops_generic.sql'), @@ -200,8 +200,8 @@ VALUES ('2001', 'Combo_Match', 'redshift', 'ex_data_match_generic.sql'), ('2303', 'Distribution_Shift', 'postgresql', 'ex_relative_entropy_generic.sql'), ('2304', 'CUSTOM', 'postgresql', 'ex_custom_query_generic.sql'), ('2306', 'Aggregate_Balance', 'postgresql', 'ex_aggregate_match_same_generic.sql'), - ('2307', 'Timeframe_Combo_Gain', 'postgresql', 'ex_window_match_no_drops_generic.sql'), - ('2308', 'Timeframe_Combo_Match', 'postgresql', 'ex_window_match_same_generic.sql'), + ('2307', 'Timeframe_Combo_Gain', 'postgresql', 'ex_window_match_no_drops_postgresql.sql'), + ('2308', 'Timeframe_Combo_Match', 'postgresql', 'ex_window_match_same_postgresql.sql'), ('2309', 'Aggregate_Increase', 'postgresql', 'ex_aggregate_match_num_incr_generic.sql'); TRUNCATE TABLE cat_test_conditions; @@ -968,81 +968,81 @@ ORDER BY {COLUMN_NAME_NO_QUOTES};'), ) test ORDER BY {COLUMN_NAME_NO_QUOTES};'), ('1257', '1503', 'Test Results', 'Distribution_Shift', 'redshift', NULL, 'WITH latest_ver - AS ( SELECT COALESCE({COLUMN_NAME_NO_QUOTES}, '''') as category, + AS ( SELECT {CONCAT_COLUMNS} as category, COUNT(*)::FLOAT / SUM(COUNT(*)) OVER ()::FLOAT AS pct_of_total FROM {TARGET_SCHEMA}.{TABLE_NAME} v1 WHERE {SUBSET_CONDITION} - GROUP BY 1 ), + GROUP BY {COLUMN_NAME_NO_QUOTES} ), older_ver - AS ( SELECT COALESCE({MATCH_GROUPBY_NAMES}, '''') as category, + AS ( SELECT {CONCAT_MATCH_GROUPBY} as category, COUNT(*)::FLOAT / SUM(COUNT(*)) 
OVER ()::FLOAT AS pct_of_total FROM {MATCH_SCHEMA_NAME}.{TABLE_NAME} v2 WHERE {MATCH_SUBSET_CONDITION} - GROUP BY 1 ) + GROUP BY {MATCH_GROUPBY_NAMES} ) SELECT COALESCE(l.category, o.category) AS category, o.pct_of_total AS old_pct, l.pct_of_total AS new_pct FROM latest_ver l FULL JOIN older_ver o ON (l.category = o.category) -ORDER BY COALESCE(l.category, o.category);'), +ORDER BY COALESCE(l.category, o.category)'), ('1258', '1503', 'Test Results', 'Distribution_Shift', 'snowflake', NULL, 'WITH latest_ver - AS ( SELECT COALESCE({COLUMN_NAME_NO_QUOTES}, '''') as category, + AS ( SELECT {CONCAT_COLUMNS} as category, COUNT(*)::FLOAT / SUM(COUNT(*)) OVER ()::FLOAT AS pct_of_total FROM {TARGET_SCHEMA}.{TABLE_NAME} v1 WHERE {SUBSET_CONDITION} - GROUP BY 1 ), + GROUP BY {COLUMN_NAME_NO_QUOTES} ), older_ver - AS ( SELECT COALESCE({MATCH_GROUPBY_NAMES}, '''') as category, + AS ( SELECT {CONCAT_MATCH_GROUPBY} as category, COUNT(*)::FLOAT / SUM(COUNT(*)) OVER ()::FLOAT AS pct_of_total FROM {MATCH_SCHEMA_NAME}.{TABLE_NAME} v2 WHERE {MATCH_SUBSET_CONDITION} - GROUP BY 1 ) + GROUP BY {MATCH_GROUPBY_NAMES} ) SELECT COALESCE(l.category, o.category) AS category, o.pct_of_total AS old_pct, l.pct_of_total AS new_pct FROM latest_ver l FULL JOIN older_ver o ON (l.category = o.category) -ORDER BY COALESCE(l.category, o.category);'), +ORDER BY COALESCE(l.category, o.category)'), ('1259', '1503', 'Test Results', 'Distribution_Shift', 'mssql', NULL, 'WITH latest_ver - AS ( SELECT COALESCE({COLUMN_NAME_NO_QUOTES}, '''') as category, - COUNT(*)::FLOAT / SUM(COUNT(*)) OVER ()::FLOAT AS pct_of_total + AS ( SELECT {CONCAT_COLUMNS} as category, + CAST(COUNT(*) as FLOAT) / CAST(SUM(COUNT(*)) OVER () as FLOAT) AS pct_of_total FROM {TARGET_SCHEMA}.{TABLE_NAME} v1 WHERE {SUBSET_CONDITION} - GROUP BY 1 ), + GROUP BY {COLUMN_NAME_NO_QUOTES} ), older_ver - AS ( SELECT COALESCE({MATCH_GROUPBY_NAMES}, '''') as category, - COUNT(*)::FLOAT / SUM(COUNT(*)) OVER ()::FLOAT AS pct_of_total + AS ( SELECT 
{CONCAT_MATCH_GROUPBY} as category, + CAST(COUNT(*) as FLOAT) / CAST(SUM(COUNT(*)) OVER () as FLOAT) AS pct_of_total FROM {MATCH_SCHEMA_NAME}.{TABLE_NAME} v2 WHERE {MATCH_SUBSET_CONDITION} - GROUP BY 1 ) + GROUP BY {MATCH_GROUPBY_NAMES} ) SELECT COALESCE(l.category, o.category) AS category, o.pct_of_total AS old_pct, l.pct_of_total AS new_pct FROM latest_ver l FULL JOIN older_ver o ON (l.category = o.category) -ORDER BY COALESCE(l.category, o.category);'), +ORDER BY COALESCE(l.category, o.category)'), ('1260', '1503', 'Test Results', 'Distribution_Shift', 'postgresql', NULL, 'WITH latest_ver - AS ( SELECT COALESCE({COLUMN_NAME_NO_QUOTES}, '''') as category, + AS ( SELECT {CONCAT_COLUMNS} as category, COUNT(*)::FLOAT / SUM(COUNT(*)) OVER ()::FLOAT AS pct_of_total FROM {TARGET_SCHEMA}.{TABLE_NAME} v1 WHERE {SUBSET_CONDITION} - GROUP BY 1 ), + GROUP BY {COLUMN_NAME_NO_QUOTES} ), older_ver - AS ( SELECT COALESCE({MATCH_GROUPBY_NAMES}, '''') as category, + AS ( SELECT {CONCAT_MATCH_GROUPBY} as category, COUNT(*)::FLOAT / SUM(COUNT(*)) OVER ()::FLOAT AS pct_of_total FROM {MATCH_SCHEMA_NAME}.{TABLE_NAME} v2 WHERE {MATCH_SUBSET_CONDITION} - GROUP BY 1 ) + GROUP BY {MATCH_GROUPBY_NAMES} ) SELECT COALESCE(l.category, o.category) AS category, o.pct_of_total AS old_pct, l.pct_of_total AS new_pct FROM latest_ver l FULL JOIN older_ver o ON (l.category = o.category) -ORDER BY COALESCE(l.category, o.category);'), +ORDER BY COALESCE(l.category, o.category)'), ('1261', '1508', 'Test Results', 'Timeframe_Combo_Gain', 'redshift', NULL, 'SELECT {COLUMN_NAME_NO_QUOTES} FROM {TARGET_SCHEMA}.{TABLE_NAME} @@ -1071,14 +1071,14 @@ GROUP BY {COLUMN_NAME_NO_QUOTES}'), ('1263', '1508', 'Test Results', 'Timeframe_Combo_Gain', 'mssql', NULL, 'SELECT {COLUMN_NAME_NO_QUOTES} FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS} - AND {WINDOW_DATE_COLUMN} < (SELECT 
MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS} + AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - 2 * {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME})) + AND {WINDOW_DATE_COLUMN} < DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME})) GROUP BY {COLUMN_NAME_NO_QUOTES} EXCEPT SELECT {COLUMN_NAME_NO_QUOTES} FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS} + AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME})) GROUP BY {COLUMN_NAME_NO_QUOTES}'), ('1264', '1508', 'Test Results', 'Timeframe_Combo_Gain', 'postgresql', NULL, 'SELECT {COLUMN_NAME_NO_QUOTES} FROM {TARGET_SCHEMA}.{TABLE_NAME} @@ -1146,26 +1146,26 @@ WHERE {SUBSET_CONDITION} SELECT ''Prior Timeframe'' as missing_from, {COLUMN_NAME} FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS} + AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME})) EXCEPT SELECT ''Prior Timeframe'' as missing_from, {COLUMN_NAME} FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS} - AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS} + AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - 2 * {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME})) + AND {WINDOW_DATE_COLUMN} < DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME})) ) UNION ALL ( SELECT ''Latest Timeframe'' as missing_from, 
{COLUMN_NAME} FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - 2 * {WINDOW_DAYS} - AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS} + AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - 2 * {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME})) + AND {WINDOW_DATE_COLUMN} < DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME})) EXCEPT SELECT ''Latest Timeframe'' as missing_from, {COLUMN_NAME} FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS} + AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME})) )'), ('1268', '1509', 'Test Results', 'Timeframe_Combo_Match', 'postgresql', NULL, ' ( SELECT ''Prior Timeframe'' as missing_from, {COLUMN_NAME} diff --git a/testgen/template/flavors/generic/exec_query_tests/ex_relative_entropy_generic.sql b/testgen/template/flavors/generic/exec_query_tests/ex_relative_entropy_generic.sql index b14c654..f27b31b 100644 --- a/testgen/template/flavors/generic/exec_query_tests/ex_relative_entropy_generic.sql +++ b/testgen/template/flavors/generic/exec_query_tests/ex_relative_entropy_generic.sql @@ -5,17 +5,17 @@ -- 0.5 * kl_divergence(p, m) + 0.5 * kl_divergence(q, m) -- Log base 2 of x = LN(x)/LN(2) WITH latest_ver - AS ( SELECT COALESCE({COLUMN_NAME_NO_QUOTES}, '') as category, + AS ( SELECT {CONCAT_COLUMNS} as category, COUNT(*)::FLOAT / SUM(COUNT(*)) OVER ()::FLOAT AS pct_of_total FROM {SCHEMA_NAME}.{TABLE_NAME} v1 WHERE {SUBSET_CONDITION} - GROUP BY 1 ), + GROUP BY {COLUMN_NAME_NO_QUOTES} ), older_ver - AS ( SELECT COALESCE({MATCH_GROUPBY_NAMES}, '') as category, + AS ( SELECT {CONCAT_MATCH_GROUPBY} as 
category, COUNT(*)::FLOAT / SUM(COUNT(*)) OVER ()::FLOAT AS pct_of_total FROM {MATCH_SCHEMA_NAME}.{TABLE_NAME} v2 WHERE {MATCH_SUBSET_CONDITION} - GROUP BY 1 ), + GROUP BY {MATCH_GROUPBY_NAMES} ), dataset AS ( SELECT COALESCE(l.category, o.category) AS category, COALESCE(o.pct_of_total, 0.0000001) AS old_pct, diff --git a/testgen/template/flavors/generic/exec_query_tests/ex_window_match_no_drops_generic.sql b/testgen/template/flavors/generic/exec_query_tests/ex_window_match_no_drops_generic.sql index 92d9554..10dbda2 100644 --- a/testgen/template/flavors/generic/exec_query_tests/ex_window_match_no_drops_generic.sql +++ b/testgen/template/flavors/generic/exec_query_tests/ex_window_match_no_drops_generic.sql @@ -34,13 +34,13 @@ FROM ( SELECT {COLUMN_NAME_NO_QUOTES} FROM {SCHEMA_NAME}.{TABLE_NAME} WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}) - 2 * {WINDOW_DAYS} - AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}) - {WINDOW_DAYS} + AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - 2 * {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME})) + AND {WINDOW_DATE_COLUMN} < DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME})) GROUP BY {COLUMN_NAME_NO_QUOTES} EXCEPT SELECT {COLUMN_NAME_NO_QUOTES} FROM {SCHEMA_NAME}.{TABLE_NAME} WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}) - {WINDOW_DAYS} + AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME})) GROUP BY {COLUMN_NAME_NO_QUOTES} ) test; diff --git a/testgen/template/flavors/generic/exec_query_tests/ex_window_match_same_generic.sql b/testgen/template/flavors/generic/exec_query_tests/ex_window_match_same_generic.sql index 202ebba..4b0c2b8 100644 --- 
a/testgen/template/flavors/generic/exec_query_tests/ex_window_match_same_generic.sql +++ b/testgen/template/flavors/generic/exec_query_tests/ex_window_match_same_generic.sql @@ -35,25 +35,25 @@ SELECT '{PROJECT_CODE}' as project_code, SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME} FROM {SCHEMA_NAME}.{TABLE_NAME} WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}) - {WINDOW_DAYS} + AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME})) EXCEPT SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME} FROM {SCHEMA_NAME}.{TABLE_NAME} WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}) - 2 * {WINDOW_DAYS} - AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}) - {WINDOW_DAYS} + AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - 2 * {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME})) + AND {WINDOW_DATE_COLUMN} < DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME})) ) UNION ALL ( SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME} FROM {SCHEMA_NAME}.{TABLE_NAME} WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}) - 2 * {WINDOW_DAYS} - AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}) - {WINDOW_DAYS} + AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - 2 * {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME})) + AND {WINDOW_DATE_COLUMN} < DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME})) EXCEPT SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME} FROM {SCHEMA_NAME}.{TABLE_NAME} WHERE {SUBSET_CONDITION} - AND {WINDOW_DATE_COLUMN} >= (SELECT 
MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}) - {WINDOW_DAYS} + AND {WINDOW_DATE_COLUMN} >= DATEADD("day", - {WINDOW_DAYS}, (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME})) ) ) test; diff --git a/testgen/template/flavors/mssql/exec_query_tests/ex_relative_entropy_mssql.sql b/testgen/template/flavors/mssql/exec_query_tests/ex_relative_entropy_mssql.sql new file mode 100644 index 0000000..2d51aa0 --- /dev/null +++ b/testgen/template/flavors/mssql/exec_query_tests/ex_relative_entropy_mssql.sql @@ -0,0 +1,53 @@ +-- Relative Entropy: measured by Jensen-Shannon Divergence +-- Smoothed and normalized version of KL divergence, +-- with scores between 0 (identical) and 1 (maximally different), +-- when using the base-2 logarithm. Formula is: +-- 0.5 * kl_divergence(p, m) + 0.5 * kl_divergence(q, m) +-- Log base 2 of x = LN(x)/LN(2) +WITH latest_ver + AS ( SELECT {CONCAT_COLUMNS} as category, + CAST(COUNT(*) as FLOAT) / CAST(SUM(COUNT(*)) OVER () as FLOAT) AS pct_of_total + FROM {SCHEMA_NAME}.{TABLE_NAME} v1 + WHERE {SUBSET_CONDITION} + GROUP BY {COLUMN_NAME_NO_QUOTES} ), +older_ver + AS ( SELECT {CONCAT_MATCH_GROUPBY} as category, + CAST(COUNT(*) as FLOAT) / CAST(SUM(COUNT(*)) OVER () as FLOAT) AS pct_of_total + FROM {MATCH_SCHEMA_NAME}.{TABLE_NAME} v2 + WHERE {MATCH_SUBSET_CONDITION} + GROUP BY {MATCH_GROUPBY_NAMES} ), +dataset + AS ( SELECT COALESCE(l.category, o.category) AS category, + COALESCE(o.pct_of_total, 0.0000001) AS old_pct, + COALESCE(l.pct_of_total, 0.0000001) AS new_pct, + (COALESCE(o.pct_of_total, 0.0000001) + + COALESCE(l.pct_of_total, 0.0000001))/2.0 AS avg_pct + FROM latest_ver l + FULL JOIN older_ver o + ON (l.category = o.category) ) +SELECT '{PROJECT_CODE}' as project_code, + '{TEST_TYPE}' as test_type, + '{TEST_DEFINITION_ID}' as test_definition_id, + '{TEST_SUITE}' as test_suite, + '{TEST_RUN_ID}' as test_run_id, + '{RUN_DATE}' as test_time, + '{START_TIME}' as starttime, + CURRENT_TIMESTAMP as endtime, + 
'{SCHEMA_NAME}' as schema_name, + '{TABLE_NAME}' as table_name, + '{COLUMN_NAME_NO_QUOTES}' as column_names, +-- '{GROUPBY_NAMES}' as column_names, + '{THRESHOLD_VALUE}' as threshold_value, + NULL as skip_errors, + '{INPUT_PARAMETERS}' as input_parameters, + CASE WHEN js_divergence > {THRESHOLD_VALUE} THEN 0 ELSE 1 END as result_code, + CONCAT('Divergence Level: ', + CONCAT(CAST(js_divergence AS VARCHAR), + ', Threshold: {THRESHOLD_VALUE}.')) as result_message, + js_divergence as result_measure, + '{SUBSET_DISPLAY}' as subset_condition, + NULL as result_query + FROM ( + SELECT 0.5 * ABS(SUM(new_pct * LOG(new_pct/avg_pct)/LOG(2))) + + 0.5 * ABS(SUM(old_pct * LOG(old_pct/avg_pct)/LOG(2))) as js_divergence + FROM dataset ) rslt; diff --git a/testgen/template/flavors/postgresql/exec_query_tests/ex_window_match_no_drops_postgresql.sql b/testgen/template/flavors/postgresql/exec_query_tests/ex_window_match_no_drops_postgresql.sql new file mode 100644 index 0000000..92d9554 --- /dev/null +++ b/testgen/template/flavors/postgresql/exec_query_tests/ex_window_match_no_drops_postgresql.sql @@ -0,0 +1,46 @@ +SELECT '{PROJECT_CODE}' as project_code, + '{TEST_TYPE}' as test_type, + '{TEST_DEFINITION_ID}' as test_definition_id, + '{TEST_SUITE}' as test_suite, + '{TEST_RUN_ID}' as test_run_id, + '{RUN_DATE}' as test_time, + '{START_TIME}' as starttime, + CURRENT_TIMESTAMP as endtime, + '{SCHEMA_NAME}' as schema_name, + '{TABLE_NAME}' as table_name, + '{COLUMN_NAME_NO_QUOTES}' as column_names, + '{SKIP_ERRORS}' as threshold_value, + {SKIP_ERRORS} as skip_errors, + '{INPUT_PARAMETERS}' as input_parameters, + CASE WHEN COUNT (*) > {SKIP_ERRORS} THEN 0 ELSE 1 END as result_code, + CASE + WHEN COUNT(*) > 0 THEN + CONCAT( + CONCAT( CAST(COUNT(*) AS VARCHAR), ' error(s) identified, ' ), + CONCAT( + CASE + WHEN COUNT(*) > {SKIP_ERRORS} THEN 'exceeding limit of ' + ELSE 'within limit of ' + END, + '{SKIP_ERRORS}.' + ) + ) + ELSE 'No errors found.' 
+ END AS result_message, + COUNT(*) as result_measure, + '{SUBSET_DISPLAY}' as subset_condition, + NULL as result_query +FROM ( + SELECT {COLUMN_NAME_NO_QUOTES} + FROM {SCHEMA_NAME}.{TABLE_NAME} + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}) - 2 * {WINDOW_DAYS} + AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}) - {WINDOW_DAYS} + GROUP BY {COLUMN_NAME_NO_QUOTES} + EXCEPT + SELECT {COLUMN_NAME_NO_QUOTES} + FROM {SCHEMA_NAME}.{TABLE_NAME} + WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}) - {WINDOW_DAYS} + GROUP BY {COLUMN_NAME_NO_QUOTES} + ) test; diff --git a/testgen/template/flavors/postgresql/exec_query_tests/ex_window_match_same_postgresql.sql b/testgen/template/flavors/postgresql/exec_query_tests/ex_window_match_same_postgresql.sql new file mode 100644 index 0000000..202ebba --- /dev/null +++ b/testgen/template/flavors/postgresql/exec_query_tests/ex_window_match_same_postgresql.sql @@ -0,0 +1,59 @@ +SELECT '{PROJECT_CODE}' as project_code, + '{TEST_TYPE}' as test_type, + '{TEST_DEFINITION_ID}' as test_definition_id, + '{TEST_SUITE}' as test_suite, + '{TEST_RUN_ID}' as test_run_id, + '{RUN_DATE}' as test_time, + '{START_TIME}' as starttime, + CURRENT_TIMESTAMP as endtime, + '{SCHEMA_NAME}' as schema_name, + '{TABLE_NAME}' as table_name, + '{COLUMN_NAME_NO_QUOTES}' as column_names, + '{SKIP_ERRORS}' as threshold_value, + {SKIP_ERRORS} as skip_errors, + '{INPUT_PARAMETERS}' as input_parameters, + CASE WHEN COUNT (*) > {SKIP_ERRORS} THEN 0 ELSE 1 END as result_code, + CASE + WHEN COUNT(*) > 0 THEN + CONCAT( + CONCAT( CAST(COUNT(*) AS VARCHAR), ' error(s) identified, ' ), + CONCAT( + CASE + WHEN COUNT(*) > {SKIP_ERRORS} THEN 'exceeding limit of ' + ELSE 'within limit of ' + END, + '{SKIP_ERRORS}.' + ) + ) + ELSE 'No errors found.' 
+ END AS result_message, + COUNT(*) as result_measure, + '{SUBSET_DISPLAY}' as subset_condition, + NULL as result_query + FROM ( + ( +SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME} +FROM {SCHEMA_NAME}.{TABLE_NAME} +WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}) - {WINDOW_DAYS} +EXCEPT +SELECT 'Prior Timeframe' as missing_from, {COLUMN_NAME} +FROM {SCHEMA_NAME}.{TABLE_NAME} +WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}) - 2 * {WINDOW_DAYS} + AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}) - {WINDOW_DAYS} +) +UNION ALL +( +SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME} +FROM {SCHEMA_NAME}.{TABLE_NAME} +WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}) - 2 * {WINDOW_DAYS} + AND {WINDOW_DATE_COLUMN} < (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}) - {WINDOW_DAYS} + EXCEPT +SELECT 'Latest Timeframe' as missing_from, {COLUMN_NAME} +FROM {SCHEMA_NAME}.{TABLE_NAME} +WHERE {SUBSET_CONDITION} + AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {SCHEMA_NAME}.{TABLE_NAME}) - {WINDOW_DAYS} +) + ) test; diff --git a/testgen/ui/views/test_results.py b/testgen/ui/views/test_results.py index e495f0c..105603d 100644 --- a/testgen/ui/views/test_results.py +++ b/testgen/ui/views/test_results.py @@ -11,7 +11,7 @@ import testgen.ui.services.form_service as fm import testgen.ui.services.query_service as dq import testgen.ui.services.toolbar_service as tb -from testgen.common import date_service +from testgen.common import ConcatColumnList, date_service from testgen.ui.components import widgets as testgen from testgen.ui.navigation.page import Page from testgen.ui.services.string_service import empty_if_null @@ -399,6 +399,11 @@ def replace_parms(df_test, str_query): 
str_query = str_query.replace("{WINDOW_DATE_COLUMN}", empty_if_null(df_test.at[0, "window_date_column"])) str_query = str_query.replace("{WINDOW_DAYS}", empty_if_null(df_test.at[0, "window_days"])) + str_substitute = ConcatColumnList(selected_row["column_names"], "") + str_query = str_query.replace("{CONCAT_COLUMNS}", str_substitute) + str_substitute = ConcatColumnList(df_test.at[0, "match_groupby_names"], "") + str_query = str_query.replace("{CONCAT_MATCH_GROUPBY}", str_substitute) + if str_query is None or str_query == "": raise ValueError("Lookup query is not defined for this Test Type.") return str_query From 8220bda56f4c57932a3e50b384d3d621833e4411 Mon Sep 17 00:00:00 2001 From: Luis Trinidad Date: Fri, 31 May 2024 09:28:37 -0400 Subject: [PATCH 05/22] release: 2.1.4 -> 2.1.8 --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a22a420..0c875f7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ build-backend = "setuptools.build_meta" [project] name = "data-ops-testgen" -version = "2.1.4" +version = "2.1.8" description = "DataKitchen Inc. Data Quality Engine" urls = { "homepage" = "https://datakitchen.io" } authors = [ @@ -243,7 +243,7 @@ omit = ["tests/*", "templates/*"] skip_empty=true [tool.bumpver] -current_version = "2.1.4" +current_version = "2.1.8" version_pattern = "MAJOR.MINOR.PATCH[PYTAGNUM]" commit_message = "release: {old_version} -> {new_version}" commit = true From ff4b5e5a18c90ea88a08f6e66c7e2f352dfecd90 Mon Sep 17 00:00:00 2001 From: Aarthy Adityan Date: Tue, 4 Jun 2024 17:15:49 -0400 Subject: [PATCH 06/22] Update contributors file --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index c150d73..b57f3de 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -4,6 +4,7 @@ Thanks to everyone who has contributed to DataOps TestGen! 
💻 = code, 🚇 = infrastructure, 📖 = documentation, 📆 = project management, 🤔 = product management, 💬 = user support +- [Samantha Hamilton](https://www.linkedin.com/in/shamilton-darlingdocs/) 📖 - [Javier Giulianetti](https://github.com/JavierGi) 🚇 - [Tyler Stubenvoll](https://github.com/tjstub) 🚇 💻 - [Diogo Basto](https://www.linkedin.com/in/diogo-t-basto/) 🚇 From e6db4477e1ae8dd7660347838cacd73036aa678c Mon Sep 17 00:00:00 2001 From: Alex Fernandez Date: Fri, 14 Jun 2024 13:00:55 -0300 Subject: [PATCH 07/22] Refactor log system --- pyproject.toml | 1 + testgen/__main__.py | 8 ++--- .../commands/queries/generate_tests_query.py | 2 +- testgen/commands/run_execute_cat_tests.py | 2 +- testgen/commands/run_execute_tests.py | 2 +- testgen/commands/run_generate_tests.py | 2 +- testgen/commands/run_get_entities.py | 2 +- testgen/commands/run_launch_db_config.py | 2 +- .../commands/run_observability_exporter.py | 3 +- testgen/commands/run_profiling_bridge.py | 2 +- testgen/commands/run_quick_start.py | 2 +- testgen/commands/run_setup_profiling_tools.py | 2 +- .../commands/run_test_parameter_validation.py | 2 +- testgen/commands/run_upgrade_db_config.py | 2 +- testgen/common/database/database_service.py | 2 +- testgen/common/display_service.py | 2 +- testgen/common/docker_service.py | 11 +++---- testgen/common/logs.py | 29 +++++++++---------- testgen/common/process_service.py | 16 +++++----- testgen/common/read_file.py | 2 +- testgen/settings.py | 26 ++++++++++++----- testgen/ui/app.py | 2 -- testgen/ui/bootstrap.py | 10 ++----- testgen/ui/components/utils/callbacks.py | 4 +-- testgen/ui/components/widgets/breadcrumbs.py | 2 +- testgen/ui/components/widgets/location.py | 2 +- testgen/ui/components/widgets/sidebar.py | 2 +- testgen/ui/navigation/router.py | 6 ++-- testgen/ui/services/authentication_service.py | 4 +-- testgen/ui/services/javascript_service.py | 2 +- testgen/ui/views/app_log_modal.py | 23 ++++++++------- testgen/ui/views/overview.py | 3 ++ 32 files changed, 96 
insertions(+), 86 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 0c875f7..a429af6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,6 +60,7 @@ dependencies = [ "trino==0.327.0", "xlsxwriter==3.2.0", "psutil==5.9.8", + "concurrent_log_handler==0.9.25", ] [project.optional-dependencies] diff --git a/testgen/__main__.py b/testgen/__main__.py index bb8ea75..a5ba5f1 100644 --- a/testgen/__main__.py +++ b/testgen/__main__.py @@ -47,7 +47,7 @@ ) from testgen.utils import plugins -LOG = logging.getLogger("testgen.cli") +LOG = logging.getLogger("testgen") @dataclass @@ -73,9 +73,9 @@ class Configuration: @click.pass_context def cli(ctx: Context, verbose: bool): if verbose: - configure_logging(level=logging.INFO, log_to_file=settings.LOG_TO_FILE) + configure_logging(level=logging.DEBUG) else: - configure_logging(level=logging.WARNING, log_to_file=settings.LOG_TO_FILE) + configure_logging(level=logging.INFO) ctx.obj = Configuration(verbose=verbose) status_ok, message = docker_service.check_basic_configuration() @@ -714,7 +714,7 @@ def run(debug: bool): ) status_code: int = -1 - logger = logging.getLogger("testgen.ui") + logger = logging.getLogger("testgen") stderr: typing.TextIO = typing.cast(typing.TextIO, logs.LogPipe(logger, logging.INFO)) stdout: typing.TextIO = typing.cast(typing.TextIO, logs.LogPipe(logger, logging.INFO)) diff --git a/testgen/commands/queries/generate_tests_query.py b/testgen/commands/queries/generate_tests_query.py index 273af1c..696598f 100644 --- a/testgen/commands/queries/generate_tests_query.py +++ b/testgen/commands/queries/generate_tests_query.py @@ -3,7 +3,7 @@ from testgen.common import CleanSQL, date_service, get_template_files, read_template_sql_file -LOG = logging.getLogger("testgen.cli") +LOG = logging.getLogger("testgen") class CDeriveTestsSQL: diff --git a/testgen/commands/run_execute_cat_tests.py b/testgen/commands/run_execute_cat_tests.py index 453747c..0b4b9e2 100644 --- 
a/testgen/commands/run_execute_cat_tests.py +++ b/testgen/commands/run_execute_cat_tests.py @@ -10,7 +10,7 @@ WriteListToDB, ) -LOG = logging.getLogger("testgen.cli") +LOG = logging.getLogger("testgen") def RetrieveTargetTables(clsCATExecute): diff --git a/testgen/commands/run_execute_tests.py b/testgen/commands/run_execute_tests.py index f998574..43ab72d 100644 --- a/testgen/commands/run_execute_tests.py +++ b/testgen/commands/run_execute_tests.py @@ -20,7 +20,7 @@ from .run_execute_cat_tests import run_cat_test_queries from .run_test_parameter_validation import run_parameter_validation_queries -LOG = logging.getLogger("testgen.cli") +LOG = logging.getLogger("testgen") def run_test_queries(strTestRunID, strTestTime, strProjectCode, strTestSuite, minutes_offset=0, spinner=None): diff --git a/testgen/commands/run_generate_tests.py b/testgen/commands/run_generate_tests.py index d9d31d0..266a52a 100644 --- a/testgen/commands/run_generate_tests.py +++ b/testgen/commands/run_generate_tests.py @@ -3,7 +3,7 @@ from testgen.commands.queries.generate_tests_query import CDeriveTestsSQL from testgen.common import AssignConnectParms, RetrieveDBResultsToDictList, RetrieveTestGenParms, RunActionQueryList -LOG = logging.getLogger("testgen.cli") +LOG = logging.getLogger("testgen") def run_test_gen_queries(strTableGroupsID, strTestSuite, strGenerationSet=None): diff --git a/testgen/commands/run_get_entities.py b/testgen/commands/run_get_entities.py index 19987ea..1f76fa0 100644 --- a/testgen/commands/run_get_entities.py +++ b/testgen/commands/run_get_entities.py @@ -2,7 +2,7 @@ from testgen.common import RetrieveDBResultsToList, read_template_sql_file -LOG = logging.getLogger("testgen.cli") +LOG = logging.getLogger("testgen") def run_list_profiles(table_groups_id): diff --git a/testgen/commands/run_launch_db_config.py b/testgen/commands/run_launch_db_config.py index 59f5f74..e6ab186 100644 --- a/testgen/commands/run_launch_db_config.py +++ b/testgen/commands/run_launch_db_config.py 
@@ -8,7 +8,7 @@ from testgen.common.encrypt import EncryptText, encrypt_ui_password from testgen.common.read_file import get_template_files -LOG = logging.getLogger("testgen.cli") +LOG = logging.getLogger("testgen") def _get_latest_revision_number(): diff --git a/testgen/commands/run_observability_exporter.py b/testgen/commands/run_observability_exporter.py index 1d5b57f..a708a2e 100644 --- a/testgen/commands/run_observability_exporter.py +++ b/testgen/commands/run_observability_exporter.py @@ -12,7 +12,8 @@ from testgen.common import date_service, display_service, read_template_sql_file from testgen.common.database.database_service import ExecuteDBQuery, RetrieveDBResultsToDictList -LOG = logging.getLogger("testgen.cli") +LOG = logging.getLogger("testgen") + DEFAULT_COMPONENT_TYPE = "dataset" PAYLOAD_MAX_SIZE = 100000 diff --git a/testgen/commands/run_profiling_bridge.py b/testgen/commands/run_profiling_bridge.py index 4bfac33..781891f 100644 --- a/testgen/commands/run_profiling_bridge.py +++ b/testgen/commands/run_profiling_bridge.py @@ -22,7 +22,7 @@ from testgen.common.database.database_service import empty_cache booClean = True -LOG = logging.getLogger("testgen.cli") +LOG = logging.getLogger("testgen") def InitializeProfilingSQL(strProject, strSQLFlavor): diff --git a/testgen/commands/run_quick_start.py b/testgen/commands/run_quick_start.py index f0b28e5..796302d 100644 --- a/testgen/commands/run_quick_start.py +++ b/testgen/commands/run_quick_start.py @@ -13,7 +13,7 @@ ) from testgen.common.read_file import read_template_sql_file -LOG = logging.getLogger("testgen.cli") +LOG = logging.getLogger("testgen") def _get_max_date(iteration: int): diff --git a/testgen/commands/run_setup_profiling_tools.py b/testgen/commands/run_setup_profiling_tools.py index 473698c..8f98d67 100644 --- a/testgen/commands/run_setup_profiling_tools.py +++ b/testgen/commands/run_setup_profiling_tools.py @@ -4,7 +4,7 @@ from testgen.common import AssignConnectParms, RunActionQueryList 
from testgen.common.database.database_service import get_queries_for_command -LOG = logging.getLogger("testgen.cli") +LOG = logging.getLogger("testgen") def _get_params_mapping(project_qc_schema: str, user: str) -> dict: diff --git a/testgen/commands/run_test_parameter_validation.py b/testgen/commands/run_test_parameter_validation.py index efa2ace..6e0b524 100644 --- a/testgen/commands/run_test_parameter_validation.py +++ b/testgen/commands/run_test_parameter_validation.py @@ -3,7 +3,7 @@ from testgen.commands.queries.test_parameter_validation_query import CTestParamValidationSQL from testgen.common import AssignConnectParms, RetrieveDBResultsToDictList, RetrieveTestExecParms, RunActionQueryList -LOG = logging.getLogger("testgen.cli") +LOG = logging.getLogger("testgen") def run_parameter_validation_queries( diff --git a/testgen/commands/run_upgrade_db_config.py b/testgen/commands/run_upgrade_db_config.py index 6cfd81f..e07c714 100644 --- a/testgen/commands/run_upgrade_db_config.py +++ b/testgen/commands/run_upgrade_db_config.py @@ -6,7 +6,7 @@ from testgen.common.database.database_service import replace_params from testgen.common.read_file import get_template_files -LOG = logging.getLogger("testgen.cli") +LOG = logging.getLogger("testgen") def _get_params_mapping() -> dict: diff --git a/testgen/common/database/database_service.py b/testgen/common/database/database_service.py index 75e4623..05b4865 100644 --- a/testgen/common/database/database_service.py +++ b/testgen/common/database/database_service.py @@ -23,7 +23,7 @@ from testgen.common.encrypt import DecryptText from testgen.common.read_file import get_template_files -LOG = logging.getLogger("testgen.cli") +LOG = logging.getLogger("testgen") class CConnectParms: diff --git a/testgen/common/display_service.py b/testgen/common/display_service.py index 450a3b1..665f675 100644 --- a/testgen/common/display_service.py +++ b/testgen/common/display_service.py @@ -6,7 +6,7 @@ import yaml from prettytable import 
PrettyTable -LOG = logging.getLogger("testgen.cli") +LOG = logging.getLogger("testgen") def print_table(rows: list[dict], column_names: list[str]): diff --git a/testgen/common/docker_service.py b/testgen/common/docker_service.py index 3763fe3..7b4fcfa 100644 --- a/testgen/common/docker_service.py +++ b/testgen/common/docker_service.py @@ -5,7 +5,8 @@ from testgen import settings from testgen.common import get_tg_db, get_tg_host, get_tg_password, get_tg_schema, get_tg_username -logger = logging.getLogger("testgen.cli") +LOG = logging.getLogger("testgen") + def check_for_new_docker_release() -> str: @@ -16,20 +17,20 @@ def check_for_new_docker_release() -> str: tags = get_docker_tags() if len(tags) == 0: - logger.debug("docker_service: No tags to parse, skipping check.") + LOG.debug("docker_service: No tags to parse, skipping check.") return "unknown" ordered_tags = sorted(tags, key=lambda item: item[1], reverse=True) latest_tag = ordered_tags[0][0] if latest_tag != settings.VERSION: - logger.warning( + LOG.warning( f"A new TestGen upgrade is available. Please update to version {latest_tag} for new features and improvements." ) return latest_tag # noqa: TRY300 except Exception: - logger.warning("Unable to check for latest release", exc_info=True, stack_info=True) + LOG.warning("Unable to check for latest release", exc_info=True, stack_info=True) def get_docker_tags(url: str = "https://hub.docker.com/v2/repositories/datakitchen/dataops-testgen/tags/"): @@ -38,7 +39,7 @@ def get_docker_tags(url: str = "https://hub.docker.com/v2/repositories/datakitch tags_to_return = [] if not response.status_code == 200: - logger.warning(f"docker_service: Failed to fetch docker tags. Status code: {response.status_code}") + LOG.warning(f"docker_service: Failed to fetch docker tags. 
Status code: {response.status_code}") return tags_to_return tags_data = response.json() diff --git a/testgen/common/logs.py b/testgen/common/logs.py index b686874..bc0799a 100644 --- a/testgen/common/logs.py +++ b/testgen/common/logs.py @@ -7,11 +7,14 @@ import sys import threading +from concurrent_log_handler import ConcurrentTimedRotatingFileHandler + +from testgen import settings + def configure_logging( level: int = logging.DEBUG, - log_format: str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s", - log_to_file: bool = False, + log_format: str = "[PID: %(process)s] %(asctime)s - %(levelname)s - %(message)s", ) -> None: """ Configures the testgen logger. @@ -22,9 +25,8 @@ def configure_logging( formatter = logging.Formatter(log_format) console_out_handler = logging.StreamHandler(stream=sys.stdout) - console_out_handler.setLevel(logging.DEBUG) + console_out_handler.setLevel(logging.WARNING) console_out_handler.setFormatter(formatter) - console_out_handler.addFilter(LessThanFilter(logging.WARNING)) console_err_handler = logging.StreamHandler(stream=sys.stderr) console_err_handler.setLevel(logging.WARNING) @@ -33,14 +35,14 @@ def configure_logging( logger.addHandler(console_out_handler) logger.addHandler(console_err_handler) - if log_to_file: - os.makedirs("/var/log/testgen", exist_ok=True) + if settings.LOG_TO_FILE: + os.makedirs(settings.LOG_FILE_PATH, exist_ok=True) - file_handler = logging.handlers.TimedRotatingFileHandler( - "/var/log/testgen/app.log", + file_handler = ConcurrentTimedRotatingFileHandler( + get_log_full_path(), when="D", interval=1, - backupCount=15, + backupCount=int(settings.LOG_FILE_MAX_QTY), ) file_handler.setLevel(level) file_handler.setFormatter(formatter) @@ -48,13 +50,8 @@ def configure_logging( logger.addHandler(file_handler) -class LessThanFilter(logging.Filter): - def __init__(self, maximum: int, name: str = "") -> None: - super().__init__(name) - self._maximum = maximum - - def filter(self, record): - return record.levelno < 
self._maximum +def get_log_full_path() -> str: + return os.path.join(settings.LOG_FILE_PATH, "app.log") class LogPipe(threading.Thread, io.TextIOBase): diff --git a/testgen/common/process_service.py b/testgen/common/process_service.py index 9cfa229..a258267 100644 --- a/testgen/common/process_service.py +++ b/testgen/common/process_service.py @@ -5,7 +5,7 @@ from testgen import settings -logger = logging.getLogger("testgen.cli") +LOG = logging.getLogger("testgen") def get_current_process_id(): @@ -27,20 +27,20 @@ def kill_test_run(process_id): def kill_process(process_id, keywords=None): if settings.IS_DEBUG: msg = "Cannot kill processes in debug mode (threads are used instead of new process)" - logger.warn(msg) + LOG.warn(msg) return False, msg try: process = psutil.Process(process_id) if process.name().lower() != "testgen": message = f"The process was not killed because the process_id {process_id} is not a testgen process. Details: {process.name()}" - logger.error(f"kill_process: {message}") + LOG.error(f"kill_process: {message}") return False, message if keywords: for keyword in keywords: if keyword.lower() not in process.cmdline(): message = f"The process was not killed because the keyword {keyword} was not found. Details: {process.cmdline()}" - logger.error(f"kill_process: {message}") + LOG.error(f"kill_process: {message}") return False, message process.terminate() @@ -48,15 +48,15 @@ def kill_process(process_id, keywords=None): message = f"Process {process_id} has been terminated." except psutil.NoSuchProcess: message = f"No such process with PID {process_id}." - logger.exception(f"kill_process: {message}") + LOG.exception(f"kill_process: {message}") return False, message except psutil.AccessDenied: message = f"Access denied when trying to terminate process {process_id}." 
- logger.exception(f"kill_process: {message}") + LOG.exception(f"kill_process: {message}") return False, message except psutil.TimeoutExpired: message = f"Process {process_id} did not terminate within the timeout period." - logger.exception(f"kill_process: {message}") + LOG.exception(f"kill_process: {message}") return False, message - logger.info(f"kill_process: Success. {message}") + LOG.info(f"kill_process: Success. {message}") return True, message diff --git a/testgen/common/read_file.py b/testgen/common/read_file.py index 034e9ca..8960381 100644 --- a/testgen/common/read_file.py +++ b/testgen/common/read_file.py @@ -9,7 +9,7 @@ import yaml -LOG = logging.getLogger("testgen.cli") +LOG = logging.getLogger("testgen") def _get_template_package_resource( diff --git a/testgen/settings.py b/testgen/settings.py index 8eae0f5..d3e71c6 100644 --- a/testgen/settings.py +++ b/testgen/settings.py @@ -1,26 +1,36 @@ import os IS_DEBUG_LOG_LEVEL: bool = os.getenv("TESTGEN_DEBUG_LOG_LEVEL", "no").lower() == "yes" +""" +When set, logs will be at debug level. +defaults to: `no` +""" + IS_DEBUG: bool = os.getenv("TESTGEN_DEBUG", "no").lower() == "yes" """ When True invalidates the cache with the bootstrapped application causing the changes to the routing and plugins to take effect on every render. -Also changes the logging level for the testgen.ui logger from INFO to -DEBUG. - from env variable: `TESTGEN_DEBUG` defaults to: `True` """ -LOG_TO_FILE: bool = os.getenv("TESTGEN_LOG_TO_FILE", "no").lower() == "yes" +LOG_TO_FILE: bool = os.getenv("TESTGEN_LOG_TO_FILE", "yes").lower() == "yes" +""" +When set, rotating file logs will be generated. +defaults to: `True` """ -When set, rotating file logs will be generated under -`/var/log/testgen/`. -from env variable: `TESTGEN_LOG_TO_FILE` -defautls to: `True` +LOG_FILE_PATH: str = os.getenv("TESTGEN_LOG_FILE_PATH", "/var/lib/testgen/log") +""" +When set, rotating file logs will be generated under this path. 
+ +""" + +LOG_FILE_MAX_QTY: str = os.getenv("TESTGEN_LOG_FILE_MAX_QTY", "90") +""" +Maximum log files to keep, defaults to 90 days (one file per day). """ APP_ENCRYPTION_SALT: str = os.getenv("TG_DECRYPT_SALT") diff --git a/testgen/ui/app.py b/testgen/ui/app.py index b6def0c..edcf2a9 100644 --- a/testgen/ui/app.py +++ b/testgen/ui/app.py @@ -12,8 +12,6 @@ from testgen.ui.services import database_service as db from testgen.ui.session import session -logger = logging.getLogger("testgen.ui") - def render(log_level: int = logging.INFO): st.set_page_config( diff --git a/testgen/ui/bootstrap.py b/testgen/ui/bootstrap.py index bb755e3..ea6e65d 100644 --- a/testgen/ui/bootstrap.py +++ b/testgen/ui/bootstrap.py @@ -43,7 +43,7 @@ ProjectSettingsPage, ] -logger = logging.getLogger("testgen.ui") +LOG = logging.getLogger("testgen") class Application(singleton.Singleton): @@ -64,11 +64,7 @@ def run(log_level: int = logging.INFO) -> Application: pages = [*BUILTIN_PAGES] installed_plugins = plugins.discover() - configure_logging( - level=log_level, - log_to_file=settings.LOG_TO_FILE, - log_format="%(asctime)s - testgen.ui - %(levelname)s - %(message)s", - ) + configure_logging(level=log_level) for plugin in installed_plugins: module = importlib.import_module(plugin.package) @@ -95,7 +91,7 @@ def run(log_level: int = logging.INFO) -> Application: schema=_get_schema_rev(), ), ), - logger=logger, + logger=LOG, ) diff --git a/testgen/ui/components/utils/callbacks.py b/testgen/ui/components/utils/callbacks.py index f486d5b..f7b0b9f 100644 --- a/testgen/ui/components/utils/callbacks.py +++ b/testgen/ui/components/utils/callbacks.py @@ -10,7 +10,7 @@ from streamlit import session_state from streamlit.components.v1 import components -logger = logging.getLogger("testgen.ui") +LOG = logging.getLogger("testgen") def _patch_register_widget(register_widget): @@ -48,4 +48,4 @@ def register_callback(element_key, callback, *callback_args, **callback_kwargs): try: 
session_state._components_callbacks[element_key] = (callback, callback_args, callback_kwargs) except: - logger.debug("unexpected error registering component callback", exc_info=False, stack_info=False) + LOG.debug("unexpected error registering component callback", exc_info=False, stack_info=False) diff --git a/testgen/ui/components/widgets/breadcrumbs.py b/testgen/ui/components/widgets/breadcrumbs.py index fb6807d..8917be6 100644 --- a/testgen/ui/components/widgets/breadcrumbs.py +++ b/testgen/ui/components/widgets/breadcrumbs.py @@ -3,7 +3,7 @@ from testgen.ui.components.utils.component import component -logger = logging.getLogger("testgen.ui") +LOG = logging.getLogger("testgen") def breadcrumbs( diff --git a/testgen/ui/components/widgets/location.py b/testgen/ui/components/widgets/location.py index c85f9f8..6fa8a9b 100644 --- a/testgen/ui/components/widgets/location.py +++ b/testgen/ui/components/widgets/location.py @@ -9,7 +9,7 @@ from testgen.ui.components.utils.component import component from testgen.ui.session import session -logger = logging.getLogger("testgen.ui") +LOG = logging.getLogger("testgen") def location( diff --git a/testgen/ui/components/widgets/sidebar.py b/testgen/ui/components/widgets/sidebar.py index 5a6e59d..47d1fb0 100644 --- a/testgen/ui/components/widgets/sidebar.py +++ b/testgen/ui/components/widgets/sidebar.py @@ -9,7 +9,7 @@ from testgen.ui.navigation.menu import Menu from testgen.ui.services import authentication_service -logger = logging.getLogger("testgen.ui") +LOG = logging.getLogger("testgen") def sidebar( diff --git a/testgen/ui/navigation/router.py b/testgen/ui/navigation/router.py index 7df43be..cad50d5 100644 --- a/testgen/ui/navigation/router.py +++ b/testgen/ui/navigation/router.py @@ -10,7 +10,7 @@ CanActivateGuard = typing.Callable[[], bool | str] -logger = logging.getLogger("testgen.ui") +LOG = logging.getLogger("testgen") class Router(Singleton): @@ -55,9 +55,9 @@ def navigate(self, /, to: str, with_args: dict | None = 
None) -> None: except KeyError as k: error_message = f"{bc_source}: {k!s}" st.error(error_message) - logger.exception(error_message) + LOG.exception(error_message) return self.navigate(to=self._default.path, with_args=with_args) except Exception as e: error_message = f"{bc_source}: {e!s}" st.error(error_message) - logger.exception(error_message) + LOG.exception(error_message) diff --git a/testgen/ui/services/authentication_service.py b/testgen/ui/services/authentication_service.py index 31984b3..f279c4b 100644 --- a/testgen/ui/services/authentication_service.py +++ b/testgen/ui/services/authentication_service.py @@ -18,7 +18,7 @@ AUTH_TOKEN_COOKIE_NAME = "dk_cookie_name" AUTH_TOKEN_EXPIRATION_DAYS = 5 -logger = logging.getLogger("testgen.ui") +LOG = logging.getLogger("testgen") def load_user_session() -> None: @@ -30,7 +30,7 @@ def load_user_session() -> None: if token["exp_date"] > datetime.datetime.utcnow().timestamp(): start_user_session(token["name"], token["username"]) except Exception: - logger.debug("Invalid auth token found on cookies", exc_info=True, stack_info=True) + LOG.debug("Invalid auth token found on cookies", exc_info=True, stack_info=True) def start_user_session(name: str, username: str) -> None: diff --git a/testgen/ui/services/javascript_service.py b/testgen/ui/services/javascript_service.py index 78846fc..424bb6e 100644 --- a/testgen/ui/services/javascript_service.py +++ b/testgen/ui/services/javascript_service.py @@ -4,7 +4,7 @@ from testgen.ui.services.authentication_service import AUTH_TOKEN_COOKIE_NAME -LOG = logging.getLogger("testgen.ui") +LOG = logging.getLogger("testgen") def copy_to_clipboard(text): diff --git a/testgen/ui/views/app_log_modal.py b/testgen/ui/views/app_log_modal.py index a6893e5..d5f7ca1 100644 --- a/testgen/ui/views/app_log_modal.py +++ b/testgen/ui/views/app_log_modal.py @@ -1,14 +1,16 @@ import logging +import os import re from datetime import date, datetime import streamlit as st +import testgen.common.logs as logs 
import testgen.ui.services.form_service as fm from testgen.common import display_service from testgen.ui.components import widgets as testgen -logger = logging.getLogger("testgen.ui") +LOG = logging.getLogger("testgen") # Read the log file @@ -21,7 +23,7 @@ def _read_log(file_path): except Exception: st.warning(f"Log file is unavailable: {file_path}") - logger.debug(f"Log viewer can't read log file {file_path}") + LOG.debug(f"Log viewer can't read log file {file_path}") # Function to filter log data by date @@ -36,6 +38,7 @@ def _filter_by_date(log_data, start_date, end_date): filtered_data.append(line) return filtered_data + # Function to search text in log data def _search_text(log_data, search_query): return [line for line in log_data if search_query in line] @@ -59,13 +62,13 @@ def view_log_file(button_container): col1, col2, col3 = st.columns([33, 33, 33]) log_date = col1.date_input("Log Date", value=datetime.today()) - if log_date == date.today(): - file_name = "app.log" - else: - file_name = f"app.log.{log_date.strftime('%Y-%m-%d')}" + log_file_location = logs.get_log_full_path() + + if log_date != date.today(): + log_file_location += log_date.strftime(".%Y-%m-%d") + + log_file_name = os.path.basename(log_file_location) - # log_file_location = os.path.join(file_out_path, file_name) - log_file_location = f"/var/log/testgen/{file_name}" log_data = _read_log(log_file_location) search_query = col2.text_input("Filter by Text") @@ -81,9 +84,9 @@ def view_log_file(button_container): st.cache_data.clear() if log_data: - st.markdown(f"**Log File:** {log_file_location}") + st.markdown(f"**Log File:** {log_file_name}") # TOO SLOW: st.code(body=''.join(show_data), language="log", line_numbers=True) st.text_area("Log Data", value="".join(show_data), height=400) # Download button - st.download_button("Download", data="".join(show_data), file_name=file_name) + st.download_button("Download", data="".join(show_data), file_name=log_file_name) diff --git 
a/testgen/ui/views/overview.py b/testgen/ui/views/overview.py index 758ea46..48a1954 100644 --- a/testgen/ui/views/overview.py +++ b/testgen/ui/views/overview.py @@ -1,3 +1,4 @@ +import logging import typing import streamlit as st @@ -7,6 +8,8 @@ from testgen.ui.services import form_service from testgen.ui.session import session +LOG = logging.getLogger("testgen") + class OverviewPage(Page): path = "overview" From 555194eb7c427f7621b479e48c71e7b874b3fcbe Mon Sep 17 00:00:00 2001 From: Luis Trinidad Date: Mon, 17 Jun 2024 10:50:09 -0400 Subject: [PATCH 08/22] fix(ui): unset bg color of detail values on dark mode Background color of values in the items details list was strictly set to which does not contrast with the white font of dark mode. This commit uses a CSS variable dependent on the theme to decide that background color. Refs: #14 --- testgen/ui/assets/style.css | 3 +++ testgen/ui/services/form_service.py | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/testgen/ui/assets/style.css b/testgen/ui/assets/style.css index d5541d6..8464393 100644 --- a/testgen/ui/assets/style.css +++ b/testgen/ui/assets/style.css @@ -11,6 +11,8 @@ body { --sidebar-active-item-border-color: #b4e3c9; --field-underline-color: #9e9e9e; + + --dk-text-value-background: aliceblue; } img.dk-logo-img { @@ -118,6 +120,7 @@ button[title="Show password text"] { --sidebar-item-hover-color: rgba(14, 17, 23, .5); --sidebar-active-item-color: rgba(14, 17, 23, .5); --sidebar-active-item-border-color: #b4e3c9; + --dk-text-value-background: unset; } /* Main content */ diff --git a/testgen/ui/services/form_service.py b/testgen/ui/services/form_service.py index f3c8606..16bda64 100644 --- a/testgen/ui/services/form_service.py +++ b/testgen/ui/services/form_service.py @@ -520,7 +520,7 @@ def render_html_list(dct_row, lst_columns, str_section_header=None, int_data_wid .dk-text-value { display: <>; width: <>px; - background-color: aliceblue; + background-color: 
var(--dk-text-value-background); text-align: left; font-family: 'Courier New', monospace; padding-left: 10px; @@ -530,7 +530,7 @@ def render_html_list(dct_row, lst_columns, str_section_header=None, int_data_wid .dk-num-value { display: <>; width: <>px; - background-color: aliceblue; + background-color: var(--dk-text-value-background); text-align: right; font-family: 'Courier New', monospace; padding-left: 10px; From 1a2df94a28ec66065b9d16f28c99101709c1275f Mon Sep 17 00:00:00 2001 From: "Chip.Bloche" Date: Mon, 17 Jun 2024 16:03:15 -0400 Subject: [PATCH 09/22] Fixed label, help in Test Definitions --- .../commands/queries/execute_tests_query.py | 5 --- .../050_populate_new_schema_metadata.sql | 4 +- .../profiling/functional_datatype.sql | 37 ++++++++++++++++++- testgen/ui/views/test_definitions.py | 10 ++++- 4 files changed, 46 insertions(+), 10 deletions(-) diff --git a/testgen/commands/queries/execute_tests_query.py b/testgen/commands/queries/execute_tests_query.py index 69bb3a4..d8fac7a 100644 --- a/testgen/commands/queries/execute_tests_query.py +++ b/testgen/commands/queries/execute_tests_query.py @@ -47,9 +47,6 @@ def _ReplaceParms(self, strInputString: str): strInputString = strInputString.replace("{INPUT_PARAMETERS}", self._AssembleDisplayParameters()) strInputString = strInputString.replace("{RUN_DATE}", self.run_date) - # strInputString = strInputString.replace("{SUM_COLUMNS}", self.sum_columns) - # strInputString = strInputString.replace("{MATCH_SUM_COLUMNS}", self.match_sum_columns) - # strInputString = strInputString.replace("{MULTI_COLUMN_ERROR_CONDITION}", self.multi_column_error_condition) strInputString = strInputString.replace("{EXCEPTION_MESSAGE}", self.exception_message) strInputString = strInputString.replace("{START_TIME}", self.today) strInputString = strInputString.replace("{PROCESS_ID}", str(self.process_id)) @@ -154,8 +151,6 @@ def GetTestQuery(self, booClean: bool): if strTemplate == "": raise ValueError(f"No query template assigned to 
test_type {strTestType}") - # if strTestType in {"AGG MATCH NO DROPS", "AGG MATCH SAME", "AGG MATCH NUM INCR"}: - # self._ConstructAggregateMatchParms() strQ = self._GetTestQueryFromTemplate(strTemplate) # Final replace to cover parm within CUSTOM_QUERY parm strQ = strQ.replace("{DATA_SCHEMA}", self.dctTestParms["schema_name"]) diff --git a/testgen/template/dbsetup/050_populate_new_schema_metadata.sql b/testgen/template/dbsetup/050_populate_new_schema_metadata.sql index 87a748a..f08ba06 100644 --- a/testgen/template/dbsetup/050_populate_new_schema_metadata.sql +++ b/testgen/template/dbsetup/050_populate_new_schema_metadata.sql @@ -134,12 +134,12 @@ VALUES ('1004', 'Alpha_Trunc', 'Alpha Truncation', 'Maximum character count con ('1043', 'Valid_Characters', 'Valid Characters', 'Column contains no invalid characters', 'Tests for the presence of non-printing characters, leading spaces, or surrounding quotes.', 'Invalid characters, such as non-printing characters, leading spaces, or surrounding quotes, were found.', 'Invalid records', 'Expected count of values with invalid characters', 'general_type = ''A''', NULL, NULL, 'threshold_value', '0', NULL, 'The acceptable number of records with invalid character values present.', 'Warning', 'CAT', 'column', 'Validity', 'Schema Drift', 'Threshold Invalid Value Count', 'This test looks for the presence of non-printing ASCII characters that are considered non-standard in basic text processing. It also identifies leading spaces and values enclosed in quotes. 
Values that fail this test may be artifacts of data conversion, or just more difficult to process or analyze downstream.', 'N'), ('1044', 'Valid_US_Zip', 'Valid US Zip', 'Valid USA Postal Codes', 'Tests that postal codes match the 5 or 9 digit standard US format', 'Invalid US Zip Code formats found.', 'Invalid Zip Codes', 'Expected count of values with invalid Zip Codes', 'functional_data_type = ''Zip''', NULL, NULL, 'threshold_value', '0', NULL, NULL, 'Warning', 'CAT', 'column', 'Validity', 'Schema Drift', 'Threshold Invalid Value Count', NULL, 'Y'), ('1045', 'Valid_US_Zip3', 'Valid US Zip-3 ', 'Valid USA Zip-3 Prefix', 'Tests that postal codes match the 3 digit format of a regional prefix.', 'Invalid 3-digit US Zip Code regional prefix formats found.', 'Invalid Zip-3 Prefix', 'Expected count of values with invalid Zip-3 Prefix Codes', 'functional_data_type = ''Zip3''', NULL, NULL, 'threshold_value', '0', NULL, NULL, 'Warning', 'CAT', 'column', 'Validity', 'Schema Drift', 'Threshold Invalid Zip3 Count', 'This test looks for the presence of values that fail to match the three-digit numeric code expected for US Zip Code regional prefixes. These prefixes are often used to roll up Zip Code data to a regional level, and may be critical to anonymize detailed data and protect PID. Depending on your needs and regulatory requirements, longer zip codes could place PID at risk.', 'Y'), - ('1006', 'Condition_Flag', 'Custom Condition', 'Column values match pre-defined condition', 'Tests that each record in the table matches a pre-defined, custom condition', 'Value(s) found not matching defined condition.', 'Values Failing', NULL, NULL, NULL, NULL, 'threshold_value,custom_query', NULL, 'Threshold Error Count,Custom SQL Expression', 'The number of errors that are acceptable before test fails.|Expression should evaluate to TRUE to register an error or FALSE if no error. 
An expression can reference only columns in the selected table.', 'Fail', 'CAT', 'custom', 'Validity', 'Schema Drift', 'Count of records that don''t meet test condition', 'Custom Condition is a business-rule test for a user-defined error condition based on the value of one or more columns. The condition is applied to each record within the table, and the count of records failing the condition is added up. If that count exceeds a threshold of errors, the test as a whole is failed. This test is ideal for error conditions that TestGen cannot automatically infer, and any condition that involves the values of more than one column in the same record. Performance of this test is fast, since it is performed together with other aggregate tests. Interpretation is based on the user-defined meaning of the test.', 'Y'), + ('1006', 'Condition_Flag', 'Custom Condition', 'Column values match pre-defined condition', 'Tests that each record in the table matches a pre-defined, custom condition', 'Value(s) found not matching defined condition.', 'Values Failing', NULL, NULL, 'Test Focus', 'Specify a brief descriptor of the focus of this test that is unique within this Test Suite for the Table and Test Type. This distinguishes this test from others of the same type on the same table. Example: `Quantity Consistency` if you are testing that quantity ordered matches quantity shipped.', 'threshold_value,custom_query', NULL, 'Threshold Error Count,Custom SQL Expression', 'The number of errors that are acceptable before test fails.|Expression should evaluate to TRUE to register an error or FALSE if no error. An expression can reference only columns in the selected table.', 'Fail', 'CAT', 'custom', 'Validity', 'Schema Drift', 'Count of records that don''t meet test condition', 'Custom Condition is a business-rule test for a user-defined error condition based on the value of one or more columns. 
The condition is applied to each record within the table, and the count of records failing the condition is added up. If that count exceeds a threshold of errors, the test as a whole is failed. This test is ideal for error conditions that TestGen cannot automatically infer, and any condition that involves the values of more than one column in the same record. Performance of this test is fast, since it is performed together with other aggregate tests. Interpretation is based on the user-defined meaning of the test.', 'Y'), ('1031', 'Row_Ct', 'Row Count', 'Number of rows is at or above threshold', 'Tests that the count of records has not decreased from the baseline count.', 'Row count less than baseline count.', 'Row count', NULL, 'TEMPLATE', NULL, NULL, 'threshold_value', NULL, 'Threshold Minimum Record Count', NULL, 'Fail', 'CAT', 'table', 'Completeness', 'Volume', 'Expected minimum row count', 'Because this tests the row count against a constant minimum threshold, it''s appropriate for any dataset, as long as the number of rows doesn''t radically change from refresh to refresh. But it''s not responsive to change over time. 
You may want to adjust the threshold periodically if you are dealing with a cumulative dataset.', 'Y'), ('1032', 'Row_Ct_Pct', 'Row Range', 'Number of rows within percent range of threshold', 'Tests that the count of records is within a percentage above or below the baseline count.', 'Row Count is outside of threshold percent of baseline count.', 'Percent of baseline', 'Row count percent above or below baseline', 'TEMPLATE', NULL, NULL, 'baseline_ct,threshold_value', NULL, 'Baseline Record Count,Threshold Pct Above or Below Baseline', NULL, 'Fail', 'CAT', 'table', 'Completeness', 'Volume', 'Expected percent window below or above baseline', 'This test is better than Row Count for an incremental or windowed dataset where you would expect the row count to range within a percentage of baseline.', 'Y'), - ('1008', 'CUSTOM', 'Custom Test', 'Custom-defined business rule', 'Custom SQL Query Test', 'Errors were detected according to test definition.', 'Errors found', 'Count of errors identified by query', NULL, 'Test Focus', 'Specify a brief descriptor of the focus of this test that is unique within this Test Suite for the Table and Test Type. This distinguishes this test from others of the same type on the same table. \n\nExample: if you are testing whether product_code is found in the related table called dim_products', 'custom_query', NULL, 'Custom SQL Query Returning Error Records', 'Query should return records indicating one or more errors. The test passes if no records are returned. Results of the query will be shown when you click `Review Source Data` for a failed test, so be sure to include enough data in your results to follow-up. \n\nA query can refer to any tables in the database. 
You must hard-code the schema or use `{DATA_SCHEMA}` to represent the schema defined for the Table Group.', 'Fail', 'QUERY', 'custom', 'Accuracy', 'Data Drift', 'Expected count of errors found by custom query', 'This business-rule test is highly flexible, covering any error state that can be expressed by a SQL query against one or more tables in the database. In operation, the user-defined query is embedded within a parent query returning the count of error rows identified. Any row returned by the query is interpreted as a single error condition in the test. Note that this query is run independently of other tests, and that performance will be slower, depending in large part on the efficiency of the query you write. Interpretation is based on the user-defined meaning of the test. Your query might be written to return errors in individual rows identified by joining tables. Or it might return an error based on a multi-column aggregate condition returning a single row if an error is found. This query is run separately when you click `Review Source Data` from Test Results, so be sure to include enough data in your results to follow-up. Interpretation is based on the user-defined meaning of the test.', 'Y'), + ('1008', 'CUSTOM', 'Custom Test', 'Custom-defined business rule', 'Custom SQL Query Test', 'Errors were detected according to test definition.', 'Errors found', 'Count of errors identified by query', NULL, 'Test Focus', 'Specify a brief descriptor of the focus of this test that is unique within this Test Suite for the Table and Test Type. This distinguishes this test from others of the same type on the same table. Example: `Order Total Matches Detail` if you are testing that the total in one table matches the sum of lines in another.', 'custom_query', NULL, 'Custom SQL Query Returning Error Records', 'Query should return records indicating one or more errors. The test passes if no records are returned. 
Results of the query will be shown when you click `Review Source Data` for a failed test, so be sure to include enough data in your results to follow-up. \n\nA query can refer to any tables in the database. You must hard-code the schema or use `{DATA_SCHEMA}` to represent the schema defined for the Table Group.', 'Fail', 'QUERY', 'custom', 'Accuracy', 'Data Drift', 'Expected count of errors found by custom query', 'This business-rule test is highly flexible, covering any error state that can be expressed by a SQL query against one or more tables in the database. In operation, the user-defined query is embedded within a parent query returning the count of error rows identified. Any row returned by the query is interpreted as a single error condition in the test. Note that this query is run independently of other tests, and that performance will be slower, depending in large part on the efficiency of the query you write. Interpretation is based on the user-defined meaning of the test. Your query might be written to return errors in individual rows identified by joining tables. Or it might return an error based on a multi-column aggregate condition returning a single row if an error is found. This query is run separately when you click `Review Source Data` from Test Results, so be sure to include enough data in your results to follow-up. Interpretation is based on the user-defined meaning of the test.', 'Y'), ('1500', 'Aggregate_Balance', 'Aggregate Balance', 'Aggregate values per group match reference', 'Tests for exact match in aggregate values for each set of column values vs. 
reference dataset', 'Aggregate measure per set of column values does not exactly match reference dataset.', 'Mismatched measures', NULL, NULL, 'Aggregate Expression', 'Specify an aggregate column expression: one of `SUM([column_name])` or `COUNT([column_name])`', 'subset_condition,groupby_names,having_condition,match_schema_name,match_table_name,match_column_names,match_subset_condition,match_groupby_names,match_having_condition', NULL, 'Record Subset Condition,Grouping Columns,Group Subset Condition,Matching Schema Name,Matching Table Name,Matching Aggregate Expression,Matching Record Subset Condition,Matching Grouping Columns,Matching Group Subset Condition', 'Condition defining a subset of records in main table (e.g. WHERE clause) - OPTIONAL|Category columns in main table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in main table (e.g. HAVING clause) - OPTIONAL|Schema location of matching table|Matching table name|Agregate column expression in matching table: one of `SUM([column_name])` or `COUNT([column_name])`|Condition defining a subset of records in matching table (e.g. WHERE clause) - OPTIONAL|Category columns in matching table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in matching table (e.g. HAVING clause) - OPTIONAL', 'Fail', 'QUERY', 'referential', 'Consistency', 'Data Drift', 'Expected count of group totals not matching aggregate value', 'This test compares sums or counts of a column rolled up to one or more category combinations. It''s ideal for confirming that two datasets exactly match -- that the sum of a measure or count of a value hasn''t changed or shifted between categories. Use this test to compare a raw and processed version of the same dataset, or to confirm that an aggregated table exactly matches the detail table that it''s built from. An error here means that one or more value combinations fail to match. 
New categories or combinations will cause failure.', 'Y'), ('1501', 'Aggregate_Minimum', 'Aggregate Minimum', 'Aggregate values per group are at or above reference', 'Tests that aggregate values for each set of column values are at least the same as reference dataset', 'Aggregate measure per set of column values is not at least the same as reference dataset.', 'Mismatched measures', NULL, NULL, 'Aggregate Expression', 'Specify an aggregate column expression: one of `SUM([column_name])` or `COUNT([column_name])`', 'subset_condition,groupby_names,having_condition,match_schema_name,match_table_name,match_column_names,match_subset_condition,match_groupby_names,match_having_condition', NULL, 'Record Subset Condition,Grouping Columns,Group Subset Condition,Matching Schema Name,Matching Table Name,Matching Aggregate Expression,Matching Record Subset Condition,Matching Grouping Columns,Matching Group Subset Condition', 'Condition defining a subset of records in main table (e.g. WHERE clause) - OPTIONAL,Category columns in main table separated by commas (e.g. GROUP BY columns),Condition defining a subset of aggregate records in main table (e.g. HAVING clause) - OPTIONAL,Schema location of matching table,Matching table name,Aggregate column expression in matching table (e.g. `SUM(sales)`),Condition defining a subset of records in matching table (e.g. WHERE clause) - OPTIONAL,Category columns in matching table separated by commas (e.g. GROUP BY columns),Condition defining a subset of aggregate records in matching table (e.g. HAVING clause) - OPTIONAL', 'Fail', 'QUERY', 'referential', 'Accuracy', 'Data Drift', 'Expected count of group totals below aggregate value', 'This test compares sums or counts of a column rolled up to one or more category combinations, but requires a match or increase in the aggregate value, rather than an exact match. Use it to confirm that aggregate values have not dropped for any set of categories, even if some values may rise. 
This test is useful to compare an older and newer version of a cumulative dataset. An error here means that one or more values per category set fail to match or exceed the prior dataset. New categories or combinations are allowed (but can be restricted independently with a Combo_Match test). Both tables must be present to run this test.', 'Y'), diff --git a/testgen/template/profiling/functional_datatype.sql b/testgen/template/profiling/functional_datatype.sql index fd4775e..716b0d0 100644 --- a/testgen/template/profiling/functional_datatype.sql +++ b/testgen/template/profiling/functional_datatype.sql @@ -220,12 +220,47 @@ SET functional_data_type = WHERE profile_run_id = '{PROFILE_RUN_ID}' AND functional_data_type IS NULL; +-- Update City based on position of State and Zip +UPDATE profile_results + SET functional_data_type = 'City' + FROM profile_results c +INNER JOIN profile_results z + ON (c.profile_run_id = z.profile_run_id + AND c.table_name = z.table_name + AND c.position + 2 = z.position + AND 'Zip' = z.functional_data_type) +INNER JOIN profile_results s + ON (c.profile_run_id = s.profile_run_id + AND c.table_name = s.table_name + AND c.position + 1 = s.position + AND 'State' = s.functional_data_type) + WHERE c.profile_run_id = '09684d25-d8dd-47d0-8c02-9a6caf7c4a61' + AND c.column_name SIMILAR TO '%c(|i)ty%' + AND c.functional_data_type NOT IN ('State', 'Zip') + AND profile_results.id = c.id; + -- Assign Name UPDATE profile_results SET functional_data_type = 'Person Name' WHERE profile_run_id = '{PROFILE_RUN_ID}' AND functional_data_type IS NULL - AND column_name ~ '^(approver|first|last|full|contact|emp|employee|hcp|manager|mgr_|middle|nick|party|person|preferred|rep|reviewer|salesperson|spouse)(_| |)name$'; + AND column_name ~ '^(approver|full|contact|emp|employee|hcp|manager|mgr_|middle|nick|party|person|preferred|rep|reviewer|salesperson|spouse)(_| |)name$'; + +-- Assign First Name +UPDATE profile_results + SET functional_data_type = 'Person First Name' 
+WHERE profile_run_id = '{PROFILE_RUN_ID}' + AND column_name SIMILAR TO '%f(|i)rst%n(|a)m%%' + AND avg_length <= 8 + AND avg_embedded_spaces < 0.2; + +-- Assign Last Name +UPDATE profile_results + SET functional_data_type = 'Person Last Name' +WHERE profile_run_id = '{PROFILE_RUN_ID}' + AND column_name SIMILAR TO '%l(|a)st%n(|a)m%' + AND avg_length BETWEEN 5 and 8 + AND avg_embedded_spaces < 0.2; UPDATE profile_results SET functional_data_type = 'Entity Name' diff --git a/testgen/ui/views/test_definitions.py b/testgen/ui/views/test_definitions.py index ff866d4..6d0dc58 100644 --- a/testgen/ui/views/test_definitions.py +++ b/testgen/ui/views/test_definitions.py @@ -470,8 +470,14 @@ def show_add_edit_modal( ) # column_name - column_name_label = selected_test_type_row["column_name_prompt"] - column_name_help = selected_test_type_row["column_name_help"] + if selected_test_type_row["column_name_prompt"]: + column_name_label = selected_test_type_row["column_name_prompt"] + else: + column_name_label = "Test Focus" + if selected_test_type_row["column_name_help"]: + column_name_help = selected_test_type_row["column_name_help"] + else: + column_name_help = "Help is not available" if test_scope == "table": test_definition["column_name"] = None From bc0865548a8766575268d3f8234f053f217e7f80 Mon Sep 17 00:00:00 2001 From: Charles Bloche Date: Tue, 18 Jun 2024 12:56:31 +0000 Subject: [PATCH 10/22] Update file functional_datatype.sql --- testgen/template/profiling/functional_datatype.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testgen/template/profiling/functional_datatype.sql b/testgen/template/profiling/functional_datatype.sql index 716b0d0..65f187e 100644 --- a/testgen/template/profiling/functional_datatype.sql +++ b/testgen/template/profiling/functional_datatype.sql @@ -234,7 +234,7 @@ INNER JOIN profile_results s AND c.table_name = s.table_name AND c.position + 1 = s.position AND 'State' = s.functional_data_type) - WHERE c.profile_run_id = 
'09684d25-d8dd-47d0-8c02-9a6caf7c4a61' + WHERE c.profile_run_id = '{PROFILE_RUN_ID}' AND c.column_name SIMILAR TO '%c(|i)ty%' AND c.functional_data_type NOT IN ('State', 'Zip') AND profile_results.id = c.id; From 8d103157d75ff119094004aa06509cd0dcdd69b0 Mon Sep 17 00:00:00 2001 From: Alex Fernandez Date: Wed, 19 Jun 2024 09:59:17 -0300 Subject: [PATCH 11/22] Re-enable incremental script testing --- testgen/common/database/database_service.py | 4 ++-- testgen/common/read_file.py | 8 +++++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/testgen/common/database/database_service.py b/testgen/common/database/database_service.py index 05b4865..bbed234 100644 --- a/testgen/common/database/database_service.py +++ b/testgen/common/database/database_service.py @@ -601,8 +601,8 @@ def replace_params(query: str, params_mapping: dict) -> str: return query -def get_queries_for_command(sub_directory: str, params_mapping: dict, mask: str = r"^.*sql$") -> list[str]: - files = sorted(get_template_files(mask=mask, sub_directory=sub_directory), key=lambda key: str(key)) +def get_queries_for_command(sub_directory: str, params_mapping: dict, mask: str = r"^.*sql$", path: str | None = None) -> list[str]: + files = sorted(get_template_files(mask=mask, sub_directory=sub_directory, path=path), key=lambda key: str(key)) queries = [] for file in files: diff --git a/testgen/common/read_file.py b/testgen/common/read_file.py index 8960381..dda3ff8 100644 --- a/testgen/common/read_file.py +++ b/testgen/common/read_file.py @@ -15,8 +15,10 @@ def _get_template_package_resource( template_file_name: str | None = None, sub_directory: str | None = None, + path: str | None = None, ) -> Traversable: - path = "testgen.template" + if path is None: + path = "testgen.template" if sub_directory: path = f"{path}.{sub_directory.replace('/', '.')}" if template_file_name: @@ -40,8 +42,8 @@ def read_template_sql_file(template_file_name: str, sub_directory: str | None = return contents -def 
get_template_files(mask: str, sub_directory: str | None = None) -> Generator[Traversable, None, None]: - folder = _get_template_package_resource(template_file_name=None, sub_directory=sub_directory) +def get_template_files(mask: str, sub_directory: str | None = None, path: str | None = None) -> Generator[Traversable, None, None]: + folder = _get_template_package_resource(template_file_name=None, sub_directory=sub_directory, path=path) LOG.debug("Reading SQL folder resource: %s", str(folder)) for entry in folder.iterdir(): if entry.is_file() and re.search(mask, str(entry)): From 08010485cddf55013ae0760bcd1d9b3ca981b340 Mon Sep 17 00:00:00 2001 From: Alex Fernandez Date: Thu, 20 Jun 2024 10:14:31 -0300 Subject: [PATCH 12/22] wildcard % doubled after quick-start --- testgen/settings.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/testgen/settings.py b/testgen/settings.py index d3e71c6..09762bf 100644 --- a/testgen/settings.py +++ b/testgen/settings.py @@ -314,40 +314,40 @@ from env variable: `DEFAULT_PROFILING_TABLE_SET` """ -DEFAULT_PROFILING_INCLUDE_MASK = os.getenv("DEFAULT_PROFILING_INCLUDE_MASK", "%%") +DEFAULT_PROFILING_INCLUDE_MASK = os.getenv("DEFAULT_PROFILING_INCLUDE_MASK", "%") """ A SQL filter supported by the project database's `LIKE` operator for table names to include. from env variable: `DEFAULT_PROFILING_INCLUDE_MASK` -defaults to: `%%` +defaults to: `%` """ -DEFAULT_PROFILING_EXCLUDE_MASK = os.getenv("DEFAULT_PROFILING_EXCLUDE_MASK", "tmp%%") +DEFAULT_PROFILING_EXCLUDE_MASK = os.getenv("DEFAULT_PROFILING_EXCLUDE_MASK", "tmp%") """ A SQL filter supported by the project database's `LIKE` operator for table names to exclude. 
from env variable: `DEFAULT_PROFILING_EXCLUDE_MASK` -defaults to: `tmp%%` +defaults to: `tmp%` """ -DEFAULT_PROFILING_ID_COLUMN_MASK = os.getenv("DEFAULT_PROFILING_ID_COLUMN_MASK", "%%id") +DEFAULT_PROFILING_ID_COLUMN_MASK = os.getenv("DEFAULT_PROFILING_ID_COLUMN_MASK", "%id") """ A SQL filter supported by the project database's `LIKE` operator representing ID columns. from env variable: `DEFAULT_PROFILING_ID_COLUMN_MASK` -defaults to: `%%id` +defaults to: `%id` """ -DEFAULT_PROFILING_SK_COLUMN_MASK = os.getenv("DEFAULT_PROFILING_SK_COLUMN_MASK", "%%sk") +DEFAULT_PROFILING_SK_COLUMN_MASK = os.getenv("DEFAULT_PROFILING_SK_COLUMN_MASK", "%sk") """ A SQL filter supported by the project database's `LIKE` operator representing surrogate key columns. from env variable: `DEFAULT_PROFILING_SK_COLUMN_MASK` -defaults to: `%%sk` +defaults to: `%sk` """ DEFAULT_PROFILING_USE_SAMPLING: str = os.getenv("DEFAULT_PROFILING_USE_SAMPLING", "N") From 5999c81e912a5a3dd3f29f6053ce7825bf291252 Mon Sep 17 00:00:00 2001 From: Alex Fernandez Date: Fri, 21 Jun 2024 13:48:05 -0300 Subject: [PATCH 13/22] [UI] [OPEN] Create QC Title is showing up twice --- testgen/common/logs.py | 5 ++++- testgen/ui/views/connections.py | 4 ++-- testgen/ui/views/table_groups.py | 14 +++++++++----- testgen/ui/views/test_definitions.py | 11 ++++++++--- testgen/ui/views/test_results.py | 2 +- testgen/ui/views/test_suites.py | 27 +++++++++++++++++---------- 6 files changed, 41 insertions(+), 22 deletions(-) diff --git a/testgen/common/logs.py b/testgen/common/logs.py index bc0799a..4f566b9 100644 --- a/testgen/common/logs.py +++ b/testgen/common/logs.py @@ -25,7 +25,10 @@ def configure_logging( formatter = logging.Formatter(log_format) console_out_handler = logging.StreamHandler(stream=sys.stdout) - console_out_handler.setLevel(logging.WARNING) + if settings.IS_DEBUG: + console_out_handler.setLevel(level) + else: + console_out_handler.setLevel(logging.WARNING) console_out_handler.setFormatter(formatter) 
console_err_handler = logging.StreamHandler(stream=sys.stderr) diff --git a/testgen/ui/views/connections.py b/testgen/ui/views/connections.py index 5ae7736..1a029b3 100644 --- a/testgen/ui/views/connections.py +++ b/testgen/ui/views/connections.py @@ -52,7 +52,7 @@ def render(self) -> None: session.current_page_args = {"connection_id": connection["connection_id"]} st.experimental_rerun() - create_qc_schema_modal = testgen.Modal("Create QC utility schema", "dk-create-qc-schema-modal", max_width=1100) + create_qc_schema_modal = testgen.Modal(title=None, key="dk-create-qc-schema-modal", max_width=1100) _, col2 = st.columns([70, 30]) @@ -77,7 +77,7 @@ def render(self) -> None: def show_create_qc_schema_modal(modal, selected_connection): with modal.container(): - fm.render_modal_header("Create QC Utility Schema", selected_connection["project_qc_schema"]) + fm.render_modal_header("Create QC Utility Schema", None) with st.form("Create QC Utility Schema", clear_on_submit=False): skip_schema_creation = st.toggle("Skip schema creation -- create utility functions in existing QC Schema") skip_granting_privileges = st.toggle("Skip granting privileges") diff --git a/testgen/ui/views/table_groups.py b/testgen/ui/views/table_groups.py index 42a6a34..cfe91ff 100644 --- a/testgen/ui/views/table_groups.py +++ b/testgen/ui/views/table_groups.py @@ -71,13 +71,13 @@ def render(self, connection_id: int | None = None) -> None: selected = fm.render_grid_select(df, show_columns, show_column_headers=show_column_headers) - add_modal = testgen.Modal("Add Table Group", "dk-add-table-group-modal", max_width=1100) - edit_modal = testgen.Modal("Edit Table Group", "dk-edit-table-group-modal", max_width=1100) - delete_modal = testgen.Modal("Delete Table Group", "dk-delete-table-group-modal", max_width=1100) + add_modal = testgen.Modal(title=None, key="dk-add-table-group-modal", max_width=1100) + edit_modal = testgen.Modal(title=None, key="dk-edit-table-group-modal", max_width=1100) + delete_modal = 
testgen.Modal(title=None, key="dk-delete-table-group-modal", max_width=1100) profile_cli_command_modal = testgen.Modal( - "Profiling CLI Command", "dk-profiling-cli-command-modal", max_width=1100 + title=None, key="dk-profiling-cli-command-modal", max_width=1100 ) - profile_command_modal = testgen.Modal("Profiling Command", "dk-profiling-command-modal", max_width=1100) + profile_command_modal = testgen.Modal(title=None, key="dk-profiling-command-modal", max_width=1100) if tool_bar.short_slots[1].button( "➕ Add", help="Add a new Table Group", use_container_width=True # NOQA RUF001 @@ -182,6 +182,7 @@ def show_profile_command(modal, selected): selected_table_group = selected[0] with modal.container(): + fm.render_modal_header("Profiling Command", None) container = st.empty() with container: st.markdown( @@ -215,6 +216,7 @@ def show_profile_command(modal, selected): def show_profile_cli_command(modal, selected): with modal.container(): + fm.render_modal_header("Profiling CLI Command", None) selected_table_group = selected[0] table_group_id = selected_table_group["id"] profile_command = f"testgen run-profile --table-group-id {table_group_id}" @@ -225,6 +227,7 @@ def show_delete_modal(modal, selected=None): selected_table_group = selected[0] with modal.container(): + fm.render_modal_header("Delete Table Group", None) table_group_id = selected_table_group["id"] table_group_name = selected_table_group["table_groups_name"] @@ -269,6 +272,7 @@ def show_delete_modal(modal, selected=None): def show_add_or_edit_modal(modal, mode, project_code, connection, selected=None): connection_id = connection["connection_id"] with modal.container(): + fm.render_modal_header("Edit Table Group" if mode == "edit" else "Add Table Group", None) table_groups_settings_tab, table_groups_preview_tab = st.tabs(["Table Group Settings", "Test"]) with table_groups_settings_tab: diff --git a/testgen/ui/views/test_definitions.py b/testgen/ui/views/test_definitions.py index 6d0dc58..74b1af7 100644 --- 
a/testgen/ui/views/test_definitions.py +++ b/testgen/ui/views/test_definitions.py @@ -1,3 +1,4 @@ +import logging import time import typing @@ -16,6 +17,8 @@ from testgen.ui.services.string_service import empty_if_null, snake_case_to_title_case from testgen.ui.session import session +LOG = logging.getLogger("testgen") + class TestDefinitionsPage(Page): path = "tests/definitions" @@ -49,10 +52,10 @@ def render(self, **_) -> None: tool_bar = tb.ToolBar(5, 6, 4, None, multiline=True) - add_test_definition_modal = testgen.Modal("Add Test Definition", "dk-add-test-definition", max_width=1100) - edit_test_definition_modal = testgen.Modal("Edit Test Definition", "dk-edit-test-definition", max_width=1100) + add_test_definition_modal = testgen.Modal(title=None, key="dk-add-test-definition", max_width=1100) + edit_test_definition_modal = testgen.Modal(title=None, key="dk-edit-test-definition", max_width=1100) delete_test_definition_modal = testgen.Modal( - "Delete Test Definition", "dk-delete-test-definition", max_width=1100 + title=None, key="dk-delete-test-definition", max_width=1100 ) with tool_bar.long_slots[0]: @@ -180,6 +183,7 @@ class TestDefinitionsPageFromSuite(TestDefinitionsPage): def show_delete_modal(modal, selected_test_definition=None): with modal.container(): + fm.render_modal_header("Delete Test", None) test_definition_id = selected_test_definition["id"] test_name_short = selected_test_definition["test_name_short"] @@ -253,6 +257,7 @@ def show_add_edit_modal( selected_test_def=None, ): with test_definition_modal.container(): + fm.render_modal_header("Add Test" if mode == "add" else "Edit Test", None) # test_type logic if mode == "add": selected_test_type, selected_test_type_row = prompt_for_test_type() diff --git a/testgen/ui/views/test_results.py b/testgen/ui/views/test_results.py index 105603d..8e58124 100644 --- a/testgen/ui/views/test_results.py +++ b/testgen/ui/views/test_results.py @@ -908,7 +908,7 @@ def view_profiling(button_container, 
str_table_name, str_column_name, str_table_ def view_edit_test(button_container, test_definition_id): - edit_test_definition_modal = testgen.Modal(title="Edit Test", key="dk-test-definition-edit-modal", max_width=1100) + edit_test_definition_modal = testgen.Modal(title=None, key="dk-test-definition-edit-modal", max_width=1100) with button_container: if st.button("🖊️ Edit Test", help="Edit the Test Definition", use_container_width=True): edit_test_definition_modal.open() diff --git a/testgen/ui/views/test_suites.py b/testgen/ui/views/test_suites.py index aa1b9f6..0d27907 100644 --- a/testgen/ui/views/test_suites.py +++ b/testgen/ui/views/test_suites.py @@ -70,22 +70,22 @@ def render(self, connection_id: str | None = None, table_group_id: str | None = selected = fm.render_grid_select(df, show_columns) - add_modal = testgen.Modal("Add Test Suite", "dk-add-test_suite-modal", max_width=1100) - edit_modal = testgen.Modal("Edit Test Suite", "dk-edit-test_suite-modal", max_width=1100) - delete_modal = testgen.Modal("Delete Test Suite", "dk-delete-test_suite-modal", max_width=1100) - run_tests_command_modal = testgen.Modal("Run Test Execution", "dk-run-tests-command-modal", max_width=1100) + add_modal = testgen.Modal(title=None, key="dk-add-test_suite-modal", max_width=1100) + edit_modal = testgen.Modal(title=None, key="dk-edit-test_suite-modal", max_width=1100) + delete_modal = testgen.Modal(title=None, key="dk-delete-test_suite-modal", max_width=1100) + run_tests_command_modal = testgen.Modal(title=None, key="dk-run-tests-command-modal", max_width=1100) show_test_run_command_modal = testgen.Modal( - "Test Execution Command for CLI", "dk-show-test-run-command-modal", max_width=1100 + title=None, key="dk-show-test-run-command-modal", max_width=1100 ) - run_test_generation_modal = testgen.Modal("Run Test Generation", "dk-run-test-generation-modal", max_width=1100) + run_test_generation_modal = testgen.Modal(title=None, key="dk-run-test-generation-modal", max_width=1100) 
show_run_test_generation_modal = testgen.Modal( - "Test Generation Command for CLI", "dk-show-test-generation-modal", max_width=1100 + title=None, key="dk-show-test-generation-modal", max_width=1100 ) - run_export_command_modal = testgen.Modal("Run Observability Export", "dk-run-export-modal", max_width=1100) + run_export_command_modal = testgen.Modal(title=None, key="dk-run-export-modal", max_width=1100) show_export_command_modal = testgen.Modal( - "Observability Export Command for CLI", "dk-show-export-modal", max_width=1100 + title=None, key="dk-show-export-modal", max_width=1100 ) if tool_bar.short_slots[1].button("➕ Add", help="Add a new Test Run", use_container_width=True): # NOQA RUF001 @@ -236,6 +236,7 @@ def show_run_test_generation(modal, selected): selected_test_suite = selected[0] with modal.container(): + fm.render_modal_header("Run Test Generation", None) container = st.empty() with container: st.markdown(":green[**Execute Test Generation for the Test Suite**]") @@ -304,6 +305,7 @@ def show_delete_modal(modal, selected=None): selected_test_suite = selected[0] with modal.container(): + fm.render_modal_header("Delete Test Suite", None) test_suite_name = selected_test_suite["test_suite"] can_be_deleted = test_suite_service.cascade_delete([test_suite_name], dry_run=True) @@ -319,7 +321,6 @@ def show_delete_modal(modal, selected=None): int_data_width=700, ) - if not can_be_deleted: st.markdown( ":orange[This Test Suite has related data, which includes test definitions and may include test results. If you proceed, all related data will be permanently deleted.
Are you sure you want to proceed?]", @@ -349,6 +350,7 @@ def show_add_or_edit_modal(modal, mode, project_code, connection, table_group, s connection_id = connection["connection_id"] table_group_id = table_group["id"] with modal.container(): + fm.render_modal_header("Edit Test Suite" if mode == "edit" else "Add Test Suite", None) severity_options = ["Inherit", "Failed", "Warning"] selected_test_suite = selected[0] if mode == "edit" else None @@ -450,6 +452,7 @@ def run_tests(modal, project_code, selected): selected_test_suite = selected[0] with modal.container(): + fm.render_modal_header("Run Test Execution", None) container = st.empty() with container: st.markdown(":green[**Run Tests for the Test Suite**]") @@ -481,6 +484,7 @@ def run_tests(modal, project_code, selected): def show_test_run_command(modal, project_code, selected): with modal.container(): + fm.render_modal_header("Test Execution Command for CLI", None) selected_test_suite = selected[0] test_suite_name = selected_test_suite["test_suite"] command = f"testgen run-tests --project-key {project_code} --test-suite-key {test_suite_name}" @@ -489,6 +493,7 @@ def show_test_run_command(modal, project_code, selected): def show_test_generation_command(modal, selected): with modal.container(): + fm.render_modal_header("Test Generation Command for CLI", None) selected_test_suite = selected[0] test_suite_key = selected_test_suite["test_suite"] table_group_id = selected_test_suite["table_groups_id"] @@ -498,6 +503,7 @@ def show_test_generation_command(modal, selected): def show_export_command(modal, selected): with modal.container(): + fm.render_modal_header("Observability Export Command for CLI", None) selected_test_suite = selected[0] test_suite_key = selected_test_suite["test_suite"] project_key = selected_test_suite["project_code"] @@ -509,6 +515,7 @@ def run_export_command(modal, selected): selected_test_suite = selected[0] with modal.container(): + fm.render_modal_header("Run Observability Export", None) 
container = st.empty() with container: st.markdown(":green[**Execute the test export for the current Test Suite**]") From 9ae795053360a61c982938f7385077d33e33b36c Mon Sep 17 00:00:00 2001 From: "Chip.Bloche" Date: Tue, 16 Jul 2024 09:04:05 -0400 Subject: [PATCH 14/22] Improved profiling functionality, test definition uniqueness --- .../commands/queries/generate_tests_query.py | 2 + testgen/commands/run_execute_tests.py | 2 +- testgen/commands/run_generate_tests.py | 3 +- .../030_initialize_new_schema_structure.sql | 21 ++- .../050_populate_new_schema_metadata.sql | 30 ++-- .../dbsetup/060_create_standard_views.sql | 55 ++++--- .../dbupgrade/0105_incremental_upgrade.sql | 41 +++++ .../ex_finalize_test_run_results.sql | 1 + .../project_profiling_query_mssql.yaml | 2 +- .../project_ddf_query_postgresql.sql | 10 +- .../project_profiling_query_postgresql.yaml | 2 +- .../profiling/project_ddf_query_redshift.sql | 10 +- .../project_profiling_query_redshift.yaml | 2 +- .../project_profiling_query_snowflake.yaml | 2 +- .../project_profiling_query_trino.yaml | 2 +- .../gen_funny_cat_tests/gen_test_constant.sql | 5 +- .../gen_test_distinct_value_ct.sql | 5 +- .../gen_funny_cat_tests/gen_test_row_ct.sql | 5 +- .../gen_test_row_ct_pct.sql | 5 +- .../gen_retrieve_or_insert_test_suite.sql | 58 +++++++ .../generation/gen_standard_tests.sql | 3 +- testgen/template/parms/parms_test_gen.sql | 1 + .../profiling/functional_datatype.sql | 151 ++++++++++++------ .../profiling/secondary_profiling_columns.sql | 11 +- .../ex_get_test_column_list_tg.sql | 94 +++++++---- testgen/ui/queries/table_group_queries.py | 29 +++- testgen/ui/queries/test_definition_queries.py | 22 +-- .../ui/services/test_definition_service.py | 8 - testgen/ui/views/profiling_anomalies.py | 25 +-- testgen/ui/views/profiling_modal.py | 40 +++++ testgen/ui/views/table_groups.py | 70 ++++++++ testgen/ui/views/test_definitions.py | 34 ++-- testgen/ui/views/test_results.py | 72 ++++----- testgen/ui/views/test_suites.py | 2 
+- 34 files changed, 566 insertions(+), 259 deletions(-) create mode 100644 testgen/template/dbupgrade/0105_incremental_upgrade.sql create mode 100644 testgen/template/generation/gen_retrieve_or_insert_test_suite.sql create mode 100644 testgen/ui/views/profiling_modal.py diff --git a/testgen/commands/queries/generate_tests_query.py b/testgen/commands/queries/generate_tests_query.py index 696598f..73fbb32 100644 --- a/testgen/commands/queries/generate_tests_query.py +++ b/testgen/commands/queries/generate_tests_query.py @@ -13,6 +13,7 @@ class CDeriveTestsSQL: table_groups_id = "" data_schema = "" test_suite = "" + test_suite_id = "" generation_set = "" as_of_date = "" sql_flavor = "" @@ -38,6 +39,7 @@ def ReplaceParms(self, strInputString): strInputString = strInputString.replace("{TABLE_GROUPS_ID}", self.table_groups_id) strInputString = strInputString.replace("{RUN_DATE}", self.run_date) strInputString = strInputString.replace("{TEST_SUITE}", self.test_suite) + strInputString = strInputString.replace("{TEST_SUITE_ID}", self.test_suite_id) strInputString = strInputString.replace("{GENERATION_SET}", self.generation_set) strInputString = strInputString.replace("{AS_OF_DATE}", self.as_of_date) strInputString = strInputString.replace("{DATA_SCHEMA}", self.data_schema) diff --git a/testgen/commands/run_execute_tests.py b/testgen/commands/run_execute_tests.py index 43ab72d..ff0e3cf 100644 --- a/testgen/commands/run_execute_tests.py +++ b/testgen/commands/run_execute_tests.py @@ -93,7 +93,7 @@ def run_test_queries(strTestRunID, strTestTime, strProjectCode, strTestSuite, mi if intErrors > 0: booErrors = True error_msg = ( - f"Errors were encountered executing aggregate tests. ({intErrors} errors occurred.) " + f"Errors were encountered executing Referential Tests. ({intErrors} errors occurred.) " "Please check log. 
" ) LOG.warning(error_msg) diff --git a/testgen/commands/run_generate_tests.py b/testgen/commands/run_generate_tests.py index 266a52a..922ce46 100644 --- a/testgen/commands/run_generate_tests.py +++ b/testgen/commands/run_generate_tests.py @@ -18,7 +18,7 @@ def run_test_gen_queries(strTableGroupsID, strTestSuite, strGenerationSet=None): LOG.info("CurrentStep: Retrieving General Parameters for Test Suite " + strTestSuite) dctParms = RetrieveTestGenParms(strTableGroupsID, strTestSuite) - # Set Project Connection Parms in db_bridgers from retrieved parms + # Set Project Connection Parms from retrieved parms LOG.info("CurrentStep: Assigning Connection Parameters") AssignConnectParms( dctParms["project_code"], @@ -37,6 +37,7 @@ def run_test_gen_queries(strTableGroupsID, strTestSuite, strGenerationSet=None): # Set static parms clsTests.project_code = dctParms["project_code"] clsTests.test_suite = strTestSuite + clsTests.test_suite_id = dctParms["test_suite_id"] clsTests.generation_set = strGenerationSet if strGenerationSet is not None else "" clsTests.connection_id = str(dctParms["connection_id"]) clsTests.table_groups_id = strTableGroupsID diff --git a/testgen/template/dbsetup/030_initialize_new_schema_structure.sql b/testgen/template/dbsetup/030_initialize_new_schema_structure.sql index d29aef3..46cb0b3 100644 --- a/testgen/template/dbsetup/030_initialize_new_schema_structure.sql +++ b/testgen/template/dbsetup/030_initialize_new_schema_structure.sql @@ -84,7 +84,14 @@ CREATE TABLE table_groups profile_sample_min_count BIGINT DEFAULT 100000, profiling_delay_days VARCHAR(3) DEFAULT '0' , profile_do_pair_rules VARCHAR(3) DEFAULT 'N', - profile_pair_rule_pct INTEGER DEFAULT 95 + profile_pair_rule_pct INTEGER DEFAULT 95, + data_source VARCHAR(40), + source_system VARCHAR(40), + data_location VARCHAR(40), + source_process VARCHAR(40), + business_domain VARCHAR(40), + stakeholder_group VARCHAR(40), + transform_level VARCHAR(40) ); CREATE TABLE profiling_runs ( @@ -139,8 
+146,9 @@ CREATE TABLE test_definitions ( PRIMARY KEY, project_code VARCHAR(30), table_groups_id UUID, - profile_run_id UUID, + profile_run_id UUID, test_type VARCHAR(200), + test_suite_id UUID, test_suite VARCHAR(200), test_description VARCHAR(1000), test_action VARCHAR(100), @@ -354,7 +362,7 @@ CREATE TABLE data_table_chars ( source_process VARCHAR(40), business_domain VARCHAR(40), stakeholder_group VARCHAR(40), - transformation_level VARCHAR(40), + transform_level VARCHAR(40), aggregation_level VARCHAR(40), add_date TIMESTAMP, drop_date TIMESTAMP, @@ -509,6 +517,7 @@ CREATE TABLE test_results ( test_type VARCHAR(50) CONSTRAINT test_results_test_types_test_type_fk REFERENCES test_types, + test_suite_id UUID, test_suite VARCHAR(200), test_definition_id UUID, auto_gen BOOLEAN, @@ -643,6 +652,9 @@ CREATE UNIQUE INDEX uix_td_id CREATE INDEX ix_td_tg ON test_definitions(table_groups_id); +CREATE INDEX ix_td_ts_tc + ON test_definitions(test_suite_id, table_name, column_name, test_type); + -- Index test_runs CREATE INDEX ix_trun_pc_ts_time ON test_runs(project_code, test_suite, test_starttime); @@ -666,6 +678,9 @@ CREATE INDEX ix_tr_tt CREATE INDEX ix_tr_pc_sctc_tt ON test_results(project_code, test_suite, schema_name, table_name, column_names, test_type); +CREATE INDEX ix_tr_ts_tctt + ON test_results(test_suite_id, table_name, column_names, test_type); + -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -- PROFILING OPTIMIZATION -- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/testgen/template/dbsetup/050_populate_new_schema_metadata.sql b/testgen/template/dbsetup/050_populate_new_schema_metadata.sql index f08ba06..0b6869e 100644 --- a/testgen/template/dbsetup/050_populate_new_schema_metadata.sql +++ b/testgen/template/dbsetup/050_populate_new_schema_metadata.sql @@ -112,7 +112,7 @@ VALUES ('1004', 'Alpha_Trunc', 'Alpha Truncation', 'Maximum character count con ('1015', 'Future_Date', 'Past Dates', 
'Latest date is prior to test run date', 'Tests that the maximum date referenced in the column is no greater than the test date, consistent with baseline data', 'Future date found when absent in baseline data.', 'Future dates', NULL, 'general_type=''D''AND future_date_ct = 0', NULL, NULL, 'threshold_value', '0', 'Maximum Future Date Count', NULL, 'Fail', 'CAT', 'column', 'Timeliness', 'Recency', 'Expected count of future dates', NULL, 'Y'), ('1016', 'Future_Date_1Y', 'Future Year', 'Future dates within year of test run date', 'Tests that the maximum date referenced in the column is no greater than one year beyond the test date, consistent with baseline data', 'Future date beyond one-year found when absent in baseline.', 'Future dates post 1 year', NULL, 'general_type=''D''AND future_date_ct > 0 AND max_date <=''{AS_OF_DATE}''::DATE + INTERVAL''365 DAYS''', NULL, NULL, 'threshold_value', '0', 'Maximum Post 1-Year Future Date Count', NULL, 'Fail', 'CAT', 'column', 'Timeliness', 'Recency', 'Expected count of future dates beyond one year', 'Future Year looks for date values in the column that extend beyond one year after the test date. This would be appropriate for transactional dates where you would expect to find dates in the near future, but not beyond one year ahead. Errors could indicate invalid entries or possibly dummy dates representing blank values.', 'Y'), ('1017', 'Incr_Avg_Shift', 'New Shift', 'New record mean is consistent with reference', 'Tests for statistically-significant shift in mean of new values for column compared to average calculated at baseline.', 'Significant shift in average of new values vs. 
baseline avg', 'Z-score of mean shift', 'Absolute Z-score (number of SD''s outside mean) of prior avg - incremental avg', 'general_type=''N'' AND distinct_value_ct > 10 AND functional_data_type ilike ''Measure%'' AND column_name NOT ilike ''%latitude%'' AND column_name NOT ilike ''%longitude%''', NULL, NULL, 'baseline_value_ct,baseline_sum,baseline_avg,baseline_sd,threshold_value', 'value_ct,(avg_value * value_ct)::FLOAT,avg_value,stdev_value,2', 'Value Count at Baseline,Sum at Baseline,Mean Value at Baseline,Std Deviation at Baseline,Threshold Max Z-Score', NULL, 'Warning', 'CAT', 'column', 'Accuracy', 'Data Drift', 'Maximum Z-Score (number of SD''s beyond mean) expected', 'This is a more sensitive test than Average Shift, because it calculates an incremental difference in the average of new values compared to the average of values at baseline. This is appropriate for a cumulative dataset only, because it calculates the average of new entries based on the assumption that the count and average of records present at baseline are still present at the time of the test. This test compares the mean of new values with the standard deviation of the baseline average to calculate a Z-score. If the new mean falls outside the Z-score threshold, a shift is detected. Potential Z-score thresholds may range from 0 to 3, depending on the sensitivity you prefer. A failed test could indicate a quality issue or a legitimate shift in new data that should be noted and assessed by business users. Consider this test along with Variability Increase. If variability rises too, process, methodology or measurement flaws could be at issue. 
If variability remains consistent, the problem is more likely to be with the source data itself.', 'Y'), - ('1018', 'LOV_All', 'All Values', 'List of expected values all present in column', 'Tests that all values match a pipe-delimited list of expected values and that all expected values are present', 'Column values found don''t exactly match the expected list of values', 'Values found', NULL, NULL, NULL, NULL, 'threshold_value', NULL, 'List of Expected Values', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'List of values expected, in form (''Val1'',''Val2)', 'This is a more restrictive form of Value Match, testing that all values in the dataset match the list provided, and also that all values present in the list appear at least once in the dataset. This would be appropriate for tables where all category values in the column are represented at least once.', 'Y'), + ('1018', 'LOV_All', 'Value Match All', 'List of expected values all present in column', 'Tests that all values match a pipe-delimited list of expected values and that all expected values are present', 'Column values found don''t exactly match the expected list of values', 'Values found', NULL, NULL, NULL, NULL, 'threshold_value', NULL, 'List of Expected Values', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'List of values expected, in form (''Val1'',''Val2)', 'This is a more restrictive form of Value Match, testing that all values in the dataset match the list provided, and also that all values present in the list appear at least once in the dataset. 
This would be appropriate for tables where all category values in the column are represented at least once.', 'Y'), ('1019', 'LOV_Match', 'Value Match', 'All column values present in expected list', 'Tests that all values in the column match the list-of-values identified in baseline data.', 'Values not matching expected List-of-Values from baseline.', 'Non-matching records', NULL, 'top_freq_values > '''' AND distinct_value_ct BETWEEN 2 and 10 AND NOT (functional_data_type LIKE ''%date%'' OR lower(datatype_suggestion) LIKE ''%date%'' OR general_type = ''D'' OR lower(column_name) IN (''file_name'', ''filename''))', NULL, NULL, 'baseline_value,threshold_value', '''('' || SUBSTRING( CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 2) > '''' THEN '','''''' || TRIM( REPLACE ( SPLIT_PART(top_freq_values, ''|'' , 2), '''''''' , '''''''''''' ) ) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 4) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 4), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 6) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 6), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 8) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 8), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 10) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 10), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 12) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 12), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 14) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 14), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN 
SPLIT_PART(top_freq_values, ''|'' , 16) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 16), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 18) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 18), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END || CASE WHEN SPLIT_PART(top_freq_values, ''|'' , 20) > '''' THEN '','''''' || TRIM(REPLACE(SPLIT_PART(top_freq_values, ''|'' , 20), '''''''' , '''''''''''' )) || '''''''' ELSE '''' END, 2, 999) || '')'',0', 'List of Expected Values,Threshold Error Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'List of values expected, in form (''Val1'',''Val2)', 'This tests that all values in the column match the hard-coded list provided. This is relevant when the list of allowable values is small and not expected to change often. Even if new values might occasionally be added, this test is useful for downstream data products to provide warning that assumptions and logic may need to change.', 'Y'), ('1020', 'Min_Date', 'Minimum Date', 'All dates on or after set minimum', 'Tests that the earliest date referenced in the column is no earlier than baseline data', 'The earliest date value found is before the earliest value at baseline.', 'Dates prior to limit', NULL, 'general_type=''D''and min_date IS NOT NULL AND distinct_value_ct > 1', NULL, NULL, 'baseline_value,threshold_value', 'min_date,0', 'Minimum Date at Baseline,Threshold Error Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected count of dates prior to minimum', 'This test is appropriate for a cumulative dataset only, because it assumes all prior values are still present. 
It''s appropriate where new records are added with more recent dates, but old dates dates do not change.', 'Y'), ('1021', 'Min_Val', 'Minimum Value', 'All values at or above set minimum', 'Tests that the minimum value present in the column is no lower than the minimum value in baseline data', 'Minimum column value less than baseline.', 'Values under limit', NULL, 'general_type=''N''and min_value IS NOT NULL AND (distinct_value_ct >= 2 OR (distinct_value_ct=2 and min_value<>0 and max_value<>1))', NULL, NULL, 'baseline_value,threshold_value', 'min_value,0', 'Minimum Value at Baseline,Threshold Error Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected count of values under limit', 'This test is appropriate for a cumulative dataset only, assuming all prior values are still present. It is also appropriate for any measure that has an absolute, definable minimum value, or a heuristic that makes senes for valid data.', 'Y'), @@ -134,19 +134,19 @@ VALUES ('1004', 'Alpha_Trunc', 'Alpha Truncation', 'Maximum character count con ('1043', 'Valid_Characters', 'Valid Characters', 'Column contains no invalid characters', 'Tests for the presence of non-printing characters, leading spaces, or surrounding quotes.', 'Invalid characters, such as non-printing characters, leading spaces, or surrounding quotes, were found.', 'Invalid records', 'Expected count of values with invalid characters', 'general_type = ''A''', NULL, NULL, 'threshold_value', '0', NULL, 'The acceptable number of records with invalid character values present.', 'Warning', 'CAT', 'column', 'Validity', 'Schema Drift', 'Threshold Invalid Value Count', 'This test looks for the presence of non-printing ASCII characters that are considered non-standard in basic text processing. It also identifies leading spaces and values enclosed in quotes. 
Values that fail this test may be artifacts of data conversion, or just more difficult to process or analyze downstream.', 'N'), ('1044', 'Valid_US_Zip', 'Valid US Zip', 'Valid USA Postal Codes', 'Tests that postal codes match the 5 or 9 digit standard US format', 'Invalid US Zip Code formats found.', 'Invalid Zip Codes', 'Expected count of values with invalid Zip Codes', 'functional_data_type = ''Zip''', NULL, NULL, 'threshold_value', '0', NULL, NULL, 'Warning', 'CAT', 'column', 'Validity', 'Schema Drift', 'Threshold Invalid Value Count', NULL, 'Y'), ('1045', 'Valid_US_Zip3', 'Valid US Zip-3 ', 'Valid USA Zip-3 Prefix', 'Tests that postal codes match the 3 digit format of a regional prefix.', 'Invalid 3-digit US Zip Code regional prefix formats found.', 'Invalid Zip-3 Prefix', 'Expected count of values with invalid Zip-3 Prefix Codes', 'functional_data_type = ''Zip3''', NULL, NULL, 'threshold_value', '0', NULL, NULL, 'Warning', 'CAT', 'column', 'Validity', 'Schema Drift', 'Threshold Invalid Zip3 Count', 'This test looks for the presence of values that fail to match the three-digit numeric code expected for US Zip Code regional prefixes. These prefixes are often used to roll up Zip Code data to a regional level, and may be critical to anonymize detailed data and protect PID. Depending on your needs and regulatory requirements, longer zip codes could place PID at risk.', 'Y'), - ('1006', 'Condition_Flag', 'Custom Condition', 'Column values match pre-defined condition', 'Tests that each record in the table matches a pre-defined, custom condition', 'Value(s) found not matching defined condition.', 'Values Failing', NULL, NULL, 'Test Focus', 'Specify a brief descriptor of the focus of this test that is unique within this Test Suite for the Table and Test Type. This distinguishes this test from others of the same type on the same table. 
Example: `Quantity Consistency` if you are testing that quantity ordered matches quantity shipped.', 'threshold_value,custom_query', NULL, 'Threshold Error Count,Custom SQL Expression', 'The number of errors that are acceptable before test fails.|Expression should evaluate to TRUE to register an error or FALSE if no error. An expression can reference only columns in the selected table.', 'Fail', 'CAT', 'custom', 'Validity', 'Schema Drift', 'Count of records that don''t meet test condition', 'Custom Condition is a business-rule test for a user-defined error condition based on the value of one or more columns. The condition is applied to each record within the table, and the count of records failing the condition is added up. If that count exceeds a threshold of errors, the test as a whole is failed. This test is ideal for error conditions that TestGen cannot automatically infer, and any condition that involves the values of more than one column in the same record. Performance of this test is fast, since it is performed together with other aggregate tests. Interpretation is based on the user-defined meaning of the test.', 'Y'), + ('1006', 'Condition_Flag', 'Custom Condition', 'Column values match pre-defined condition', 'Tests that each record in the table matches a pre-defined, custom condition', 'Value(s) found not matching defined condition.', 'Values Failing', NULL, NULL, 'Test Focus', 'Specify a brief descriptor of the focus of this test that is unique within this Test Suite for the Table and Test Type. This distinguishes this test from others of the same type on the same table. Example: `Quantity Consistency` if you are testing that quantity ordered matches quantity shipped.', 'threshold_value,custom_query', NULL, 'Threshold Error Count,Custom SQL Expression (TRUE on error)', 'The number of errors that are acceptable before test fails.|Expression should evaluate to TRUE to register an error or FALSE if no error. 
An expression can reference only columns in the selected table.', 'Fail', 'CAT', 'custom', 'Validity', 'Schema Drift', 'Count of records that don''t meet test condition', 'Custom Condition is a business-rule test for a user-defined error condition based on the value of one or more columns. The condition is applied to each record within the table, and the count of records failing the condition is added up. If that count exceeds a threshold of errors, the test as a whole is failed. This test is ideal for error conditions that TestGen cannot automatically infer, and any condition that involves the values of more than one column in the same record. Performance of this test is fast, since it is performed together with other aggregate tests. Interpretation is based on the user-defined meaning of the test.', 'Y'), ('1031', 'Row_Ct', 'Row Count', 'Number of rows is at or above threshold', 'Tests that the count of records has not decreased from the baseline count.', 'Row count less than baseline count.', 'Row count', NULL, 'TEMPLATE', NULL, NULL, 'threshold_value', NULL, 'Threshold Minimum Record Count', NULL, 'Fail', 'CAT', 'table', 'Completeness', 'Volume', 'Expected minimum row count', 'Because this tests the row count against a constant minimum threshold, it''s appropriate for any dataset, as long as the number of rows doesn''t radically change from refresh to refresh. But it''s not responsive to change over time. 
You may want to adjust the threshold periodically if you are dealing with a cumulative dataset.', 'Y'), ('1032', 'Row_Ct_Pct', 'Row Range', 'Number of rows within percent range of threshold', 'Tests that the count of records is within a percentage above or below the baseline count.', 'Row Count is outside of threshold percent of baseline count.', 'Percent of baseline', 'Row count percent above or below baseline', 'TEMPLATE', NULL, NULL, 'baseline_ct,threshold_value', NULL, 'Baseline Record Count,Threshold Pct Above or Below Baseline', NULL, 'Fail', 'CAT', 'table', 'Completeness', 'Volume', 'Expected percent window below or above baseline', 'This test is better than Row Count for an incremental or windowed dataset where you would expect the row count to range within a percentage of baseline.', 'Y'), ('1008', 'CUSTOM', 'Custom Test', 'Custom-defined business rule', 'Custom SQL Query Test', 'Errors were detected according to test definition.', 'Errors found', 'Count of errors identified by query', NULL, 'Test Focus', 'Specify a brief descriptor of the focus of this test that is unique within this Test Suite for the Table and Test Type. This distinguishes this test from others of the same type on the same table. Example: `Order Total Matches Detail` if you are testing that the total in one table matches the sum of lines in another.', 'custom_query', NULL, 'Custom SQL Query Returning Error Records', 'Query should return records indicating one or more errors. The test passes if no records are returned. Results of the query will be shown when you click `Review Source Data` for a failed test, so be sure to include enough data in your results to follow-up. \n\nA query can refer to any tables in the database. 
You must hard-code the schema or use `{DATA_SCHEMA}` to represent the schema defined for the Table Group.', 'Fail', 'QUERY', 'custom', 'Accuracy', 'Data Drift', 'Expected count of errors found by custom query', 'This business-rule test is highly flexible, covering any error state that can be expressed by a SQL query against one or more tables in the database. In operation, the user-defined query is embedded within a parent query returning the count of error rows identified. Any row returned by the query is interpreted as a single error condition in the test. Note that this query is run independently of other tests, and that performance will be slower, depending in large part on the efficiency of the query you write. Interpretation is based on the user-defined meaning of the test. Your query might be written to return errors in individual rows identified by joining tables. Or it might return an error based on a multi-column aggregate condition returning a single row if an error is found. This query is run separately when you click `Review Source Data` from Test Results, so be sure to include enough data in your results to follow-up. Interpretation is based on the user-defined meaning of the test.', 'Y'), - ('1500', 'Aggregate_Balance', 'Aggregate Balance', 'Aggregate values per group match reference', 'Tests for exact match in aggregate values for each set of column values vs. 
reference dataset', 'Aggregate measure per set of column values does not exactly match reference dataset.', 'Mismatched measures', NULL, NULL, 'Aggregate Expression', 'Specify an aggregate column expression: one of `SUM([column_name])` or `COUNT([column_name])`', 'subset_condition,groupby_names,having_condition,match_schema_name,match_table_name,match_column_names,match_subset_condition,match_groupby_names,match_having_condition', NULL, 'Record Subset Condition,Grouping Columns,Group Subset Condition,Matching Schema Name,Matching Table Name,Matching Aggregate Expression,Matching Record Subset Condition,Matching Grouping Columns,Matching Group Subset Condition', 'Condition defining a subset of records in main table (e.g. WHERE clause) - OPTIONAL|Category columns in main table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in main table (e.g. HAVING clause) - OPTIONAL|Schema location of matching table|Matching table name|Agregate column expression in matching table: one of `SUM([column_name])` or `COUNT([column_name])`|Condition defining a subset of records in matching table (e.g. WHERE clause) - OPTIONAL|Category columns in matching table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in matching table (e.g. HAVING clause) - OPTIONAL', 'Fail', 'QUERY', 'referential', 'Consistency', 'Data Drift', 'Expected count of group totals not matching aggregate value', 'This test compares sums or counts of a column rolled up to one or more category combinations. It''s ideal for confirming that two datasets exactly match -- that the sum of a measure or count of a value hasn''t changed or shifted between categories. Use this test to compare a raw and processed version of the same dataset, or to confirm that an aggregated table exactly matches the detail table that it''s built from. An error here means that one or more value combinations fail to match. 
New categories or combinations will cause failure.', 'Y'), - ('1501', 'Aggregate_Minimum', 'Aggregate Minimum', 'Aggregate values per group are at or above reference', 'Tests that aggregate values for each set of column values are at least the same as reference dataset', 'Aggregate measure per set of column values is not at least the same as reference dataset.', 'Mismatched measures', NULL, NULL, 'Aggregate Expression', 'Specify an aggregate column expression: one of `SUM([column_name])` or `COUNT([column_name])`', 'subset_condition,groupby_names,having_condition,match_schema_name,match_table_name,match_column_names,match_subset_condition,match_groupby_names,match_having_condition', NULL, 'Record Subset Condition,Grouping Columns,Group Subset Condition,Matching Schema Name,Matching Table Name,Matching Aggregate Expression,Matching Record Subset Condition,Matching Grouping Columns,Matching Group Subset Condition', 'Condition defining a subset of records in main table (e.g. WHERE clause) - OPTIONAL,Category columns in main table separated by commas (e.g. GROUP BY columns),Condition defining a subset of aggregate records in main table (e.g. HAVING clause) - OPTIONAL,Schema location of matching table,Matching table name,Aggregate column expression in matching table (e.g. `SUM(sales)`),Condition defining a subset of records in matching table (e.g. WHERE clause) - OPTIONAL,Category columns in matching table separated by commas (e.g. GROUP BY columns),Condition defining a subset of aggregate records in matching table (e.g. HAVING clause) - OPTIONAL', 'Fail', 'QUERY', 'referential', 'Accuracy', 'Data Drift', 'Expected count of group totals below aggregate value', 'This test compares sums or counts of a column rolled up to one or more category combinations, but requires a match or increase in the aggregate value, rather than an exact match. Use it to confirm that aggregate values have not dropped for any set of categories, even if some values may rise. 
This test is useful to compare an older and newer version of a cumulative dataset. An error here means that one or more values per category set fail to match or exceed the prior dataset. New categories or combinations are allowed (but can be restricted independently with a Combo_Match test). Both tables must be present to run this test.', 'Y'), - ('1502', 'Combo_Match', 'Combo Match', 'Column values or combinations found in reference', 'Tests for the presence of one or a set of column values in a reference table', 'Column value combinations are not found in reference table values.', 'Missing values', NULL, NULL, 'Categorical Column List', 'Specify one or more Categorical columns, separated by commas. \n\nDo not use continuous mesurements here. Do not use numeric values unless they represent discrete categories.', 'subset_condition,having_condition,match_schema_name,match_table_name,match_groupby_names,match_subset_condition,match_having_condition', NULL, 'Record Subset Condition,Group Subset Condition,Reference Schema Name,Reference Table Name,Matching Columns,Matching Record Subset Condition,Matching Group Subset Condition', 'Condition defining a subset of records in main table to evaluate - OPTIONAL|Condition based on aggregate expression used to exclude value combinations in source table (e.g. `SUM(sales) < 100`) - OPTIONAL|Schema location of matching table|Matching table name|Column Names in reference table used to validate source table values (separated by commas)|Condition defining a subset of records in reference table to match against - OPTIONAL|Condition based on aggregate expression used to exclude value combinations in reference table (e.g. `SUM(sales) < 100`) - OPTIONAL', 'Fail', 'QUERY', 'referential', 'Validity', 'Schema Drift', 'Expected count of non-matching value combinations', 'This test verifies that values, or combinations of values, that are present in the main table are also found in a reference table. 
This is a useful test for referential integrity between fact and dimension tables. You can also use it to confirm the validity of combinations of values found together within each record, such as bottle / pint / milk and carton / dozen / eggs. An error here means that one or more value combinations in the main table are not found in the reference table. Both tables must be present to run this test.', 'Y'), - ('1503', 'Distribution_Shift', 'Distribution Shift', 'Probability distribution consistent with reference', 'Tests the closeness of match between two distributions of aggregate measures across combinations of column values, using Jensen-Shannon Divergence test', 'Divergence between two distributions exceeds specified threshold.', 'Divergence level (0-1)', 'Jensen-Shannon Divergence, from 0 (identical distributions), to 1.0 (max divergence)', NULL, 'Categorical Column List', 'Specify one or more Categorical columns, separated by commas. \n\nDo not use continuous mesurements here. Do not use numeric values unless they represent discrete categories.', 'subset_condition,match_schema_name,match_table_name,match_groupby_names,match_subset_condition', NULL, 'Record Subset Condition,Reference Schema Name,Reference Table Name,Matching Columns to Compare,Matching Record Subset Condition', 'Condition defining a subset of records in main table to evaluate - OPTIONAL|Schema location of matching table|Matching table name|Column Names in reference table used to compare counts with source table values (separated by commas)|Condition defining a subset of records in reference table to match against - OPTIONAL', 'Warning', 'QUERY', 'referential', 'Consistency', 'Data Drift', 'Expected maximum divergence level between 0 and 1', 'This test measures the similarity of two sets of counts per categories, by using their proportional counts as probability distributions. 
Using Jensen-Shannon divergence, a measure of relative entropy or difference between two distributions, the test assigns a score ranging from 0, meaning that the distributions are identical, to 1, meaning that the distributions are completely unrelated. This test can be used to compare datasets that may not match exactly, but should have similar distributions. For example, it is a useful sanity check for data from different sources that you would expect to have a consistent spread, such as shipment of building materials per state and construction projects by state. Scores can be compared over time even if the distributions are not identical -- a dataset can be expected to maintain a comparable divergence score with a reference dataset over time. Both tables must be present to run this test.', 'Y'), - ('1508', 'Timeframe_Combo_Gain', 'Timeframe Keep', 'Latest timeframe has at least all value combinations from prior period', 'Tests that column values in most recent time-window include at least same as prior time window', 'Column values in most recent time-window don''t include all values in prior window.', 'Mismatched values', NULL, NULL, 'Categorical Column List', 'Specify one or more Categorical columns, separated by commas. Make sure not to use continuous mesurements here. Do not use numeric values unless they represent discrete categories.', 'window_date_column,window_days,subset_condition', NULL, 'Date Column for Time Windows,Time Window in Days,Record Subset Condition', 'The date column used to define the time windows. This must be a DATE or DATETIME type.|Length in days of the time window. 
The test will compare the most recent period of days to the prior period of the same duration.|Condition defining a subset of records in main table to evaluate - OPTIONAL', 'Fail', 'QUERY', 'referential', 'Consistency', 'Data Drift', 'Expected count of missing value combinations', NULL, 'Y'), - ('1509', 'Timeframe_Combo_Match', 'Timeframe Match', 'Column value combinations from latest timeframe same as prior period', 'Tests for presence of same column values in most recent time-window vs. prior time window', 'Column values don''t match in most recent time-windows.', 'Mismatched values', NULL, NULL, 'Categorical Column List', 'Specify one or more Categorical columns, separated by commas. \n\nDo not use continuous mesurements here. Do not use numeric values unless they represent discrete categories.', 'window_date_column,window_days,subset_condition', NULL, 'Date Column for Time Windows,Time Window in Days,Record Subset Condition', NULL, 'Fail', 'QUERY', 'referential', 'Consistency', 'Data Drift', 'Expected count of non-matching value combinations', NULL, 'Y'), + ('1500', 'Aggregate_Balance', 'Aggregate Balance', 'Aggregate values per group match reference', 'Tests for exact match in aggregate values for each set of column values vs. 
reference dataset', 'Aggregate measure per set of column values does not exactly match reference dataset.', 'Mismatched measures', NULL, NULL, 'Aggregate Expression', 'Specify an aggregate column expression: one of `SUM([column_name])` or `COUNT([column_name])`', 'subset_condition,groupby_names,having_condition,match_schema_name,match_table_name,match_column_names,match_subset_condition,match_groupby_names,match_having_condition', NULL, 'Record Subset Condition,Grouping Columns,Group Subset Condition,Matching Schema Name,Matching Table Name,Matching Aggregate Expression,Matching Record Subset Condition,Matching Grouping Columns,Matching Group Subset Condition', 'Condition defining a subset of records in main table, written like a condition within a SQL WHERE clause - OPTIONAL|Category columns in main table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in main table (e.g. HAVING clause) - OPTIONAL|Schema location of matching table|Matching table name|Aggregate column expression in matching table: one of `SUM([column_name])` or `COUNT([column_name])`|Condition defining a subset of records in matching table, written like a condition within a SQL WHERE clause - OPTIONAL|Category columns in matching table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in matching table (e.g. HAVING clause) - OPTIONAL', 'Fail', 'QUERY', 'referential', 'Consistency', 'Data Drift', 'Expected count of group totals not matching aggregate value', 'This test compares sums or counts of a column rolled up to one or more category combinations across two different tables. Both tables must be accessible at the same time. It''s ideal for confirming that two datasets exactly match -- that the sum of a measure or count of a value hasn''t changed or shifted between categories. 
Use this test to compare a raw and processed version of the same dataset, or to confirm that an aggregated table exactly matches the detail table that it''s built from. An error here means that one or more value combinations fail to match. New categories or combinations will cause failure.', 'Y'), + ('1501', 'Aggregate_Minimum', 'Aggregate Minimum', 'Aggregate values per group are at or above reference', 'Tests that aggregate values for each set of column values are at least the same as reference dataset', 'Aggregate measure per set of column values is not at least the same as reference dataset.', 'Mismatched measures', NULL, NULL, 'Aggregate Expression', 'Specify an aggregate column expression: one of `SUM([column_name])` or `COUNT([column_name])`', 'subset_condition,groupby_names,having_condition,match_schema_name,match_table_name,match_column_names,match_subset_condition,match_groupby_names,match_having_condition', NULL, 'Record Subset Condition,Grouping Columns,Group Subset Condition,Matching Schema Name,Matching Table Name,Matching Aggregate Expression,Matching Record Subset Condition,Matching Grouping Columns,Matching Group Subset Condition', 'Condition defining a subset of records in main table, written like a condition within a SQL WHERE clause - OPTIONAL|Category columns in main table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in main table (e.g. HAVING clause) - OPTIONAL|Schema location of reference table|Reference table name|Aggregate column expression in reference table (e.g. `SUM(sales)`)|Condition defining a subset of records in reference table, written like a condition within a SQL WHERE clause - OPTIONAL|Category columns in reference table separated by commas (e.g. GROUP BY columns)|Condition defining a subset of aggregate records in reference table (e.g. 
HAVING clause) - OPTIONAL', 'Fail', 'QUERY', 'referential', 'Accuracy', 'Data Drift', 'Expected count of group totals below aggregate value', 'This test compares sums or counts of a column rolled up to one or more category combinations, but requires a match or increase in the aggregate value, rather than an exact match, across two different tables. Both tables must be accessible at the same time. Use this to confirm that aggregate values have not dropped for any set of categories, even if some values may rise. This test is useful to compare an older and newer version of a cumulative dataset. An error here means that one or more values per category set fail to match or exceed the prior dataset. New categories or combinations are allowed (but can be restricted independently with a Combo_Match test). Both tables must be present to run this test.', 'Y'), + ('1502', 'Combo_Match', 'Reference Match', 'Column values or combinations found in reference', 'Tests for the presence of one or a set of column values in a reference table', 'Column value combinations are not found in reference table values.', 'Missing values', NULL, NULL, 'Categorical Column List', 'Specify one or more Categorical columns, separated by commas. \n\nDo not use continuous measurements here. Do not use numeric values unless they represent discrete categories.', 'subset_condition,having_condition,match_schema_name,match_table_name,match_groupby_names,match_subset_condition,match_having_condition', NULL, 'Record Subset Condition,Group Subset Condition,Reference Schema Name,Reference Table Name,Matching Columns,Matching Record Subset Condition,Matching Group Subset Condition', 'Condition defining a subset of records in main table to evaluate, written like a condition within a SQL WHERE clause - OPTIONAL|Condition based on aggregate expression used to exclude value combinations in source table, written like a condition within a SQL HAVING clause (e.g. 
`SUM(sales) < 100`) - OPTIONAL|Schema location of matching table|Matching table name|Column Names in reference table used to validate source table values (separated by commas)|Condition defining a subset of records in reference table to match against, written like a condition within a SQL WHERE clause - OPTIONAL|Condition based on aggregate expression used to exclude value combinations in reference table, written like a condition within a SQL HAVING clause (e.g. `SUM(sales) < 100`) - OPTIONAL', 'Fail', 'QUERY', 'referential', 'Validity', 'Schema Drift', 'Expected count of non-matching value combinations', 'This test verifies that values, or combinations of values, that are present in the main table are also found in a reference table. This is a useful test for referential integrity between fact and dimension tables. You can also use it to confirm the validity of a code or category, or of combinations of values that should only be found together within each record, such as product/size/color. An error here means that one or more category combinations in the main table are not found in the reference table. Both tables must be present to run this test.', 'Y'), + ('1503', 'Distribution_Shift', 'Distribution Shift', 'Probability distribution consistent with reference', 'Tests the closeness of match between two distributions of aggregate measures across combinations of column values, using Jensen-Shannon Divergence test', 'Divergence between two distributions exceeds specified threshold.', 'Divergence level (0-1)', 'Jensen-Shannon Divergence, from 0 (identical distributions), to 1.0 (max divergence)', NULL, 'Categorical Column List', 'Specify one or more Categorical columns, separated by commas. Do not use continuous measurements here. 
Do not use numeric values unless they represent discrete categories.', 'subset_condition,match_schema_name,match_table_name,match_groupby_names,match_subset_condition', NULL, 'Record Subset Condition,Reference Schema Name,Reference Table Name,Matching Columns to Compare,Matching Record Subset Condition', 'Condition defining a subset of records in main table to evaluate, written like a condition within a SQL WHERE clause - OPTIONAL|Schema location of matching table|Matching table name|Column Names in reference table used to compare counts with source table values (separated by commas)|Condition defining a subset of records in reference table to match against, written like a condition within a SQL WHERE clause - OPTIONAL', 'Warning', 'QUERY', 'referential', 'Consistency', 'Data Drift', 'Expected maximum divergence level between 0 and 1', 'This test measures the similarity of two sets of counts per categories, by using their proportional counts as probability distributions. Using Jensen-Shannon divergence, a measure of relative entropy or difference between two distributions, the test assigns a score ranging from 0, meaning that the distributions are identical, to 1, meaning that the distributions are completely unrelated. This test can be used to compare datasets that may not match exactly, but should have similar distributions. For example, it is a useful sanity check for data from different sources that you would expect to have a consistent spread, such as shipment of building materials per state and construction projects by state. Scores can be compared over time even if the distributions are not identical -- a dataset can be expected to maintain a comparable divergence score with a reference dataset over time. 
Both tables must be present to run this test.', 'Y'), + ('1508', 'Timeframe_Combo_Gain', 'Timeframe No Drops', 'Latest timeframe has at least all value combinations from prior period', 'Tests that column values in most recent time-window include at least same as prior time window', 'Column values in most recent time-window don''t include all values in prior window.', 'Mismatched values', NULL, NULL, 'Categorical Column List', 'Specify one or more Categorical columns, separated by commas. Make sure not to use continuous measurements here. Do not use numeric values unless they represent discrete categories.', 'window_date_column,window_days,subset_condition', NULL, 'Date Column for Time Windows,Time Window in Days,Record Subset Condition', 'The date column used to define the time windows. This must be a DATE or DATETIME type.|Length in days of the time window. The test will compare the most recent period of days to the prior period of the same duration.|Condition defining a subset of records in main table to evaluate, written like a condition within a SQL WHERE clause - OPTIONAL', 'Fail', 'QUERY', 'referential', 'Consistency', 'Data Drift', 'Expected count of missing value combinations', 'This test checks a single transactional table to verify that categorical values or combinations that are present in the most recent time window you define include at least all those found in the prior time window of the same duration. Missing values in the latest time window will trigger the test to fail. New values are permitted. Use this test to confirm that codes or categories are not lost across successive time periods in a transactional table.', 'Y'), + ('1509', 'Timeframe_Combo_Match', 'Timeframe Match', 'Column value combinations from latest timeframe same as prior period', 'Tests for presence of same column values in most recent time-window vs. 
prior time window', 'Column values don''t match in most recent time-windows.', 'Mismatched values', NULL, NULL, 'Categorical Column List', 'Specify one or more Categorical columns, separated by commas. Do not use continuous measurements here. Do not use numeric values unless they represent discrete categories.', 'window_date_column,window_days,subset_condition', NULL, 'Date Column for Time Windows,Time Window in Days,Record Subset Condition', NULL, 'Fail', 'QUERY', 'referential', 'Consistency', 'Data Drift', 'Expected count of non-matching value combinations', 'This test checks a single transactional table (such as a fact table) to verify that categorical values or combinations that are present in the most recent time window you define match those found in the prior time window of the same duration. New or missing values in the latest time window will trigger the test to fail. Use this test to confirm the consistency in the occurrence of codes or categories across successive time periods in a transactional table.', 'Y'), ('1504', 'Aggregate_Pct_Above', 'Aggregate Pct Above', 'Aggregate values per group exceed reference', 'Tests that aggregate values for each set of column values exceed values for reference dataset', 'Aggregate measure per set of column values fails to exceed the reference dataset.', 'Mismatched measures', NULL, NULL, 'Aggregate Expression', 'Specify an aggregate column expression: one of `SUM([column_name])` or `COUNT([column_name])`', 'subset_condition,groupby_names,having_condition,match_column_names,match_schema_name,match_table_name,match_subset_condition,match_groupby_names,match_having_condition', NULL, 'TODO Fill in default_parm_prompts match_schema_name,TODO Fill in default_parm_prompts match_table_name,TODO Fill in default_parm_prompts match_column_names,TODO Fill in default_parm_prompts match_subset_condition,TODO Fill in default_parm_prompts match_groupby_names,TODO Fill in default_parm_prompts match_having_condition,TODO Fill in 
default_parm_prompts subset_condition,TODO Fill in default_parm_prompts groupby_names,TODO Fill in default_parm_prompts having_condition', NULL, 'Fail', 'QUERY', 'referential', 'Accuracy', 'Data Drift', 'Expected count of group totals with not exceeding aggregate measure', NULL, 'N'), ('1505', 'Aggregate_Pct_Within', 'Aggregate Pct Within', 'Aggregate values per group exceed reference', 'Tests that aggregate values for each set of column values exceed values for reference dataset', 'Aggregate measure per set of column values fails to exceed the reference dataset.', 'Mismatched measures', NULL, NULL, 'Aggregate Expression', 'Specify an aggregate column expression: one of `SUM([column_name])` or `COUNT([column_name])`', 'subset_condition,groupby_names,having_condition,match_column_names,match_schema_name,match_table_name,match_subset_condition,match_groupby_names,match_having_condition', NULL, 'TODO Fill in default_parm_prompts match_schema_name,TODO Fill in default_parm_prompts match_table_name,TODO Fill in default_parm_prompts match_column_names,TODO Fill in default_parm_prompts match_subset_condition,TODO Fill in default_parm_prompts match_groupby_names,TODO Fill in default_parm_prompts match_having_condition,TODO Fill in default_parm_prompts subset_condition,TODO Fill in default_parm_prompts groupby_names,TODO Fill in default_parm_prompts having_condition', NULL, 'Fail', 'QUERY', 'referential', 'Accuracy', 'Data Drift', 'Expected count of group totals with not exceeding aggregate measure', NULL, 'N'), @@ -1237,9 +1237,13 @@ VALUES ('measure', 'meter|m|metre'), ('med_dose', 'oral|po'), ('med_dose', 'per rectum|pr'), ('med_dose', 'drops|gtt|gtts'), - ('med_dose', 'treatment|trx|tx'), - ('med_dose', 'new patients|new patient|new pt|nrx'), - ('med_dose', 'prescription|rx'), + + ('med_tx', 'treatment|trx|tx'), + ('med_tx', 'new patients|new patient|new pt|nrx'), + ('med_tx', 'patient|pat|pt|px'), + ('med_tx', 'prescription|rx'), + ('med_tx', 'hcp|md|dr'), + 
('inv_uom', 'each|ea'), ('inv_uom', 'piece|pc|pieces|pcs'), ('inv_uom', 'set|sets'), @@ -1288,9 +1292,9 @@ VALUES ('measure', 'meter|m|metre'), ('status', 'in process|in progress|active'), ('status', 'retain|keep'), ('status', 'remove|drop|delete|del'), - ('status', 'low|l'), + ('status', 'low|lo|l'), ('status', 'medium|moderate|med|m'), - ('status', 'high|h'), + ('status', 'high|hi|h'), ('status', 'same|sm'), ('status', 'average|mean|avg'), ('status', 'decreased|decrease|decr|down|dn'), diff --git a/testgen/template/dbsetup/060_create_standard_views.sql b/testgen/template/dbsetup/060_create_standard_views.sql index 5379797..788e278 100644 --- a/testgen/template/dbsetup/060_create_standard_views.sql +++ b/testgen/template/dbsetup/060_create_standard_views.sql @@ -9,19 +9,18 @@ DROP VIEW IF EXISTS v_latest_profile_results CASCADE; CREATE VIEW v_latest_profile_results AS - WITH last_run AS ( SELECT project_code, - table_groups_id, - schema_name, - MAX(run_date) AS last_run_date - FROM profile_results - GROUP BY project_code, table_groups_id, schema_name ) -SELECT p.* + WITH last_run AS ( SELECT table_groups_id, + MAX(profiling_starttime) AS last_run_date + FROM profiling_runs + GROUP BY table_groups_id ) +SELECT r.* FROM last_run lr - JOIN profile_results p - ON lr.project_code = p.project_code - AND lr.table_groups_id = p.table_groups_id - AND lr.schema_name = p.schema_name - AND lr.last_run_date = p.run_date; +INNER JOIN profiling_runs p + ON lr.table_groups_id = p.table_groups_id + AND lr.last_run_date = p.profiling_starttime +INNER JOIN profile_results r + ON p.id = r.profile_run_id; + DROP VIEW IF EXISTS v_latest_profile_anomalies; @@ -43,7 +42,7 @@ INNER JOIN profile_anomaly_types t INNER JOIN profiling_runs pr ON (r.profile_run_id = pr.id) INNER JOIN last_profile_date l - ON (r.table_groups_id = l.table_groups_id + ON (pr.table_groups_id = l.table_groups_id AND pr.profiling_starttime = l.last_profile_run_date); @@ -95,13 +94,14 @@ SELECT r.id as test_run_id, 
SUM(result_code) as passed_ct, COALESCE(SUM(CASE WHEN tr.result_status = 'Failed' THEN 1 END), 0) as failed_ct, COALESCE(SUM(CASE WHEN tr.result_status = 'Warning' THEN 1 END), 0) as warning_ct, - process_id + r.process_id FROM test_runs r INNER JOIN projects p ON (r.project_code = p.project_code) INNER JOIN test_results tr ON (r.id = tr.test_run_id) -GROUP BY r.id, r.project_code, p.project_name, r.test_suite, r.test_starttime, r.test_endtime, r.status, r.log_message; +GROUP BY r.id, r.project_code, r.test_suite, r.test_starttime, r.test_endtime, + r.process_id, r.status, r.log_message, p.project_name; DROP VIEW IF EXISTS v_test_results; @@ -145,32 +145,35 @@ SELECT p.project_name, r.project_code, r.table_groups_id, r.id as test_result_id, c.id as connection_id, - ts.id as test_suite_id, + r.test_suite_id, r.test_definition_id as test_definition_id_runtime, - d.id as test_definition_id_current, - r.test_run_id as test_run_id + CASE + WHEN r.auto_gen = TRUE THEN d.id + ELSE r.test_definition_id + END as test_definition_id_current, + r.test_run_id as test_run_id, + r.auto_gen FROM test_results r INNER JOIN test_types tt ON (r.test_type = tt.test_type) LEFT JOIN test_definitions d - ON (r.project_code = d.project_code - AND r.test_suite = d.test_suite - AND r.schema_name = d.schema_name + ON (r.test_suite_id = d.test_suite_id AND r.table_name = d.table_name AND r.column_names = COALESCE(d.column_name, 'N/A') - AND r.test_type = d.test_type) + AND r.test_type = d.test_type + AND r.auto_gen = TRUE + AND d.last_auto_gen_date IS NOT NULL) INNER JOIN test_suites ts - ON (r.project_code = ts.project_code - AND r.test_suite = ts.test_suite) + ON (r.test_suite_id = ts.id) INNER JOIN projects p ON (r.project_code = p.project_code) INNER JOIN table_groups tg - ON (d.table_groups_id = tg.id) + ON (r.table_groups_id = tg.id) INNER JOIN connections cn ON (tg.connection_id = cn.connection_id) LEFT JOIN cat_test_conditions c ON (cn.sql_flavor = c.sql_flavor - AND d.test_type = 
c.test_type); + AND r.test_type = c.test_type); DROP VIEW IF EXISTS v_queued_observability_results; diff --git a/testgen/template/dbupgrade/0105_incremental_upgrade.sql b/testgen/template/dbupgrade/0105_incremental_upgrade.sql new file mode 100644 index 0000000..867407e --- /dev/null +++ b/testgen/template/dbupgrade/0105_incremental_upgrade.sql @@ -0,0 +1,41 @@ +SET SEARCH_PATH TO {SCHEMA_NAME}; + +ALTER TABLE test_results + ADD COLUMN test_suite_id UUID; + +UPDATE test_results + SET test_suite_id = s.id + FROM test_results r +INNER JOIN test_suites s + ON (r.table_groups_id = s.table_groups_id + AND r.test_suite = s.test_suite) + WHERE test_results.id = r.id; + +CREATE INDEX ix_tr_ts_tctt + ON test_results(test_suite_id, table_name, column_names, test_type); + +ALTER TABLE test_definitions + ADD COLUMN test_suite_id UUID; + +UPDATE test_definitions + SET test_suite_id = s.id + FROM test_definitions d +INNER JOIN test_suites s + ON (D.table_groups_id = s.table_groups_id + AND d.test_suite = s.test_suite) + WHERE test_definitions.id = d.id; + +ALTER TABLE table_groups + ADD COLUMN data_source VARCHAR(40), + ADD COLUMN source_system VARCHAR(40), + ADD COLUMN data_location VARCHAR(40), + ADD COLUMN source_process VARCHAR(40), + ADD COLUMN business_domain VARCHAR(40), + ADD COLUMN stakeholder_group VARCHAR(40), + ADD COLUMN transform_level VARCHAR(40); + +ALTER TABLE data_table_chars + RENAME COLUMN transformation_level to transform_level; + +ALTER TABLE data_column_chars + RENAME COLUMN transformation_level to transform_level; diff --git a/testgen/template/execution/ex_finalize_test_run_results.sql b/testgen/template/execution/ex_finalize_test_run_results.sql index 7eb5454..e1b3c8e 100644 --- a/testgen/template/execution/ex_finalize_test_run_results.sql +++ b/testgen/template/execution/ex_finalize_test_run_results.sql @@ -23,6 +23,7 @@ UPDATE test_results ELSE '' END), table_groups_id = d.table_groups_id, + test_suite_id = s.id, auto_gen = d.last_auto_gen_date IS NOT 
NULL FROM test_results r INNER JOIN test_suites s diff --git a/testgen/template/flavors/mssql/profiling/project_profiling_query_mssql.yaml b/testgen/template/flavors/mssql/profiling/project_profiling_query_mssql.yaml index d3e80e9..5ebda4a 100644 --- a/testgen/template/flavors/mssql/profiling/project_profiling_query_mssql.yaml +++ b/testgen/template/flavors/mssql/profiling/project_profiling_query_mssql.yaml @@ -18,7 +18,7 @@ strTemplate02_all: | SUM(CASE WHEN "{COL_NAME}" IS NULL THEN 1 ELSE 0 END) AS null_value_ct, strTemplate03_ADN: MIN(LEN("{COL_NAME}")) AS min_length, MAX(LEN("{COL_NAME}")) AS max_length, - AVG(CAST(LEN("{COL_NAME}") AS FLOAT)) AS avg_length, + AVG(CAST(NULLIF(LEN("{COL_NAME}"), 0) AS FLOAT)) AS avg_length, strTemplate03_else: NULL as min_length, NULL as max_length, NULL as avg_length, diff --git a/testgen/template/flavors/postgresql/profiling/project_ddf_query_postgresql.sql b/testgen/template/flavors/postgresql/profiling/project_ddf_query_postgresql.sql index 120b575..5f2d780 100644 --- a/testgen/template/flavors/postgresql/profiling/project_ddf_query_postgresql.sql +++ b/testgen/template/flavors/postgresql/profiling/project_ddf_query_postgresql.sql @@ -10,8 +10,9 @@ SELECT '{PROJECT_CODE}' as project_code, WHEN c.data_type = 'character varying' THEN 'varchar(' || CAST(c.character_maximum_length AS VARCHAR) || ')' WHEN c.data_type = 'character' THEN 'char(' || CAST(c.character_maximum_length AS VARCHAR) || ')' - WHEN c.data_type = 'numeric' THEN 'numeric(' || CAST(c.numeric_precision AS VARCHAR) || ',' || - CAST(c.numeric_scale AS VARCHAR) || ')' + WHEN c.data_type = 'numeric' THEN 'numeric' + || COALESCE( '(' || CAST(c.numeric_precision AS VARCHAR) || ',' + || CAST(c.numeric_scale AS VARCHAR) || ')', '') ELSE c.data_type END AS data_type, COALESCE(c.character_maximum_length, CASE WHEN c.data_type IN ('text', 'character varying') THEN 65535 END) @@ -32,7 +33,10 @@ SELECT '{PROJECT_CODE}' as project_code, THEN 'N' ELSE 'X' END AS general_type, 
- numeric_scale > 0 as is_decimal + CASE + WHEN c.data_type = 'numeric' THEN COALESCE(numeric_scale, 1) > 0 + ELSE numeric_scale > 0 + END as is_decimal FROM information_schema.columns c WHERE c.table_schema = '{DATA_SCHEMA}' {TABLE_CRITERIA} ORDER BY c.table_schema, c.table_name, c.ordinal_position diff --git a/testgen/template/flavors/postgresql/profiling/project_profiling_query_postgresql.yaml b/testgen/template/flavors/postgresql/profiling/project_profiling_query_postgresql.yaml index 873ec6e..db02274 100644 --- a/testgen/template/flavors/postgresql/profiling/project_profiling_query_postgresql.yaml +++ b/testgen/template/flavors/postgresql/profiling/project_profiling_query_postgresql.yaml @@ -18,7 +18,7 @@ strTemplate02_all: | SUM(CASE WHEN "{COL_NAME}" IS NULL THEN 1 ELSE 0 END) AS null_value_ct, strTemplate03_ADN: MIN(LENGTH(CAST("{COL_NAME}" AS TEXT))) AS min_length, MAX(LENGTH(CAST("{COL_NAME}" AS TEXT))) AS max_length, - AVG(LENGTH(CAST("{COL_NAME}" AS TEXT))::FLOAT) AS avg_length, + AVG(NULLIF(LENGTH(CAST("{COL_NAME}" AS TEXT)), 0)::FLOAT) AS avg_length, strTemplate03_else: NULL as min_length, NULL as max_length, NULL as avg_length, diff --git a/testgen/template/flavors/redshift/profiling/project_ddf_query_redshift.sql b/testgen/template/flavors/redshift/profiling/project_ddf_query_redshift.sql index 918fe4f..0ba198b 100644 --- a/testgen/template/flavors/redshift/profiling/project_ddf_query_redshift.sql +++ b/testgen/template/flavors/redshift/profiling/project_ddf_query_redshift.sql @@ -8,8 +8,9 @@ SELECT '{PROJECT_CODE}' as project_code, WHEN c.data_type = 'character varying' THEN 'varchar(' || CAST(c.character_maximum_length AS VARCHAR) || ')' WHEN c.data_type = 'character' THEN 'char(' || CAST(c.character_maximum_length AS VARCHAR) || ')' - WHEN c.data_type = 'numeric' THEN 'numeric(' || CAST(c.numeric_precision AS VARCHAR) || ',' || - CAST(c.numeric_scale AS VARCHAR) || ')' + WHEN c.data_type = 'numeric' THEN 'numeric' + || COALESCE( '(' || 
CAST(c.numeric_precision AS VARCHAR) || ',' + || CAST(c.numeric_scale AS VARCHAR) || ')', '') ELSE c.data_type END AS data_type, c.character_maximum_length, c.ordinal_position, @@ -28,7 +29,10 @@ SELECT '{PROJECT_CODE}' as project_code, THEN 'N' ELSE 'X' END AS general_type, - numeric_scale > 0 as is_decimal + CASE + WHEN c.data_type = 'numeric' THEN COALESCE(numeric_scale, 1) > 0 + ELSE numeric_scale > 0 + END as is_decimal FROM information_schema.columns c WHERE c.table_schema = '{DATA_SCHEMA}' {TABLE_CRITERIA} ORDER BY c.table_schema, c.table_name, c.ordinal_position diff --git a/testgen/template/flavors/redshift/profiling/project_profiling_query_redshift.yaml b/testgen/template/flavors/redshift/profiling/project_profiling_query_redshift.yaml index 308e605..8856fb2 100644 --- a/testgen/template/flavors/redshift/profiling/project_profiling_query_redshift.yaml +++ b/testgen/template/flavors/redshift/profiling/project_profiling_query_redshift.yaml @@ -18,7 +18,7 @@ strTemplate02_all: | SUM(NVL2("{COL_NAME}", 0, 1)) AS null_value_ct, strTemplate03_ADN: MIN(LEN("{COL_NAME}")) AS min_length, MAX(LEN("{COL_NAME}")) AS max_length, - AVG(LEN("{COL_NAME}")::FLOAT) AS avg_length, + AVG(NULLIF(LEN("{COL_NAME}"), 0)::FLOAT) AS avg_length, strTemplate03_else: NULL as min_length, NULL as max_length, NULL as avg_length, diff --git a/testgen/template/flavors/snowflake/profiling/project_profiling_query_snowflake.yaml b/testgen/template/flavors/snowflake/profiling/project_profiling_query_snowflake.yaml index 011c80b..5b3ab3e 100644 --- a/testgen/template/flavors/snowflake/profiling/project_profiling_query_snowflake.yaml +++ b/testgen/template/flavors/snowflake/profiling/project_profiling_query_snowflake.yaml @@ -18,7 +18,7 @@ strTemplate02_all: | SUM(NVL2("{COL_NAME}", 0, 1)) AS null_value_ct, strTemplate03_ADN: MIN(LEN("{COL_NAME}")) AS min_length, MAX(LEN("{COL_NAME}")) AS max_length, - AVG(LEN("{COL_NAME}")::FLOAT) AS avg_length, + AVG(NULLIF(LEN("{COL_NAME}"), 0)::FLOAT) AS 
avg_length, strTemplate03_else: NULL as min_length, NULL as max_length, NULL as avg_length, diff --git a/testgen/template/flavors/trino/profiling/project_profiling_query_trino.yaml b/testgen/template/flavors/trino/profiling/project_profiling_query_trino.yaml index e9006c9..0968a2d 100644 --- a/testgen/template/flavors/trino/profiling/project_profiling_query_trino.yaml +++ b/testgen/template/flavors/trino/profiling/project_profiling_query_trino.yaml @@ -18,7 +18,7 @@ strTemplate02_all: | SUM(NVL2("{COL_NAME}", 0, 1)) AS null_value_ct, strTemplate03_ADN: MIN(LENGTH("{COL_NAME}")) AS min_length, MAX(LENGTH("{COL_NAME}")) AS max_length, - AVG(CAST(LENGTH("{COL_NAME}") AS REAL)) AS avg_length, + AVG(CAST(NULLIF(LENGTH("{COL_NAME}"), 0) AS REAL)) AS avg_length, strTemplate03_else: NULL as min_length, NULL as max_length, NULL as avg_length, diff --git a/testgen/template/gen_funny_cat_tests/gen_test_constant.sql b/testgen/template/gen_funny_cat_tests/gen_test_constant.sql index 04434ac..c0e2204 100644 --- a/testgen/template/gen_funny_cat_tests/gen_test_constant.sql +++ b/testgen/template/gen_funny_cat_tests/gen_test_constant.sql @@ -1,6 +1,6 @@ -- Then insert new tests where a locked test is not already present INSERT INTO test_definitions (project_code, table_groups_id, profile_run_id, - test_type, test_suite, + test_type, test_suite, test_suite_id, schema_name, table_name, column_name, skip_errors, last_auto_gen_date, test_active, baseline_value, threshold_value, profiling_as_of_date) @@ -69,6 +69,7 @@ WITH last_run AS (SELECT r.table_groups_id, MAX(run_date) AS last_run_date END ) = 1 ), newtests AS ( SELECT 'Constant'::VARCHAR AS test_type, '{TEST_SUITE}'::VARCHAR AS test_suite, + '{TEST_SUITE_ID}'::UUID AS test_suite_id, c.profile_run_id, c.project_code, c.schema_name, c.table_name, c.column_name, @@ -90,7 +91,7 @@ newtests AS ( SELECT 'Constant'::VARCHAR AS test_type, WHERE (s.generation_set IS NOT NULL OR '{GENERATION_SET}' = '') ) SELECT n.project_code, 
'{TABLE_GROUPS_ID}'::UUID as table_groups_id, n.profile_run_id, - n.test_type, n.test_suite, n.schema_name, n.table_name, n.column_name, + n.test_type, n.test_suite, n.test_suite_id, n.schema_name, n.table_name, n.column_name, 0 as skip_errors, '{RUN_DATE}'::TIMESTAMP as auto_gen_date, 'Y' as test_active, COALESCE(baseline_value, '') as baseline_value, '0' as threshold_value, '{AS_OF_DATE}'::TIMESTAMP diff --git a/testgen/template/gen_funny_cat_tests/gen_test_distinct_value_ct.sql b/testgen/template/gen_funny_cat_tests/gen_test_distinct_value_ct.sql index bab7bfd..3b00304 100644 --- a/testgen/template/gen_funny_cat_tests/gen_test_distinct_value_ct.sql +++ b/testgen/template/gen_funny_cat_tests/gen_test_distinct_value_ct.sql @@ -1,6 +1,6 @@ -- FIRST TYPE OF CONSTANT IS HANDLED IN SEPARATE SQL FILE gen_standard_tests.sql using generic parameters -- Second type: constants with changing values (1 distinct value) -INSERT INTO test_definitions (project_code, table_groups_id, profile_run_id, test_type, test_suite, +INSERT INTO test_definitions (project_code, table_groups_id, profile_run_id, test_type, test_suite, test_suite_id, schema_name, table_name, column_name, skip_errors, last_auto_gen_date, test_active, baseline_value_ct, threshold_value, profiling_as_of_date) @@ -68,6 +68,7 @@ WITH last_run AS (SELECT r.table_groups_id, MAX(run_date) AS last_run_date END ) > 1 ), newtests AS ( SELECT 'Distinct_Value_Ct'::VARCHAR AS test_type, '{TEST_SUITE}'::VARCHAR AS test_suite, + '{TEST_SUITE_ID}'::UUID AS test_suite_id, c.project_code, c.table_groups_id, c.profile_run_id, c.schema_name, c.table_name, c.column_name, c.run_date AS last_run_date, @@ -83,7 +84,7 @@ newtests AS ( SELECT 'Distinct_Value_Ct'::VARCHAR AS test_type, WHERE (s.generation_set IS NOT NULL OR '{GENERATION_SET}' = '') ) SELECT n.project_code, n.table_groups_id, n.profile_run_id, - n.test_type, n.test_suite, + n.test_type, n.test_suite, n.test_suite_id, n.schema_name, n.table_name, n.column_name, 0 as 
skip_errors, '{RUN_DATE}'::TIMESTAMP as last_auto_gen_date, 'Y' as test_active, distinct_value_ct as baseline_value_ct, distinct_value_ct as threshold_value, diff --git a/testgen/template/gen_funny_cat_tests/gen_test_row_ct.sql b/testgen/template/gen_funny_cat_tests/gen_test_row_ct.sql index e5b9c9a..dacf48b 100644 --- a/testgen/template/gen_funny_cat_tests/gen_test_row_ct.sql +++ b/testgen/template/gen_funny_cat_tests/gen_test_row_ct.sql @@ -1,5 +1,5 @@ -- Insert new tests where a locked test is not already present -INSERT INTO test_definitions (project_code, table_groups_id, profile_run_id, test_type, test_suite, +INSERT INTO test_definitions (project_code, table_groups_id, profile_run_id, test_type, test_suite, test_suite_id, schema_name, table_name, skip_errors, threshold_value, last_auto_gen_date, test_active, baseline_ct, profiling_as_of_date) @@ -28,6 +28,7 @@ WITH last_run AS (SELECT r.table_groups_id, MAX(run_date) AS last_run_date newtests AS (SELECT project_code, table_groups_id, profile_run_id, 'Row_Ct' AS test_type, '{TEST_SUITE}' AS test_suite, + '{TEST_SUITE_ID}'::UUID AS test_suite_id, schema_name, table_name, MAX(record_ct) as record_ct @@ -42,7 +43,7 @@ WITH last_run AS (SELECT r.table_groups_id, MAX(run_date) AS last_run_date GROUP BY project_code, table_groups_id, profile_run_id, test_type, test_suite, schema_name, table_name ) SELECT n.project_code, n.table_groups_id, n.profile_run_id, - n.test_type, n.test_suite, + n.test_type, n.test_suite, n.test_suite_id, n.schema_name, n.table_name, 0 as skip_errors, record_ct AS threshold_value, '{RUN_DATE}'::TIMESTAMP as last_auto_gen_date, diff --git a/testgen/template/gen_funny_cat_tests/gen_test_row_ct_pct.sql b/testgen/template/gen_funny_cat_tests/gen_test_row_ct_pct.sql index a18f15f..0113d01 100644 --- a/testgen/template/gen_funny_cat_tests/gen_test_row_ct_pct.sql +++ b/testgen/template/gen_funny_cat_tests/gen_test_row_ct_pct.sql @@ -1,5 +1,5 @@ -- Insert new tests where a locked test is not 
already present -INSERT INTO test_definitions (project_code, table_groups_id, profile_run_id, test_type, test_suite, +INSERT INTO test_definitions (project_code, table_groups_id, profile_run_id, test_type, test_suite, test_suite_id, schema_name, table_name, skip_errors, last_auto_gen_date, profiling_as_of_date, test_active, baseline_ct, threshold_value) @@ -29,6 +29,7 @@ WITH last_run AS (SELECT r.table_groups_id, MAX(run_date) AS last_run_date SELECT project_code, table_groups_id, profile_run_id, 'Row_Ct_Pct' AS test_type, '{TEST_SUITE}' AS test_suite, + '{TEST_SUITE_ID}'::UUID AS test_suite_id, schema_name, table_name, MAX(record_ct) as record_ct @@ -44,7 +45,7 @@ WITH last_run AS (SELECT r.table_groups_id, MAX(run_date) AS last_run_date test_type, test_suite, schema_name, table_name HAVING MAX(record_ct) >= 500) SELECT n.project_code, n.table_groups_id, n.profile_run_id, - n.test_type, n.test_suite, + n.test_type, n.test_suite, n.test_suite_id, n.schema_name, n.table_name, 0 as skip_errors, '{RUN_DATE}'::TIMESTAMP as last_auto_gen_date, '{AS_OF_DATE}'::TIMESTAMP as profiling_as_of_date, diff --git a/testgen/template/generation/gen_retrieve_or_insert_test_suite.sql b/testgen/template/generation/gen_retrieve_or_insert_test_suite.sql new file mode 100644 index 0000000..da89bc0 --- /dev/null +++ b/testgen/template/generation/gen_retrieve_or_insert_test_suite.sql @@ -0,0 +1,58 @@ +WITH existing_rec + AS ( SELECT tg.project_code, tg.connection_id, + cc.sql_flavor, + cc.project_host, + cc.project_port, + cc.project_user, + cc.project_db, + tg.table_group_schema, + s.export_to_observability, + s.test_suite, + s.id as test_suite_id, + cc.url, + cc.connect_by_url, + CURRENT_TIMESTAMP AT TIME ZONE + 'UTC' - CAST(tg.profiling_delay_days AS INTEGER) * INTERVAL '1 day' AS profiling_as_of_date + FROM table_groups tg + INNER JOIN connections cc + ON (tg.connection_id = cc.connection_id) + INNER JOIN test_suites s + ON (tg.id = s.table_groups_id + AND '{TEST_SUITE}' = 
s.test_suite) + WHERE tg.id = '{TABLE_GROUPS_ID}' ), +new_rec + AS ( INSERT INTO test_suites + (project_code, test_suite, connection_id, table_groups_id, test_suite_description, + component_type, component_key) + SELECT '{PROJECT_CODE}', '{TEST_SUITE}', {CONNECTION_ID}, '{TABLE_GROUPS_ID}', '{TEST_SUITE} Test Suite', + 'dataset', '{TEST_SUITE}' + WHERE NOT EXISTS + (SELECT 1 + FROM test_suites + WHERE table_groups_id = '{TABLE_GROUPS_ID}' + AND test_suite = '{TEST_SUITE}') + RETURNING id as test_suite_id, test_suite, table_groups_id, export_to_observability ) +SELECT project_code, connection_id, sql_flavor, + project_host, project_port, project_user, project_db, table_group_schema, + export_to_observability, test_suite, test_suite_id, url, connect_by_url, profiling_as_of_date + FROM existing_rec + UNION ALL +SELECT tg.project_code, tg.connection_id, + cc.sql_flavor, + cc.project_host, + cc.project_port, + cc.project_user, + cc.project_db, + tg.table_group_schema, + s.export_to_observability, + s.test_suite, + s.test_suite_id, + cc.url, + cc.connect_by_url, + CURRENT_TIMESTAMP AT TIME ZONE + 'UTC' - CAST(tg.profiling_delay_days AS INTEGER) * INTERVAL '1 day' AS profiling_as_of_date + FROM new_rec s +INNER JOIN table_groups tg + ON (s.table_groups_id = tg.id) +INNER JOIN connections cc + ON (tg.connection_id = cc.connection_id); \ No newline at end of file diff --git a/testgen/template/generation/gen_standard_tests.sql b/testgen/template/generation/gen_standard_tests.sql index b916420..b253f94 100644 --- a/testgen/template/generation/gen_standard_tests.sql +++ b/testgen/template/generation/gen_standard_tests.sql @@ -1,5 +1,5 @@ -- Insert new tests where a locked test is not already present -INSERT INTO test_definitions (project_code, table_groups_id, profile_run_id, test_type, test_suite, +INSERT INTO test_definitions (project_code, table_groups_id, profile_run_id, test_type, test_suite, test_suite_id, schema_name, table_name, column_name, skip_errors, test_active, 
last_auto_gen_date, profiling_as_of_date, {DEFAULT_PARM_COLUMNS} ) @@ -35,6 +35,7 @@ SELECT '{PROJECT_CODE}' as project_code, n.profile_run_id, '{TEST_TYPE}' AS test_type, '{TEST_SUITE}' AS test_suite, + '{TEST_SUITE_ID}' AS test_suite_id, n.schema_name, n.table_name, n.column_name, 0 as skip_errors, 'Y' as test_active, '{RUN_DATE}'::TIMESTAMP as last_auto_gen_date, '{AS_OF_DATE}'::TIMESTAMP as profiling_as_of_date, diff --git a/testgen/template/parms/parms_test_gen.sql b/testgen/template/parms/parms_test_gen.sql index 446e10c..ece395e 100644 --- a/testgen/template/parms/parms_test_gen.sql +++ b/testgen/template/parms/parms_test_gen.sql @@ -7,6 +7,7 @@ SELECT tg.project_code, tg.connection_id, tg.table_group_schema, s.export_to_observability, s.test_suite, + s.id::VARCHAR as test_suite_id, cc.url, cc.connect_by_url, CURRENT_TIMESTAMP AT TIME ZONE 'UTC' - CAST(tg.profiling_delay_days AS integer) * INTERVAL '1 day' as profiling_as_of_date diff --git a/testgen/template/profiling/functional_datatype.sql b/testgen/template/profiling/functional_datatype.sql index 65f187e..2389721 100644 --- a/testgen/template/profiling/functional_datatype.sql +++ b/testgen/template/profiling/functional_datatype.sql @@ -1,5 +1,4 @@ --- Updated script -- - +-- First Clear -- UPDATE profile_results SET functional_data_type = NULL, functional_table_type = NULL @@ -64,7 +63,6 @@ WHERE profile_run_id = '{PROFILE_RUN_ID}' . 
Check varchar attributes (or attributes not give date datatype) Look at min_length and max_length to determine if a field is date or timestamp - */ UPDATE profile_results @@ -108,9 +106,9 @@ SET functional_data_type = THEN 'Transactional Date (Qtr)' ELSE 'Date (TBD)' END - WHEN column_type = 'date' OR (general_type <> 'D' AND max_length <= 11 AND min_length > 0 ) + WHEN column_type = 'date' THEN 'Date Stamp' - WHEN column_type = 'timestamp' OR (general_type <> 'D' AND max_length > 11 AND min_length > 0) + WHEN column_type = 'timestamp' THEN 'DateTime Stamp' ELSE functional_data_type END @@ -118,6 +116,23 @@ WHERE profile_run_id = '{PROFILE_RUN_ID}' AND functional_data_type IS NULL AND (general_type = 'D' OR (value_ct = date_ct + zero_length_ct AND value_ct > 0)); +-- Character Date +UPDATE profile_results +SET functional_data_type = 'Date Stamp' +WHERE profile_run_id = '{PROFILE_RUN_ID}' + AND functional_data_type IS NULL + AND distinct_pattern_ct = 1 + AND min_text >= '1900' AND max_text <= '2200' + AND TRIM(SPLIT_PART(top_patterns, '|', 2)) = 'NNNN-NN-NN'; + +-- Character Timestamp +UPDATE profile_results +SET functional_data_type = 'DateTime Stamp' +WHERE profile_run_id = '{PROFILE_RUN_ID}' + AND functional_data_type IS NULL + AND distinct_pattern_ct = 1 + AND TRIM(SPLIT_PART(top_patterns, '|', 2)) = 'NNNN-NN-NN NN:NN:NN'; + -- Assign PERIODS: Period Year, Period Qtr, Period Month, Period Week, Period DOW UPDATE profile_results SET functional_data_type = 'Period Year' @@ -147,6 +162,19 @@ WHERE profile_run_id = '{PROFILE_RUN_ID}' AND SPLIT_PART(top_patterns, '|', 2) ~ '^\s*NNNN[-_]AN\s*$') ); +UPDATE profile_results +SET functional_data_type = 'Period Year-Mon' +WHERE profile_run_id = '{PROFILE_RUN_ID}' + AND functional_data_type IS NULL + AND column_name ILIKE '%mo%' + AND min_text >= '1900' AND max_text <= '2200' + AND ( + (avg_length BETWEEN 6.8 AND 7.2 + AND SPLIT_PART(top_patterns, '|', 2) ~ '^\s*NNNN[-_]NN\s*$') + OR (avg_length BETWEEN 7.8 AND 8.2 + AND 
UPPER(SPLIT_PART(top_patterns, '|', 2)) ~ '^\s*NNNN[-_]AAA\s*$') + ); + UPDATE profile_results SET functional_data_type = 'Period Month' WHERE profile_run_id = '{PROFILE_RUN_ID}' @@ -241,32 +269,69 @@ INNER JOIN profile_results s -- Assign Name UPDATE profile_results - SET functional_data_type = 'Person Name' + SET functional_data_type = 'Person Full Name' WHERE profile_run_id = '{PROFILE_RUN_ID}' AND functional_data_type IS NULL - AND column_name ~ '^(approver|full|contact|emp|employee|hcp|manager|mgr_|middle|nick|party|person|preferred|rep|reviewer|salesperson|spouse)(_| |)name$'; + AND avg_length <= 20 + AND avg_embedded_spaces BETWEEN 0.9 AND 2.0 + AND ( column_name ~ '(approver|full|contact|emp|employee|hcp|manager|mgr_|party|person|preferred|rep|reviewer|salesperson|spouse)(_| |)(name|nm)$' + OR column_name IN ('name', 'nm') ); -- Assign First Name UPDATE profile_results - SET functional_data_type = 'Person First Name' + SET functional_data_type = 'Person Given Name' WHERE profile_run_id = '{PROFILE_RUN_ID}' - AND column_name SIMILAR TO '%f(|i)rst%n(|a)m%%' AND avg_length <= 8 - AND avg_embedded_spaces < 0.2; + AND avg_embedded_spaces < 0.2 + AND (column_name SIMILAR TO '%f(|i)rst(_| |)n(|a)m%%' + OR column_name SIMILAR TO '%(middle|mdl)(_| |)n(|a)m%%' + OR column_name SIMILAR TO '%nick(_| |)n(|a)m%%'); -- Assign Last Name UPDATE profile_results SET functional_data_type = 'Person Last Name' WHERE profile_run_id = '{PROFILE_RUN_ID}' - AND column_name SIMILAR TO '%l(|a)st%n(|a)m%' + AND column_name SIMILAR TO '%l(|a)st(_| |)n(|a)m%' AND avg_length BETWEEN 5 and 8 - AND avg_embedded_spaces < 0.2; + AND avg_embedded_spaces < 0.2 + AND (column_name SIMILAR TO '%l(|a)st(_| |)n(|a)m%' + OR column_name SIMILAR TO '%sur(_| |)n(|a)m%'); UPDATE profile_results SET functional_data_type = 'Entity Name' WHERE profile_run_id = '{PROFILE_RUN_ID}' AND functional_data_type IS NULL - AND column_name ~ 
'^(|acct|account|affiliation|branch|business|co|comp|company|corp|corporate|cust|customer|distributor|employer|entity|firm|franchise|hco|org|organization|supplier|vendor|hospital|practice|clinic)(_| |)name$'; + AND general_type = 'A' + AND column_name ~ '(acct|account|affiliation|branch|business|co|comp|company|corp|corporate|cust|customer|distributor|employer|entity|firm|franchise|hco|org|organization|site|supplier|vendor|hospital|practice|clinic)(_| |)(name|nm)$'; + +-- Assign Boolean +/* + Boolean - If distinct_value_ct is equal to (1 or 2) and (min_text and max_text) values fall in the categories specified + Numeric column types are not boolean. + */ +UPDATE profile_results +SET functional_data_type = + CASE WHEN general_type = 'B' + OR (distinct_value_ct = 2 + AND ((LOWER(min_text) = 'no' AND LOWER(max_text) = 'yes') + OR (LOWER(min_text) = 'n' AND LOWER(max_text) = 'y') + OR (LOWER(min_text) = 'false' AND LOWER(max_text) = 'true') + OR (LOWER(min_text) = '0' AND LOWER(max_text) = '1') + OR (min_value = 0 AND max_value = 1 AND lower(column_type) NOT ILIKE '%numeric%'))) + THEN 'Boolean' + WHEN general_type = 'B' + OR (distinct_value_ct = 1 -- we can have only 1 value populated but it can still be boolean + AND ( (LOWER(min_text) in ('no','yes') AND LOWER(max_text) in ('no','yes')) + OR (LOWER(min_text) in ('n','y') AND LOWER(max_text) in ('n','y')) + OR (LOWER(min_text) in ('false','true') AND LOWER(max_text) in ('f','t')) + OR (LOWER(min_text) in ('0','1') AND LOWER(max_text) in ('0','1')) + OR (min_value = 0 AND max_value = 1 AND lower(column_type) NOT ILIKE '%numeric%'))) + THEN 'Boolean' + ELSE functional_data_type + END +WHERE profile_run_id = '{PROFILE_RUN_ID}' + AND functional_data_type IS NULL; + -- 4. 
Assign CODE, CATEGORY, ID, ATTRIBUTE & DESCRIPTION /* @@ -274,19 +339,21 @@ WHERE profile_run_id = '{PROFILE_RUN_ID}' Id - If more than 80% of records are populated and 95% are unique without spaces and consistent length and have a distinct record count of more than 200 Code - If more than 80% of records are populated and 95% are unique without spaces and consistent length - and have a distinct record count of less than or equal to 200 - . If distinct record count is more than 200 and the field has varying length, + and have a distinct record count of less than or equal to 200. + If distinct record count is more than 200 and the field has varying length, Attribute - Short length with less than 3 words Description - More than 3 words and longer length . If distinct record count is between 2 and 200, Code - No spaces (single word) with less than 15 maximum length Category - Spaces allowed, no restriction on length - */ - UPDATE profile_results SET functional_data_type = - CASE WHEN includes_digit_ct > 0 + CASE WHEN ( lower(column_name) ~ '_(average|avg|count|ct|sum|total|tot)$' + OR lower(column_name) ~ '^(average|avg|count|ct|sum|total|tot)_' ) + AND numeric_ct = value_ct + AND value_ct > 1 THEN 'Measurement Text' + WHEN includes_digit_ct > 0 AND ( (max_length <= 20 AND avg_embedded_spaces < 0.1 -- Short without spaces AND value_ct / NULLIF(record_ct, 0)::FLOAT > 0.8 -- mostly populated AND distinct_value_ct / NULLIF(value_ct, 0)::FLOAT > 0.95) -- mostly unique @@ -315,42 +382,24 @@ SET functional_data_type = END WHERE profile_run_id = '{PROFILE_RUN_ID}' AND functional_data_type IS NULL - AND general_type='A' AND functional_data_type IS NULL + AND general_type='A' AND LOWER(datatype_suggestion) SIMILAR TO '(%varchar%)'; - --- 5. Assign BOOLEAN & FLAG +-- 5. Assign FLAG /* - Boolean - If distinct_value_ct is equal to (1 or 2) and (min_text and max_text) values fall in the categories specified Flag - is set only if there is an unknown data type or if it's null. 
Alpha values with distinct_value_ct between 3 and 5, - Few, short words with only alpha characters. Numeric column types are not boolean. + Few, short words with only alpha characters. */ UPDATE profile_results SET functional_data_type = - CASE WHEN general_type = 'B' - OR (distinct_value_ct = 2 - AND ((LOWER(min_text) = 'no' AND LOWER(max_text) = 'yes') - OR (LOWER(min_text) = 'n' AND LOWER(max_text) = 'y') - OR (LOWER(min_text) = 'false' AND LOWER(max_text) = 'true') - OR (LOWER(min_text) = '0' AND LOWER(max_text) = '1') - OR (min_value = 0 AND max_value = 1 AND lower(column_type) NOT ILIKE '%numeric%'))) - THEN 'Boolean' - WHEN general_type = 'B' - OR (distinct_value_ct = 1 -- we can have only 1 value populated but it can still be boolean - AND ( (LOWER(min_text) in ('no','yes') AND LOWER(max_text) in ('no','yes')) - OR (LOWER(min_text) in ('n','y') AND LOWER(max_text) in ('n','y')) - OR (LOWER(min_text) in ('false','true') AND LOWER(max_text) in ('f','t')) - OR (LOWER(min_text) in ('0','1') AND LOWER(max_text) in ('0','1')) - OR (min_value = 0 AND max_value = 1 AND lower(column_type) NOT ILIKE '%numeric%'))) - THEN 'Boolean' + CASE WHEN general_type = 'A' AND distinct_value_ct BETWEEN 3 AND 5 AND (lower(column_type) NOT ILIKE '%numeric%' OR lower(datatype_suggestion) NOT ILIKE '%numeric%')-- should not be decimal AND (min_length > 1 AND max_length <= 7) - AND (functional_data_type IS NULL OR upper(functional_data_type) = 'UNKNOWN') - AND (fn_charcount(top_patterns, 'A') > 0) + AND functional_data_type IS NULL + AND fn_charcount(top_patterns, 'A') > 0 THEN 'Flag' - ELSE functional_data_type END WHERE profile_run_id = '{PROFILE_RUN_ID}' AND functional_data_type IS NULL; @@ -452,11 +501,23 @@ WHERE profile_results.profile_run_id = '{PROFILE_RUN_ID}' UPDATE profile_results SET functional_data_type = 'Measurement Pct' -WHERE functional_data_type IN ('Measurement', 'Measurement Discrete', 'UNKNOWN') +WHERE profile_run_id = '{PROFILE_RUN_ID}' + AND 
functional_data_type IN ('Measurement', 'Measurement Discrete', 'UNKNOWN') AND general_type = 'N' AND min_value >= -200 AND max_value <= 200 - AND (column_name ILIKE '%pct%' OR column_name ILIKE '%percent%') - AND profile_run_id = '{PROFILE_RUN_ID}'; + AND (column_name ILIKE '%pct%' OR column_name ILIKE '%percent%'); + +UPDATE profile_results +SET functional_data_type = 'Measurement Pct' +WHERE profile_run_id = '{PROFILE_RUN_ID}' + AND functional_data_type = 'Code' + AND distinct_pattern_ct between 1 and 3 + AND value_ct = includes_digit_ct + AND min_text >= '0' + AND max_text <= '99' + AND TRIM(SPLIT_PART(top_patterns, '|', 2)) ~ '^N{1,3}(\.N+)?%$' + AND (TRIM(SPLIT_PART(top_patterns, '|', 4)) ~ '^N{1,3}(\.N+)?%$' OR distinct_pattern_ct < 2) + AND (TRIM(SPLIT_PART(top_patterns, '|', 6)) ~ '^N{1,3}(\.N+)?%$' OR distinct_pattern_ct < 3); --- END OF QUERY --- diff --git a/testgen/template/profiling/secondary_profiling_columns.sql b/testgen/template/profiling/secondary_profiling_columns.sql index 5e0ba1d..2c56b92 100644 --- a/testgen/template/profiling/secondary_profiling_columns.sql +++ b/testgen/template/profiling/secondary_profiling_columns.sql @@ -4,14 +4,9 @@ SELECT schema_name, table_name, column_name FROM profile_results p - WHERE p.project_code = '{PROJECT_CODE}' - AND p.schema_name = '{DATA_SCHEMA}' - AND p.run_date = '{RUN_DATE}' + WHERE p.profile_run_id = '{PROFILE_RUN_ID}' AND p.top_freq_values IS NULL AND p.general_type = 'A' - AND p.distinct_value_ct BETWEEN 2 and 40 + AND p.distinct_value_ct BETWEEN 2 and 70 AND p.max_length <= 70 -/* - AND 10 * (p.max_length + 15) < 1200 - */ - ; +; diff --git a/testgen/template/validate_tests/ex_get_test_column_list_tg.sql b/testgen/template/validate_tests/ex_get_test_column_list_tg.sql index e0f40fd..4c30ac5 100644 --- a/testgen/template/validate_tests/ex_get_test_column_list_tg.sql +++ b/testgen/template/validate_tests/ex_get_test_column_list_tg.sql @@ -2,36 +2,64 @@ SELECT DISTINCT schema_name || '.' 
|| table_name || '.' || column_name AS column FROM ( SELECT cat_test_id, project_code, test_suite, - schema_name, - table_name, - UNNEST(STRING_TO_ARRAY(all_columns, '~|~')) AS column_name - FROM ( SELECT cat_test_id, - project_code, - test_suite, - schema_name, - table_name, - CONCAT_WS('~|~', column_name, - groupby_names, - window_date_column) AS all_columns - FROM test_definitions d - INNER JOIN test_types t - ON d.test_type = t.test_type - WHERE project_code = '{PROJECT_CODE}' - AND test_suite = '{TEST_SUITE}' - AND t.test_scope = 'column' - - UNION - SELECT cat_test_id, - project_code, - test_suite, - match_schema_name AS schema_name, - match_table_name AS table_name, - CONCAT_WS('~|~', - match_column_names, - match_groupby_names) AS all_columns - FROM test_definitions d - INNER JOIN test_types t - ON d.test_type = t.test_type - WHERE project_code = '{PROJECT_CODE}' - AND test_suite = '{TEST_SUITE}' - AND t.test_scope = 'column') a ) b; + schema_name AS schema_name, + table_name AS table_name, + TRIM(UNNEST(STRING_TO_ARRAY(column_name, ','))) as column_name + FROM test_definitions d + INNER JOIN test_types t + ON d.test_type = t.test_type + WHERE project_code = '{PROJECT_CODE}' + AND test_suite = '{TEST_SUITE}' + AND t.test_scope IN ('column', 'referential') + UNION + SELECT cat_test_id, + project_code, + test_suite, + schema_name AS schema_name, + table_name AS table_name, + TRIM(UNNEST(STRING_TO_ARRAY(groupby_names, ','))) as column_name + FROM test_definitions d + INNER JOIN test_types t + ON d.test_type = t.test_type + WHERE project_code = '{PROJECT_CODE}' + AND test_suite = '{TEST_SUITE}' + AND t.test_scope IN ('column', 'referential') + UNION + SELECT cat_test_id, + project_code, + test_suite, + schema_name AS schema_name, + table_name AS table_name, + TRIM(UNNEST(STRING_TO_ARRAY(window_date_column, ','))) as column_name + FROM test_definitions d + INNER JOIN test_types t + ON d.test_type = t.test_type + WHERE project_code = '{PROJECT_CODE}' + AND 
test_suite = '{TEST_SUITE}' + AND t.test_scope IN ('column', 'referential') + UNION + SELECT cat_test_id, + project_code, + test_suite, + match_schema_name AS schema_name, + match_table_name AS table_name, + TRIM(UNNEST(STRING_TO_ARRAY(match_column_names, ','))) as column_name + FROM test_definitions d + INNER JOIN test_types t + ON d.test_type = t.test_type + WHERE project_code = '{PROJECT_CODE}' + AND test_suite = '{TEST_SUITE}' + AND t.test_scope = 'referential' + UNION + SELECT cat_test_id, + project_code, + test_suite, + match_schema_name AS schema_name, + match_table_name AS table_name, + TRIM(UNNEST(STRING_TO_ARRAY(match_groupby_names, ','))) as column_name + FROM test_definitions d + INNER JOIN test_types t + ON d.test_type = t.test_type + WHERE project_code = '{PROJECT_CODE}' + AND test_suite = '{TEST_SUITE}' + AND t.test_scope = 'referential' ) cols; diff --git a/testgen/ui/queries/table_group_queries.py b/testgen/ui/queries/table_group_queries.py index a1ce0d5..4e047ee 100644 --- a/testgen/ui/queries/table_group_queries.py +++ b/testgen/ui/queries/table_group_queries.py @@ -10,6 +10,8 @@ def _get_select_statement(schema): profiling_include_mask, profiling_exclude_mask, profiling_table_set, profile_id_column_mask, profile_sk_column_mask, + data_source, source_system, data_location, business_domain, + transform_level, source_process, stakeholder_group, profile_use_sampling, profile_sample_percent, profile_sample_min_count, profiling_delay_days FROM {schema}.table_groups @@ -88,7 +90,14 @@ def edit(schema, table_group): profile_use_sampling='{'Y' if table_group["profile_use_sampling"] else 'N'}', profile_sample_percent='{table_group["profile_sample_percent"]}', profile_sample_min_count={int(table_group["profile_sample_min_count"])}, - profiling_delay_days='{table_group["profiling_delay_days"]}' + profiling_delay_days='{table_group["profiling_delay_days"]}', + data_source='{table_group["data_source"]}', + source_system='{table_group["source_system"]}', + 
data_location='{table_group["data_location"]}', + business_domain='{table_group["business_domain"]}', + transform_level='{table_group["transform_level"]}', + source_process='{table_group["source_process"]}', + stakeholder_group='{table_group["stakeholder_group"]}' where id = '{table_group["id"]}' ; @@ -112,7 +121,14 @@ def add(schema, table_group): profile_use_sampling, profile_sample_percent, profile_sample_min_count, - profiling_delay_days) + profiling_delay_days, + data_source, + source_system, + data_location, + business_domain, + transform_level, + source_process, + stakeholder_group) SELECT gen_random_uuid(), '{table_group["project_code"]}', @@ -126,7 +142,14 @@ def add(schema, table_group): '{table_group["profile_sk_column_mask"]}'::character varying, '{'Y' if table_group["profile_use_sampling"]=='True' else 'N' }'::character varying, '{table_group["profile_sample_percent"]}'::character varying, - {table_group["profile_sample_min_count"]}, '{table_group["profiling_delay_days"]}'::character varying + {table_group["profile_sample_min_count"]}, '{table_group["profiling_delay_days"]}'::character varying, + '{table_group["data_source"]}', + '{table_group["source_system"]}', + '{table_group["data_location"]}', + '{table_group["business_domain"]}', + '{table_group["transform_level"]}', + '{table_group["source_process"]}', + '{table_group["stakeholder_group"]}' ;""" db.execute_sql(sql) st.cache_data.clear() diff --git a/testgen/ui/queries/test_definition_queries.py b/testgen/ui/queries/test_definition_queries.py index 7e68d5f..6e27e11 100644 --- a/testgen/ui/queries/test_definition_queries.py +++ b/testgen/ui/queries/test_definition_queries.py @@ -15,22 +15,6 @@ def update_attribute(schema, test_definition_ids, attribute, value): st.cache_data.clear() -def get_test_definition_uniqueness(schema, test_definition): - sql = f""" - SELECT COUNT(*) - FROM {schema}.test_definitions d - WHERE True - """ - - sql += f" AND d.table_groups_id = 
'{test_definition['table_groups_id']}' \n" - sql += f" AND d.test_suite = NULLIF('{test_definition['test_suite']}','') \n" - sql += f" AND d.table_name = NULLIF('{test_definition['table_name']}','') \n" - sql += f" AND d.column_name = NULLIF('{test_definition['column_name']}','') \n" - sql += f" AND d.test_type = NULLIF('{test_definition['test_type']}','') \n" - - return db.retrieve_data(sql) - - @st.cache_data(show_spinner=False) def get_test_definitions(schema, project_code, test_suite, table_name, column_name, test_definition_ids): if table_name: @@ -45,7 +29,7 @@ def get_test_definitions(schema, project_code, test_suite, table_name, column_na SELECT d.schema_name, d.table_name, d.column_name, t.test_name_short, t.test_name_long, d.id::VARCHAR(50), - d.project_code, d.table_groups_id::VARCHAR(50), d.test_suite, + d.project_code, d.table_groups_id::VARCHAR(50), d.test_suite, d.test_suite_id::VARCHAR, d.test_type, d.cat_test_id::VARCHAR(50), d.test_active, CASE WHEN d.test_active = 'Y' THEN 'Yes' ELSE 'No' END as test_active_display, @@ -78,7 +62,7 @@ def get_test_definitions(schema, project_code, test_suite, table_name, column_na d.test_mode FROM {schema}.test_definitions d INNER JOIN {schema}.test_types t ON (d.test_type = t.test_type) - INNER JOIN {schema}.test_suites s ON (d.test_suite = s.test_suite) + INNER JOIN {schema}.test_suites s ON (d.test_suite_id = s.id) WHERE True """ @@ -176,6 +160,7 @@ def add(schema, test_definition): profile_run_id, test_type, test_suite, + test_suite_id, test_description, test_action, test_mode, @@ -221,6 +206,7 @@ def add(schema, test_definition): NULL AS profile_run_id, NULLIF('{test_definition["test_type"]}', '') as test_type, NULLIF('{test_definition["test_suite"]}', '') as test_suite, + '{test_definition["test_suite_id"]}'::UUID as test_suite_id, NULLIF('{test_definition["test_description"]}', '') as test_description, NULLIF('{test_definition["test_action"]}', '') as test_action, NULLIF('{test_definition["test_mode"]}', 
'') as test_mode, diff --git a/testgen/ui/services/test_definition_service.py b/testgen/ui/services/test_definition_service.py index aa33196..f64cf81 100644 --- a/testgen/ui/services/test_definition_service.py +++ b/testgen/ui/services/test_definition_service.py @@ -49,14 +49,6 @@ def update(test_definition): return test_definition_queries.update(schema, test_definition) -def check_test_definition_uniqueness(test_definition): - schema = st.session_state["dbschema"] - prepare_to_persist(test_definition) - record_count_raw = test_definition_queries.get_test_definition_uniqueness(schema, test_definition) - record_count = record_count_raw.iloc[0, 0] - return record_count - - def prepare_to_persist(test_definition): # severity if test_definition["severity"] and test_definition["severity"].startswith("Inherited"): diff --git a/testgen/ui/views/profiling_anomalies.py b/testgen/ui/views/profiling_anomalies.py index 76dd3e4..e8cce84 100644 --- a/testgen/ui/views/profiling_anomalies.py +++ b/testgen/ui/views/profiling_anomalies.py @@ -3,7 +3,6 @@ import plotly.express as px import streamlit as st -import testgen.ui.queries.profiling_queries as profiling_queries import testgen.ui.services.database_service as db import testgen.ui.services.form_service as fm import testgen.ui.services.query_service as dq @@ -11,7 +10,7 @@ from testgen.ui.components import widgets as testgen from testgen.ui.navigation.page import Page from testgen.ui.session import session -from testgen.ui.views.profiling_details import show_profiling_detail +from testgen.ui.views.profiling_modal import view_profiling_modal class ProfilingAnomaliesPage(Page): @@ -145,8 +144,9 @@ def render(self) -> None: with col2: # _, v_col2 = st.columns([0.3, 0.7]) v_col1, v_col2 = st.columns([0.5, 0.5]) - view_profiling( - v_col1, selected_row["table_name"], selected_row["column_name"], str_profile_run_id + view_profiling_modal( + v_col1, selected_row["table_name"], selected_row["column_name"], + 
str_profile_run_id=str_profile_run_id ) view_bad_data(v_col2, selected_row) @@ -478,23 +478,6 @@ def view_bad_data(button_container, selected_row): st.dataframe(df_bad, height=500, width=1050, hide_index=True) -def view_profiling(button_container, str_table_name, str_column_name, str_profiling_run_id): - str_header = f"Column: {str_column_name}, Table: {str_table_name}" - - df = profiling_queries.get_profiling_detail(str_profiling_run_id, str_table_name, str_column_name) - - profiling_modal = testgen.Modal(title=None, key="dk-anomaly-profiling-modal", max_width=1100) - - with button_container: - if st.button(":green[Profiling →]", help="Review profiling for highlighted column", use_container_width=True): - profiling_modal.open() - - if profiling_modal.is_open(): - with profiling_modal.container(): - fm.render_modal_header(str_header, None) - show_profiling_detail(df.iloc[0], 300) - - def do_disposition_update(selected, str_new_status): str_result = None if selected: diff --git a/testgen/ui/views/profiling_modal.py b/testgen/ui/views/profiling_modal.py new file mode 100644 index 0000000..5ebeeb1 --- /dev/null +++ b/testgen/ui/views/profiling_modal.py @@ -0,0 +1,40 @@ +import logging + +import streamlit as st + +import testgen.ui.queries.profiling_queries as profiling_queries +import testgen.ui.services.form_service as fm +from testgen.ui.components import widgets as testgen +from testgen.ui.views.profiling_details import show_profiling_detail + +LOG = logging.getLogger("testgen") + +BUTTON_TEXT = ":green[Profiling →]" # Profiling ⚲ +BUTTON_HELP = "Review profiling for highlighted column" +FORM_HEADER = "Profiling Results" + + +def view_profiling_modal(button_container, str_table_name, str_column_name, + str_profile_run_id=None, str_table_groups_id=None): + str_prompt = f"Column: {str_column_name}, Table: {str_table_name}" + + modal_viewer = testgen.Modal(title=None, key="dk-view", max_width=1100) + + with button_container: + if st.button( + BUTTON_TEXT, 
help=BUTTON_HELP, use_container_width=True + ): + modal_viewer.open() + + if modal_viewer.is_open(): + with modal_viewer.container(): + if not str_profile_run_id: + if str_table_groups_id: + str_profile_run_id = profiling_queries.get_latest_profile_run(str_table_groups_id) + + if str_profile_run_id: + df = profiling_queries.get_profiling_detail(str_profile_run_id, str_table_name, str_column_name) + if not df.empty: + fm.render_modal_header(str_title=FORM_HEADER, str_prompt=str_prompt) + show_profiling_detail(df.iloc[0], 300) + diff --git a/testgen/ui/views/table_groups.py b/testgen/ui/views/table_groups.py index cfe91ff..962b903 100644 --- a/testgen/ui/views/table_groups.py +++ b/testgen/ui/views/table_groups.py @@ -12,6 +12,7 @@ from testgen.commands.run_profiling_bridge import run_profiling_in_background from testgen.ui.components import widgets as testgen from testgen.ui.navigation.page import Page +from testgen.ui.services.string_service import empty_if_null from testgen.ui.session import session @@ -142,6 +143,15 @@ def show_record_detail(selected, profile_cli_command_modal, profile_command_moda "profiling_table_set", "profile_id_column_mask", "profile_sk_column_mask", + + "data_source", + "source_system", + "data_location", + "business_domain", + "transform_level", + "source_process", + "stakeholder_group", + "profile_use_sampling", "profile_sample_percent", "profile_sample_min_count", @@ -159,6 +169,15 @@ def show_record_detail(selected, profile_cli_command_modal, profile_command_moda "Explicit Table List", "ID Column Mask", "Surrogate Key Column Mask", + + "Data Source", + "Source System", + "Data Location", + "Business Domain", + "Transform Level", + "Source Process", + "Stakeholder Group", + "Uses Record Sampling", "Sample Record Percent", "Sample Minimum Record Count", @@ -303,10 +322,15 @@ def show_add_or_edit_modal(modal, mode, project_code, connection, selected=None) profiling_delay_days = int(selected_table_group["profiling_delay_days"]) if mode == 
"edit" else 0 left_column, right_column = st.columns([0.50, 0.50]) + profile_sampling_expander = st.expander("Sampling Parameters", expanded=False) with profile_sampling_expander: expander_left_column, expander_right_column = st.columns([0.50, 0.50]) + provenance_expander = st.expander("Data Provenance (Optional)", expanded=False) + with provenance_expander: + provenance_left_column, provenance_right_column = st.columns([0.50, 0.50]) + with st.form("Table Group Add / Edit", clear_on_submit=True): entity = { "id": table_group_id, @@ -382,6 +406,52 @@ def show_add_or_edit_modal(modal, mode, project_code, connection, selected=None) value=profile_sample_min_count, help="The minimum number of records to be included in any sample (if available)", ), + "data_source": provenance_left_column.text_input( + label="Data Source", + max_chars=40, + value=empty_if_null(selected_table_group["data_source"]) if mode == "edit" else "", + help="Original source of all tables in this dataset. This can be overridden at the table level. (Optional)", + ), + "source_system": provenance_left_column.text_input( + label="System of Origin", + max_chars=40, + value=empty_if_null(selected_table_group["source_system"]) if mode == "edit" else "", + help="Enterprise system source for all tables in this dataset. " + "This can be overridden at the table level. (Optional)", + ), + "business_domain": provenance_left_column.text_input( + label="Business Domain", + max_chars=40, + value=empty_if_null(selected_table_group["business_domain"]) if mode == "edit" else "", + help="Business division responsible for all tables in this dataset. " + "e.g. Finance, Sales, Manufacturing. (Optional)", + ), + "data_location": provenance_left_column.text_input( + label="Location", + max_chars=40, + value=empty_if_null(selected_table_group["data_location"]) if mode == "edit" else "", + help="Physical or virtual location of all tables in this dataset. " + "e.g. Headquarters, Cloud, etc. 
(Optional)", + ), + "transform_level": provenance_right_column.text_input( + label="Transform Level", + max_chars=40, + value=empty_if_null(selected_table_group["transform_level"]) if mode == "edit" else "", + help="Data warehouse processing layer. " + "Indicates the processing stage: e.g. Raw, Conformed, Processed, Reporting. (Optional)", + ), + "source_process": provenance_right_column.text_input( + label="Source Process", + max_chars=40, + value=empty_if_null(selected_table_group["source_process"]) if mode == "edit" else "", + help="The process, program or data flow that produced this data. (Optional)", + ), + "stakeholder_group": provenance_right_column.text_input( + label="Stakeholder Group", + max_chars=40, + value=empty_if_null(selected_table_group["stakeholder_group"]) if mode == "edit" else "", + help="Designator for data owners or stakeholders who are responsible for this data. (Optional)", + ), } submit_button_text = "Save" if mode == "edit" else "Add" diff --git a/testgen/ui/views/test_definitions.py b/testgen/ui/views/test_definitions.py index 74b1af7..a96da3b 100644 --- a/testgen/ui/views/test_definitions.py +++ b/testgen/ui/views/test_definitions.py @@ -16,6 +16,7 @@ from testgen.ui.services import authentication_service from testgen.ui.services.string_service import empty_if_null, snake_case_to_title_case from testgen.ui.session import session +from testgen.ui.views.profiling_modal import view_profiling_modal LOG = logging.getLogger("testgen") @@ -88,7 +89,8 @@ def render(self, **_) -> None: add_test_definition_modal.open() selected = show_test_defs_grid( - session.project, str_test_suite, str_table_name, str_column_name, do_multi_select, export_container + session.project, str_test_suite, str_table_name, str_column_name, do_multi_select, export_container, + str_table_groups_id ) # Display buttons @@ -301,6 +303,7 @@ def show_add_edit_modal( table_groups_id = selected_test_def["table_groups_id"] if mode == "edit" else table_group["id"] 
profile_run_id = selected_test_def["profile_run_id"] if mode == "edit" else "" test_suite_name = selected_test_def["test_suite"] if mode == "edit" else test_suite["test_suite"] + test_suite_id = test_suite["id"] test_action = empty_if_null(selected_test_def["test_action"]) if mode == "edit" else "" schema_name = selected_test_def["schema_name"] if mode == "edit" else table_group["table_group_schema"] table_name = empty_if_null(selected_test_def["table_name"]) if mode == "edit" else empty_if_null(str_table_name) @@ -398,6 +401,7 @@ def show_add_edit_modal( "test_suite": left_column.text_input( label="Test Suite Name", max_chars=200, value=test_suite_name, disabled=True ), + "test_suite_id": test_suite_id, "test_description": left_column.text_area( label="Test Description Override", max_chars=1000, @@ -629,12 +633,8 @@ def show_add_edit_modal( test_definition_service.update(test_definition) test_definition_modal.close() else: - error_message = validate_test_definition_uniqueness(test_definition, test_scope) - if error_message is not None: - st.error(error_message) - else: - test_definition_service.add(test_definition) - test_definition_modal.close() + test_definition_service.add(test_definition) + test_definition_modal.close() def validate_form(test_scope, test_type, test_definition, column_name_label): @@ -701,10 +701,11 @@ def update_test_definition(selected, attribute, value, message): def show_test_defs_grid( - str_project_code, str_test_suite_id, str_table_name, str_column_name, do_multi_select, export_container + str_project_code, str_test_suite, str_table_name, str_column_name, do_multi_select, export_container, + str_table_groups_id ): df = test_definition_service.get_test_definitions( - str_project_code, str_test_suite_id, str_table_name, str_column_name + str_project_code, str_test_suite, str_table_name, str_column_name ) date_service.accommodate_dataframe_to_timezone(df, st.session_state) @@ -779,7 +780,7 @@ def show_test_defs_grid( fm.render_excel_export( 
df, lst_export_columns, - f"Test Definitions for Test Suite {str_test_suite_id}", + f"Test Definitions for Test Suite {str_test_suite}", "{TIMESTAMP}", lst_wrap_columns, lst_export_headers, @@ -834,6 +835,13 @@ def show_test_defs_grid( int_data_width=700, lst_labels=labels, ) + + _, col_profile_button = right_column.columns([0.7, 0.3]) + view_profiling_modal( + col_profile_button, selected_row["table_name"], selected_row["column_name"], + str_table_groups_id=str_table_groups_id + ) + with right_column: st.write(generate_test_defs_help(row_selected["test_type"])) @@ -857,9 +865,9 @@ def generate_test_defs_help(str_test_type): **Default Test Severity:** {row["default_severity"]} -**Test Run Type:** {row["run_type"]} - - CAT tests are consolidated into aggregate queries and run faster. - - QUERY tests are executed individually and may take longer to run. +**Test Run Type:** {row["test_scope"]} + - COLUMN tests are consolidated into aggregate queries and execute faster. + - TABLE, REFERENTIAL and CUSTOM tests are executed individually and may take longer to run. 
**Data Quality Dimension:** {row["dq_dimension"]} """ diff --git a/testgen/ui/views/test_results.py b/testgen/ui/views/test_results.py index 8e58124..33b9b24 100644 --- a/testgen/ui/views/test_results.py +++ b/testgen/ui/views/test_results.py @@ -6,7 +6,6 @@ import plotly.graph_objects as go import streamlit as st -import testgen.ui.queries.profiling_queries as profiling_queries import testgen.ui.services.database_service as db import testgen.ui.services.form_service as fm import testgen.ui.services.query_service as dq @@ -16,7 +15,7 @@ from testgen.ui.navigation.page import Page from testgen.ui.services.string_service import empty_if_null from testgen.ui.session import session -from testgen.ui.views.profiling_details import show_profiling_detail +from testgen.ui.views.profiling_modal import view_profiling_modal from testgen.ui.views.test_definitions import show_add_edit_modal_by_test_definition ALWAYS_SPIN = False @@ -55,15 +54,16 @@ def render(self) -> None: tool_bar = tb.ToolBar(3, 1, 4, None) # Lookup Test Run - df = get_drill_test_run(str_sel_test_run) - if not df.empty: - with tool_bar.long_slots[0]: - time_columns = ["test_date"] - date_service.accommodate_dataframe_to_timezone(df, st.session_state, time_columns) - df["description"] = df["test_date"] + " | " + df["test_suite_description"] - str_sel_test_run = fm.render_select( - "Test Run", df, "description", "test_run_id", boo_required=True, boo_disabled=True - ) + if str_sel_test_run: + df = get_drill_test_run(str_sel_test_run) + if not df.empty: + with tool_bar.long_slots[0]: + time_columns = ["test_date"] + date_service.accommodate_dataframe_to_timezone(df, st.session_state, time_columns) + df["description"] = df["test_date"] + " | " + df["test_suite_description"] + str_sel_test_run = fm.render_select( + "Test Run", df, "description", "test_run_id", boo_required=True, boo_disabled=True + ) if str_sel_test_run: with tool_bar.long_slots[1]: @@ -204,9 +204,12 @@ def get_test_results_uncached(str_schema, 
str_run_id, str_sel_test_status): END as execution_error_ct, r.project_code, r.table_groups_id::VARCHAR, r.id::VARCHAR as test_result_id, r.test_run_id::VARCHAR, - c.id::VARCHAR as connection_id, ts.id::VARCHAR as test_suite_id, + c.id::VARCHAR as connection_id, r.test_suite_id::VARCHAR, r.test_definition_id::VARCHAR as test_definition_id_runtime, - d.id::VARCHAR as test_definition_id_current, + CASE + WHEN r.auto_gen = TRUE THEN d.id + ELSE r.test_definition_id + END::VARCHAR as test_definition_id_current, r.auto_gen FROM run_results r INNER JOIN {str_schema}.test_types tt @@ -214,12 +217,12 @@ def get_test_results_uncached(str_schema, str_run_id, str_sel_test_status): LEFT JOIN {str_schema}.test_definitions rd ON (r.test_definition_id = rd.id) LEFT JOIN {str_schema}.test_definitions d - ON (r.project_code = d.project_code - AND r.test_suite = d.test_suite - AND r.schema_name = d.schema_name + ON (r.test_suite_id = d.test_suite_id AND r.table_name = d.table_name AND r.column_names = COALESCE(d.column_name, 'N/A') - AND r.test_type = d.test_type) + AND r.test_type = d.test_type + AND r.auto_gen = TRUE + AND d.last_auto_gen_date IS NOT NULL) INNER JOIN {str_schema}.test_suites ts ON (r.project_code = ts.project_code AND r.test_suite = ts.test_suite) @@ -280,16 +283,17 @@ def get_test_result_summary(str_run_id): @st.cache_data(show_spinner=ALWAYS_SPIN) -def get_test_result_history(str_test_type, str_table_groups_id, str_table_name, str_column_names, +def get_test_result_history(str_test_type, str_test_suite_id, str_table_name, str_column_names, str_test_definition_id, auto_gen): str_schema = st.session_state["dbschema"] if auto_gen: str_where = f""" - WHERE table_groups_id = '{str_table_groups_id}' + WHERE test_suite_id = '{str_test_suite_id}' AND table_name = '{str_table_name}' AND column_names = '{str_column_names}' AND test_type = '{str_test_type}' + AND auto_gen = TRUE """ else: str_where = f""" @@ -644,7 +648,7 @@ def show_result_detail(str_run_id, 
str_sel_test_status, do_multi_select, export_ selected_row = selected_rows[len(selected_rows) - 1] dfh = get_test_result_history( selected_row["test_type"], - selected_row["table_groups_id"], + selected_row["test_suite_id"], selected_row["table_name"], selected_row["column_names"], selected_row["test_definition_id_runtime"], @@ -660,14 +664,14 @@ def show_result_detail(str_run_id, str_sel_test_status, do_multi_select, export_ with pg_col2: v_col1, v_col2, v_col3 = st.columns([0.33, 0.33, 0.33]) view_edit_test(v_col1, selected_row["test_definition_id_current"]) - view_profiling( - v_col2, selected_row["table_name"], selected_row["column_names"], selected_row["table_groups_id"] + view_profiling_modal( + v_col2, selected_row["table_name"], selected_row["column_names"], + str_table_groups_id=selected_row["table_groups_id"] ) view_bad_data(v_col3, selected_row) with pg_col1: fm.show_subheader(selected_row["test_name_short"]) - # st.caption(selected_row["test_name_long"]) st.markdown(f"###### {selected_row['test_description']}") st.caption(empty_if_null(selected_row["measure_uom_description"])) fm.render_grid_select(dfh, show_hist_columns) @@ -885,28 +889,6 @@ def view_bad_data(button_container, selected_row): st.dataframe(df_bad, height=500, width=1050, hide_index=True) -def view_profiling(button_container, str_table_name, str_column_name, str_table_groups_id): - str_header = f"Column: {str_column_name}, Table: {str_table_name}" - - # Retrieve latest profiling - str_profiling_run_id = profiling_queries.get_latest_profile_run(str_table_groups_id) - if str_profiling_run_id: - df = profiling_queries.get_profiling_detail(str_profiling_run_id, str_table_name, str_column_name) - if not df.empty: - profiling_modal = testgen.Modal(title=None, key="dk-anomaly-profiling-modal", max_width=1100) - - with button_container: - if st.button( - ":green[Profiling →]", help="Review profiling for highlighted column", use_container_width=True - ): - profiling_modal.open() - - if 
profiling_modal.is_open(): - with profiling_modal.container(): - fm.render_modal_header(str_header, None) - show_profiling_detail(df.iloc[0], 300) - - def view_edit_test(button_container, test_definition_id): edit_test_definition_modal = testgen.Modal(title=None, key="dk-test-definition-edit-modal", max_width=1100) with button_container: diff --git a/testgen/ui/views/test_suites.py b/testgen/ui/views/test_suites.py index 0d27907..b213127 100644 --- a/testgen/ui/views/test_suites.py +++ b/testgen/ui/views/test_suites.py @@ -430,7 +430,7 @@ def show_add_or_edit_modal(modal, mode, project_code, connection, table_group, s if " " in entity["test_suite"]: proposed_test_suite = entity["test_suite"].replace(" ", "-") st.error( - f"Blank spaces not allowed in field 'Test Suite Name'. Use - instead. i.e.: {proposed_test_suite}" + f"Blank spaces not allowed in field 'Test Suite Name'. Use dash or underscore instead. i.e.: {proposed_test_suite}" ) else: if mode == "edit": From 0de8355cf1141861a1fdb88aa4b15601f1dc7a0d Mon Sep 17 00:00:00 2001 From: "Chip.Bloche" Date: Tue, 16 Jul 2024 09:45:07 -0400 Subject: [PATCH 15/22] misc(ui): change wording from profiling anomalies to hygiene issues in ui --- testgen/ui/views/profiling_anomalies.py | 36 ++++++++++++------------- testgen/ui/views/profiling_details.py | 2 +- testgen/ui/views/profiling_summary.py | 4 +-- 3 files changed, 21 insertions(+), 21 deletions(-) diff --git a/testgen/ui/views/profiling_anomalies.py b/testgen/ui/views/profiling_anomalies.py index e8cce84..7075c1e 100644 --- a/testgen/ui/views/profiling_anomalies.py +++ b/testgen/ui/views/profiling_anomalies.py @@ -14,19 +14,19 @@ class ProfilingAnomaliesPage(Page): - path = "profiling/anomalies" + path = "profiling/hygiene" can_activate: typing.ClassVar = [ lambda: session.authentication_status or "login", ] def render(self) -> None: export_container = fm.render_page_header( - "Profiling Anomalies", + "Hygiene Issues", 
"https://docs.datakitchen.io/article/dataops-testgen-help/profile-anomalies", lst_breadcrumbs=[ {"label": "Overview", "path": "overview"}, {"label": "Data Profiling", "path": "profiling"}, - {"label": "Profiling Anomalies", "path": None}, + {"label": "Hygiene Issues", "path": None}, ], ) @@ -61,14 +61,14 @@ def render(self) -> None: str_likelihood = st.selectbox("Issue Likelihood", lst_status_options) with tool_bar.short_slots[0]: - str_help = "Toggle on to perform actions on multiple anomalies" + str_help = "Toggle on to perform actions on multiple Hygiene Issues" do_multi_select = st.toggle("Multi-Select", help=str_help) if str_table_groups_id: # Get summary counts df_sum = get_profiling_anomaly_summary(str_profile_run_id) - # Get anomaly list + # Get hygiene issue list df_pa = get_profiling_anomalies(str_profile_run_id, str_likelihood) # Retrieve disposition action (cache refreshed) @@ -90,7 +90,7 @@ def render(self) -> None: ] # TODO: Can we reintegrate percents below: # tool_bar.set_prompt( - # f"Anomalies Found: {df_sum.at[0, 'issue_ct']} issues in {df_sum.at[0, 'column_ct']} columns, {df_sum.at[0, 'table_ct']} tables in schema {df_pa.loc[0, 'schema_name']}" + # f"Hygiene Issues Found: {df_sum.at[0, 'issue_ct']} issues in {df_sum.at[0, 'column_ct']} columns, {df_sum.at[0, 'table_ct']} tables in schema {df_pa.loc[0, 'schema_name']}" # ) # Show main grid and retrieve selections selected = fm.render_grid_select( @@ -111,7 +111,7 @@ def render(self) -> None: ] lst_wrap_columns = ["anomaly_description", "suggested_action"] fm.render_excel_export( - df_pa, lst_export_columns, "Profiling Anomalies", "{TIMESTAMP}", lst_wrap_columns + df_pa, lst_export_columns, "Hygiene Screen", "{TIMESTAMP}", lst_wrap_columns ) if selected: @@ -120,7 +120,7 @@ def render(self) -> None: else: selected_row = None - # Display anomaly detail for selected row + # Display hygiene issue detail for selected row if not selected_row: st.markdown(":orange[Select a record to see more 
information.]") else: @@ -138,7 +138,7 @@ def render(self) -> None: "likelihood_explanation", "suggested_action", ], - "Anomaly Information", + "Hygiene Issue Detail", int_data_width=700, ) with col2: @@ -186,11 +186,11 @@ def render(self) -> None: lst_cached_functions=[get_anomaly_disposition, get_profiling_anomaly_summary], ) else: - tool_bar.set_prompt("No Anomalies Found") + tool_bar.set_prompt("No Hygiene Issues Found") # Help Links st.markdown( - "[Help on Anomalies](https://docs.datakitchen.io/article/dataops-testgen-help/profile-anomalies)" + "[Help on Hygiene Issues](https://docs.datakitchen.io/article/dataops-testgen-help/profile-anomalies)" ) # with st.sidebar: @@ -352,11 +352,11 @@ def replace_parms(str_query): lst_query[0]["connect_by_url"], ) if df.empty: - return "ND", "Data that violates Anomaly criteria is not present in the current dataset.", None + return "ND", "Data that violates Hygiene Issue criteria is not present in the current dataset.", None else: return "OK", None, df else: - return "NA", "A source data lookup for this Anomaly is not available.", None + return "NA", "A source data lookup for this Issue is not available.", None except Exception as e: return "ERR", f"Source data lookup query caused an error:\n\n{e.args[0]}", None @@ -432,7 +432,7 @@ def write_frequency_graph(df_tests): df_count = df_count.sort_values(by="frequency", ascending=True) # Create a horizontal bar chart using Plotly Express - fig = px.bar(df_count, x="frequency", y="anomaly_name", orientation="h", title="Anomaly Frequency") + fig = px.bar(df_count, x="frequency", y="anomaly_name", orientation="h", title="Issue Frequency") fig.update_layout(title_font={"color": "green"}, paper_bgcolor="rgba(0,0,0,0)", plot_bgcolor="rgba(0,0,0,0)") if len(df_count) <= 5: # fig.update_layout(bargap=0.9) @@ -447,7 +447,7 @@ def view_bad_data(button_container, selected_row): with button_container: if st.button( - ":green[Source Data →]", help="Review current source data for highlighted 
anomaly", use_container_width=True + ":green[Source Data →]", help="Review current source data for highlighted issue", use_container_width=True ): bad_data_modal.open() @@ -458,7 +458,7 @@ def view_bad_data(button_container, selected_row): fm.show_prompt(str_header) # Show the detail line - fm.render_html_list(selected_row, ["detail"], None, 700, ["Anomaly Detail"]) + fm.render_html_list(selected_row, ["detail"], None, 700, ["Hygiene Issue Detail"]) with st.spinner("Retrieving source data..."): bad_data_status, bad_data_msg, df_bad = get_bad_data(selected_row) @@ -482,9 +482,9 @@ def do_disposition_update(selected, str_new_status): str_result = None if selected: if len(selected) > 1: - str_which = f"of {len(selected)} anomalies to {str_new_status}" + str_which = f"of {len(selected)} issues to {str_new_status}" elif len(selected) == 1: - str_which = f"of one anomaly to {str_new_status}" + str_which = f"of one issue to {str_new_status}" str_schema = st.session_state["dbschema"] if not dq.update_anomaly_disposition(selected, str_schema, str_new_status): diff --git a/testgen/ui/views/profiling_details.py b/testgen/ui/views/profiling_details.py index 0fb8386..b75983c 100644 --- a/testgen/ui/views/profiling_details.py +++ b/testgen/ui/views/profiling_details.py @@ -59,7 +59,7 @@ def write_profile_screen(selected_row): selected_row["profile_run_id"], selected_row["table_name"], selected_row["column_name"] ) if not df_screen.empty: - with st.expander("**Profiling Anomalies**"): + with st.expander("**Hygiene Issues**"): # fm.render_markdown_table(df_screen, ["column_name", "anomaly_name", "detail"]) st.dataframe(df_screen, use_container_width=True, hide_index=True) diff --git a/testgen/ui/views/profiling_summary.py b/testgen/ui/views/profiling_summary.py index 987acc8..b21d825 100644 --- a/testgen/ui/views/profiling_summary.py +++ b/testgen/ui/views/profiling_summary.py @@ -124,14 +124,14 @@ def open_drill_downs(dct_selected_rows, button_slots): st.experimental_rerun() if 
button_slots[1].button( - f":{'gray' if not dct_selected_rows else 'green'}[Anomalies →]", + f":{'gray' if not dct_selected_rows else 'green'}[Hygiene →]", help="Review potential data problems identified in profiling", use_container_width=True, disabled=not dct_selected_rows, ): st.session_state["drill_profile_run"] = dct_selected_row["profiling_run_id"] st.session_state["drill_profile_tg"] = dct_selected_row["table_groups_id"] - session.current_page = "profiling/anomalies" + session.current_page = "profiling/hygiene" session.current_page_args = {} st.experimental_rerun() From ed6babeb664f62c4fb17efdf48e078c7ba036f56 Mon Sep 17 00:00:00 2001 From: "Chip.Bloche" Date: Tue, 16 Jul 2024 10:13:55 -0400 Subject: [PATCH 16/22] fix(dbupgrade): add missing index --- .../template/dbsetup/030_initialize_new_schema_structure.sql | 2 +- testgen/template/dbupgrade/0105_incremental_upgrade.sql | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/testgen/template/dbsetup/030_initialize_new_schema_structure.sql b/testgen/template/dbsetup/030_initialize_new_schema_structure.sql index 46cb0b3..cf7bd04 100644 --- a/testgen/template/dbsetup/030_initialize_new_schema_structure.sql +++ b/testgen/template/dbsetup/030_initialize_new_schema_structure.sql @@ -387,7 +387,7 @@ CREATE TABLE data_column_chars ( source_process VARCHAR(40), business_domain VARCHAR(40), stakeholder_group VARCHAR(40), - transformation_level VARCHAR(40), + transform_level VARCHAR(40), aggregation_level VARCHAR(40), add_date TIMESTAMP, last_mod_date TIMESTAMP, diff --git a/testgen/template/dbupgrade/0105_incremental_upgrade.sql b/testgen/template/dbupgrade/0105_incremental_upgrade.sql index 867407e..e83567d 100644 --- a/testgen/template/dbupgrade/0105_incremental_upgrade.sql +++ b/testgen/template/dbupgrade/0105_incremental_upgrade.sql @@ -25,6 +25,9 @@ INNER JOIN test_suites s AND d.test_suite = s.test_suite) WHERE test_definitions.id = d.id; +CREATE INDEX ix_td_ts_tc + ON 
test_definitions(test_suite_id, table_name, column_name, test_type); + ALTER TABLE table_groups ADD COLUMN data_source VARCHAR(40), ADD COLUMN source_system VARCHAR(40), From 6af9d35795e10201b1da8724c3b44c357d49c187 Mon Sep 17 00:00:00 2001 From: "Chip.Bloche" Date: Tue, 16 Jul 2024 10:20:26 -0400 Subject: [PATCH 17/22] fix(dbupgrade): renumbered upgrade script --- ...{0105_incremental_upgrade.sql => 0104_incremental_upgrade.sql} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename testgen/template/dbupgrade/{0105_incremental_upgrade.sql => 0104_incremental_upgrade.sql} (100%) diff --git a/testgen/template/dbupgrade/0105_incremental_upgrade.sql b/testgen/template/dbupgrade/0104_incremental_upgrade.sql similarity index 100% rename from testgen/template/dbupgrade/0105_incremental_upgrade.sql rename to testgen/template/dbupgrade/0104_incremental_upgrade.sql From fbadf8537e4224123db266727cb84ccc765c4feb Mon Sep 17 00:00:00 2001 From: Alex Fernandez Date: Wed, 17 Jul 2024 11:55:46 -0300 Subject: [PATCH 18/22] add logs to functional tests --- testgen/commands/run_upgrade_db_config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/testgen/commands/run_upgrade_db_config.py b/testgen/commands/run_upgrade_db_config.py index e07c714..24a4c0a 100644 --- a/testgen/commands/run_upgrade_db_config.py +++ b/testgen/commands/run_upgrade_db_config.py @@ -129,6 +129,7 @@ def run_upgrade_db_config() -> bool: # Retrieve and execute upgrade scripts, if any lstQueries, max_prefix = _get_upgrade_scripts(upgrade_dir, params_mapping, min_val=strNextPrefix) if len(lstQueries) > 0: + LOG.info(f"Uppdating db config qty of queries: {len(lstQueries)}. New prefix: {max_prefix}. Queries: {lstQueries}") has_been_upgraded = _execute_upgrade_scripts(params_mapping, lstQueries) else: has_been_upgraded = False From c8f07e9044f05a341df34f39701707b42ebf182a Mon Sep 17 00:00:00 2001 From: Alex Fernandez Date: Wed, 17 Jul 2024 12:36:49 -0300 Subject: [PATCH 19/22] add logs to functional tests. 
changed --- testgen/commands/run_upgrade_db_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testgen/commands/run_upgrade_db_config.py b/testgen/commands/run_upgrade_db_config.py index 24a4c0a..be82d7d 100644 --- a/testgen/commands/run_upgrade_db_config.py +++ b/testgen/commands/run_upgrade_db_config.py @@ -128,8 +128,8 @@ def run_upgrade_db_config() -> bool: # Retrieve and execute upgrade scripts, if any lstQueries, max_prefix = _get_upgrade_scripts(upgrade_dir, params_mapping, min_val=strNextPrefix) + LOG.info(f"Updating db config qty of queries: {len(lstQueries)}. New prefix: {max_prefix}. Queries: {lstQueries}") if len(lstQueries) > 0: - LOG.info(f"Uppdating db config qty of queries: {len(lstQueries)}. New prefix: {max_prefix}. Queries: {lstQueries}") has_been_upgraded = _execute_upgrade_scripts(params_mapping, lstQueries) else: has_been_upgraded = False From 5d3e22c2cde2cdc1d7f493d50416eb65a30c86e2 Mon Sep 17 00:00:00 2001 From: Alex Fernandez Date: Wed, 17 Jul 2024 13:28:54 -0300 Subject: [PATCH 20/22] misc: fix ci/cd --- testgen/template/dbupgrade/0105_incremental_upgrade.sql | 1 + 1 file changed, 1 insertion(+) create mode 100644 testgen/template/dbupgrade/0105_incremental_upgrade.sql diff --git a/testgen/template/dbupgrade/0105_incremental_upgrade.sql b/testgen/template/dbupgrade/0105_incremental_upgrade.sql new file mode 100644 index 0000000..027b7d6 --- /dev/null +++ b/testgen/template/dbupgrade/0105_incremental_upgrade.sql @@ -0,0 +1 @@ +SELECT 1; \ No newline at end of file From f8a621cacad1c2c551c92c88bcd305c3fddaf915 Mon Sep 17 00:00:00 2001 From: Alex Fernandez Date: Thu, 18 Jul 2024 13:23:46 -0300 Subject: [PATCH 21/22] #13: Support KeyPair Authentication for Snowflake --- Dockerfile | 2 + pyproject.toml | 1 + testgen/commands/run_execute_cat_tests.py | 3 + testgen/commands/run_execute_tests.py | 3 + testgen/commands/run_generate_tests.py | 3 + testgen/commands/run_get_entities.py | 7 +- 
testgen/commands/run_profiling_bridge.py | 3 + testgen/commands/run_quick_start.py | 3 + testgen/commands/run_setup_profiling_tools.py | 73 ++-- .../commands/run_test_parameter_validation.py | 3 + testgen/common/database/database_service.py | 36 +- .../common/database/flavor/flavor_service.py | 62 ++- .../database/flavor/mssql_flavor_service.py | 16 +- .../flavor/redshift_flavor_service.py | 25 +- .../flavor/snowflake_flavor_service.py | 50 ++- .../database/flavor/trino_flavor_service.py | 26 +- testgen/common/get_pipeline_parms.py | 3 - .../030_initialize_new_schema_structure.sql | 5 +- .../dbupgrade/0106_incremental_upgrade.sql | 5 + .../grant_execute_privileges_mssql.sql | 1 - .../create_functions_postgresql.sql | 21 - .../grant_execute_privileges_snowflake.sql | 4 +- .../template/get_entities/get_connection.sql | 5 +- testgen/template/parms/parms_profiling.sql | 3 + .../template/parms/parms_test_execution.sql | 3 + testgen/template/parms/parms_test_gen.sql | 3 + testgen/ui/queries/connection_queries.py | 63 ++- testgen/ui/services/connection_service.py | 79 +++- testgen/ui/services/database_service.py | 23 +- .../ui/services/test_definition_service.py | 3 + testgen/ui/views/connections.py | 230 +---------- testgen/ui/views/connections_base.py | 367 ++++++++++++++++++ testgen/ui/views/profiling_anomalies.py | 5 +- testgen/ui/views/test_results.py | 11 +- 34 files changed, 750 insertions(+), 400 deletions(-) create mode 100644 testgen/template/dbupgrade/0106_incremental_upgrade.sql create mode 100644 testgen/ui/views/connections_base.py diff --git a/Dockerfile b/Dockerfile index 1366f18..5aff6ba 100644 --- a/Dockerfile +++ b/Dockerfile @@ -33,6 +33,8 @@ RUN TG_METADATA_DB_USER=- TG_METADATA_DB_PASSWORD=- TG_METADATA_DB_HOST=- TG_MET ARG TESTGEN_VERSION ENV TESTGEN_VERSION=v$TESTGEN_VERSION +ENV STREAMLIT_SERVER_MAX_UPLOAD_SIZE=2 + WORKDIR /dk ENTRYPOINT ["testgen"] diff --git a/pyproject.toml b/pyproject.toml index fb278fc..2978f92 100644 --- a/pyproject.toml +++ 
b/pyproject.toml @@ -61,6 +61,7 @@ dependencies = [ "xlsxwriter==3.2.0", "psutil==5.9.8", "concurrent_log_handler==0.9.25", + "cryptography==42.0.8", ] [project.optional-dependencies] diff --git a/testgen/commands/run_execute_cat_tests.py b/testgen/commands/run_execute_cat_tests.py index 0b4b9e2..6dcd755 100644 --- a/testgen/commands/run_execute_cat_tests.py +++ b/testgen/commands/run_execute_cat_tests.py @@ -95,6 +95,9 @@ def run_cat_test_queries( dctParms["sql_flavor"], dctParms["url"], dctParms["connect_by_url"], + dctParms["connect_by_key"], + dctParms["private_key"], + dctParms["private_key_passphrase"], "PROJECT", ) diff --git a/testgen/commands/run_execute_tests.py b/testgen/commands/run_execute_tests.py index ff0e3cf..e25a697 100644 --- a/testgen/commands/run_execute_tests.py +++ b/testgen/commands/run_execute_tests.py @@ -44,6 +44,9 @@ def run_test_queries(strTestRunID, strTestTime, strProjectCode, strTestSuite, mi dctParms["sql_flavor"], dctParms["url"], dctParms["connect_by_url"], + dctParms["connect_by_key"], + dctParms["private_key"], + dctParms["private_key_passphrase"], "PROJECT", ) diff --git a/testgen/commands/run_generate_tests.py b/testgen/commands/run_generate_tests.py index 922ce46..fadc6ce 100644 --- a/testgen/commands/run_generate_tests.py +++ b/testgen/commands/run_generate_tests.py @@ -31,6 +31,9 @@ def run_test_gen_queries(strTableGroupsID, strTestSuite, strGenerationSet=None): dctParms["sql_flavor"], dctParms["url"], dctParms["connect_by_url"], + dctParms["connect_by_key"], + dctParms["private_key"], + dctParms["private_key_passphrase"], "PROJECT", ) diff --git a/testgen/commands/run_get_entities.py b/testgen/commands/run_get_entities.py index 1f76fa0..89f0d51 100644 --- a/testgen/commands/run_get_entities.py +++ b/testgen/commands/run_get_entities.py @@ -1,6 +1,7 @@ import logging from testgen.common import RetrieveDBResultsToList, read_template_sql_file +from testgen.common.encrypt import DecryptText LOG = logging.getLogger("testgen") 
@@ -32,7 +33,11 @@ def run_get_connection(connection_id): sql_template = read_template_sql_file("get_connection.sql", "get_entities") sql_template = sql_template.replace("{CONNECTION_ID}", str(connection_id)) rows, _ = RetrieveDBResultsToList("DKTG", sql_template) - return rows.pop() + connection = rows.pop()._asdict() + connection["password"] = DecryptText(connection["project_pw_encrypted"]) if connection["project_pw_encrypted"] else None + connection["private_key"] = DecryptText(connection["private_key"]) if connection["private_key"] else None + connection["private_key_passphrase"] = DecryptText(connection["private_key_passphrase"]) if connection["private_key_passphrase"] else "" + return connection def run_table_group_list(project_code): diff --git a/testgen/commands/run_profiling_bridge.py b/testgen/commands/run_profiling_bridge.py index 781891f..430eae6 100644 --- a/testgen/commands/run_profiling_bridge.py +++ b/testgen/commands/run_profiling_bridge.py @@ -261,6 +261,9 @@ def run_profiling_queries(strTableGroupsID, spinner=None): dctParms["sql_flavor"], dctParms["url"], dctParms["connect_by_url"], + dctParms["connect_by_key"], + dctParms["private_key"], + dctParms["private_key_passphrase"], "PROJECT", ) diff --git a/testgen/commands/run_quick_start.py b/testgen/commands/run_quick_start.py index 796302d..503ca34 100644 --- a/testgen/commands/run_quick_start.py +++ b/testgen/commands/run_quick_start.py @@ -80,6 +80,9 @@ def _prepare_connection_to_target_database(params_mapping): params_mapping["SQL_FLAVOR"], None, None, + False, + None, + None, "PROJECT", ) diff --git a/testgen/commands/run_setup_profiling_tools.py b/testgen/commands/run_setup_profiling_tools.py index 8f98d67..c2d42f3 100644 --- a/testgen/commands/run_setup_profiling_tools.py +++ b/testgen/commands/run_setup_profiling_tools.py @@ -7,13 +7,46 @@ LOG = logging.getLogger("testgen") -def _get_params_mapping(project_qc_schema: str, user: str) -> dict: +def _get_params_mapping(project_qc_schema: str, 
user: str, user_role: str | None) -> dict: return { "DATA_QC_SCHEMA": project_qc_schema, "DB_USER": user, + "DB_USER_ROLE": user_role, } +def get_setup_profiling_tools_queries(sql_flavor, create_qc_schema, skip_granting_privileges, project_qc_schema, user, user_role=None): + queries = [] + + params_mapping = _get_params_mapping(project_qc_schema, user, user_role) + + if create_qc_schema: + queries.extend( + get_queries_for_command( + f"flavors/{sql_flavor}/setup_profiling_tools", + params_mapping, + mask=rf"^.*create_qc_schema_{sql_flavor}.sql$", + ) + ) + + queries.extend( + get_queries_for_command( + f"flavors/{sql_flavor}/setup_profiling_tools", params_mapping, mask=rf"^.*functions_{sql_flavor}.sql$" + ) + ) + + if not skip_granting_privileges: + queries.extend( + get_queries_for_command( + f"flavors/{sql_flavor}/setup_profiling_tools", + params_mapping, + mask=rf"^.*grant_execute_privileges_{sql_flavor}.sql$", + ) + ) + + return queries + + def run_setup_profiling_tools( connection_id: str | int, dry_run: bool, @@ -21,12 +54,19 @@ def run_setup_profiling_tools( db_user: str | None = None, db_password: str | None = None, skip_granting_privileges: bool = False, + admin_private_key_passphrase: str | None = None, + admin_private_key: str | None = None, + user_role: str | None = None, ) -> str: connection = run_get_connection(str(connection_id)) # Set Project Connection Parms in common.db_bridgers from retrieved parms LOG.info("CurrentStep: Assigning Connection Parms") user = db_user or connection["project_user"] + connect_by_key = admin_private_key is not None or connection["connect_by_key"] + private_key_passphrase = admin_private_key_passphrase if admin_private_key is not None else connection["private_key_passphrase"] + private_key = admin_private_key if admin_private_key is not None else connection["private_key"] + AssignConnectParms( connection["project_key"], connection["connection_id"], @@ -38,38 +78,17 @@ def run_setup_profiling_tools( 
connection["sql_flavor"], connection["url"], connection["connect_by_url"], + connect_by_key, + private_key, + private_key_passphrase, "PROJECT", ) project_qc_schema = connection["project_qc_schema"] sql_flavor = connection["sql_flavor"] + user = connection["project_user"] - params_mapping = _get_params_mapping(project_qc_schema, connection["project_user"]) - queries = [] - - if create_qc_schema: - queries.extend( - get_queries_for_command( - f"flavors/{sql_flavor}/setup_profiling_tools", - params_mapping, - mask=rf"^.*create_qc_schema_{sql_flavor}.sql$", - ) - ) - - queries.extend( - get_queries_for_command( - f"flavors/{sql_flavor}/setup_profiling_tools", params_mapping, mask=rf"^.*functions_{sql_flavor}.sql$" - ) - ) - - if not skip_granting_privileges: - queries.extend( - get_queries_for_command( - f"flavors/{sql_flavor}/setup_profiling_tools", - params_mapping, - mask=rf"^.*grant_execute_privileges_{sql_flavor}.sql$", - ) - ) + queries = get_setup_profiling_tools_queries(sql_flavor, create_qc_schema, skip_granting_privileges, project_qc_schema, user, user_role) if not dry_run: RunActionQueryList("PROJECT", queries, user_override=db_user, pwd_override=db_password) diff --git a/testgen/commands/run_test_parameter_validation.py b/testgen/commands/run_test_parameter_validation.py index 6e0b524..357ee22 100644 --- a/testgen/commands/run_test_parameter_validation.py +++ b/testgen/commands/run_test_parameter_validation.py @@ -26,6 +26,9 @@ def run_parameter_validation_queries( dctParms["sql_flavor"], dctParms["url"], dctParms["connect_by_url"], + dctParms["connect_by_key"], + dctParms["private_key"], + dctParms["private_key_passphrase"], "PROJECT", ) diff --git a/testgen/common/database/database_service.py b/testgen/common/database/database_service.py index bbed234..d7949bc 100644 --- a/testgen/common/database/database_service.py +++ b/testgen/common/database/database_service.py @@ -38,6 +38,9 @@ class CConnectParms: sql_flavor = "" url = "" connect_by_url = "" + 
connect_by_key = "" + private_key = "" + private_key_passphrase = "" password = None def __init__(self, connectname): @@ -83,6 +86,9 @@ def AssignConnectParms( flavor, url, connect_by_url, + connect_by_key, + private_key, + private_key_passphrase, connectname="PROJECT", password=None, ): @@ -100,6 +106,9 @@ def AssignConnectParms( clsConnectParms.password = password clsConnectParms.url = url clsConnectParms.connect_by_url = connect_by_url + clsConnectParms.connect_by_key = connect_by_key + clsConnectParms.private_key = private_key + clsConnectParms.private_key_passphrase = private_key_passphrase def _RetrieveProjectPW(strProjectCode, strConnID): @@ -169,6 +178,9 @@ def _GetDBCredentials(strCredentialSet): "dbtype": clsConnectParms.sql_flavor, "url": clsConnectParms.url, "connect_by_url": clsConnectParms.connect_by_url, + "connect_by_key": clsConnectParms.connect_by_key, + "private_key": clsConnectParms.private_key, + "private_key_passphrase": clsConnectParms.private_key_passphrase, } elif strCredentialSet == "DKTG": # Get credentials from functions in my_dk_credentials.py @@ -204,9 +216,8 @@ def _InitDBConnection(strCredentialSet, strRaw="N", strAdmin="N", user_override= con = _InitDBConnection_appdb(dctCredentials, strCredentialSet, strRaw, strAdmin, user_override, pwd_override) else: flavor_service = get_flavor_service(dctCredentials["dbtype"]) - con = _InitDBConnection_target_db( - flavor_service, dctCredentials, strCredentialSet, strRaw, user_override, pwd_override - ) + flavor_service.init(dctCredentials) + con = _InitDBConnection_target_db(flavor_service, strCredentialSet, strRaw, user_override, pwd_override) return con @@ -277,9 +288,7 @@ def _InitDBConnection_appdb( return con -def _InitDBConnection_target_db( - flavor_service, dctCredentials, strCredentialSet, strRaw="N", user_override=None, pwd_override=None -): +def _InitDBConnection_target_db(flavor_service, strCredentialSet, strRaw="N", user_override=None, pwd_override=None): # Get DBEngine using 
credentials if strCredentialSet in dctDBEngines: # Retrieve existing engine from store @@ -287,18 +296,21 @@ def _InitDBConnection_target_db( else: # Handle user override if user_override is not None: - dctCredentials["user"] = user_override + flavor_service.override_user(user_override) # Handle password override if pwd_override is not None: strPW = pwd_override - else: + elif not flavor_service.is_connect_by_key(): strPW = _GetDBPassword(strCredentialSet) + else: + strPW = None # Open a new engine with appropriate connection parms - strConnect = flavor_service.get_connection_string(dctCredentials, strPW) + is_password_overwritten = pwd_override is not None + strConnect = flavor_service.get_connection_string(strPW, is_password_overwritten) connect_args = {"connect_timeout": 3600} - connect_args.update(flavor_service.get_connect_args()) + connect_args.update(flavor_service.get_connect_args(is_password_overwritten)) try: # Timeout in seconds: 1 hour = 60 * 60 second = 3600 @@ -306,10 +318,10 @@ def _InitDBConnection_target_db( dctDBEngines[strCredentialSet] = dbEngine except SQLAlchemyError as e: - raise ValueError(f"Failed to create engine for database {dctCredentials['dbname']}") from e + raise ValueError(f"Failed to create engine for database {flavor_service.get_db_name}") from e # Second, create a connection from our engine - queries = flavor_service.get_pre_connection_queries(dctCredentials) + queries = flavor_service.get_pre_connection_queries() if strRaw == "N": connection = dbEngine.connect() for query in queries: diff --git a/testgen/common/database/flavor/flavor_service.py b/testgen/common/database/flavor/flavor_service.py index d12dfed..1928808 100644 --- a/testgen/common/database/flavor/flavor_service.py +++ b/testgen/common/database/flavor/flavor_service.py @@ -1,10 +1,56 @@ from abc import abstractmethod from testgen import settings +from testgen.common.encrypt import DecryptText class FlavorService: - def get_connect_args(self): + + url = None + 
connect_by_url = None + username = None + host = None + port = None + dbname = None + flavor = None + dbschema = None + connect_by_key = None + private_key = None + private_key_passphrase = None + catalog = None + + def init(self, connection_params: dict): + self.url = connection_params.get("url", None) + self.connect_by_url = connection_params.get("connect_by_url", False) + self.username = connection_params.get("user") + self.host = connection_params.get("host") + self.port = connection_params.get("port") + self.dbname = connection_params.get("dbname") + self.flavor = connection_params.get("flavor") + self.dbschema = connection_params.get("dbschema", None) + self.connect_by_key = connection_params.get("connect_by_key", False) + self.catalog = connection_params.get("catalog", None) + + private_key = connection_params.get("private_key", None) + if isinstance(private_key, memoryview): + private_key = DecryptText(private_key) + self.private_key = private_key + + private_key_passphrase = connection_params.get("private_key_passphrase", None) + if isinstance(private_key_passphrase, memoryview): + private_key_passphrase = DecryptText(private_key_passphrase) + self.private_key_passphrase = private_key_passphrase + + def override_user(self, user_override: str): + self.username = user_override + + def get_db_name(self) -> str: + return self.dbname + + def is_connect_by_key(self) -> str: + return self.connect_by_key + + def get_connect_args(self, is_password_overwritten: bool = False): # NOQA ARG002 if settings.SKIP_DATABASE_CERTIFICATE_VERIFICATION: return {"TrustServerCertificate": "yes"} return {} @@ -12,18 +58,18 @@ def get_connect_args(self): def get_concat_operator(self): return "||" - def get_connection_string(self, dctCredentials, strPW): - if dctCredentials["connect_by_url"]: - header = self.get_connection_string_head(dctCredentials, strPW) - url = header + dctCredentials["url"] + def get_connection_string(self, strPW, is_password_overwritten: bool = False): + if 
self.connect_by_url: + header = self.get_connection_string_head(strPW) + url = header + self.url return url else: - return self.get_connection_string_from_fields(dctCredentials, strPW) + return self.get_connection_string_from_fields(strPW, is_password_overwritten) @abstractmethod - def get_connection_string_from_fields(self, dctCredentials, strPW): + def get_connection_string_from_fields(self, strPW, is_password_overwritten: bool = False): raise NotImplementedError("Subclasses must implement this method") @abstractmethod - def get_connection_string_head(self, dctCredentials, strPW): + def get_connection_string_head(self, strPW): raise NotImplementedError("Subclasses must implement this method") diff --git a/testgen/common/database/flavor/mssql_flavor_service.py b/testgen/common/database/flavor/mssql_flavor_service.py index b0b3b07..cfbb9c5 100644 --- a/testgen/common/database/flavor/mssql_flavor_service.py +++ b/testgen/common/database/flavor/mssql_flavor_service.py @@ -4,31 +4,27 @@ class MssqlFlavorService(FlavorService): - def get_connection_string_head(self, dctCredentials, strPW): - username = dctCredentials["user"] + def get_connection_string_head(self, strPW): + username = self.username password = quote_plus(strPW) strConnect = f"mssql+pyodbc://{username}:{password}@" return strConnect - def get_connection_string_from_fields(self, dctCredentials, strPW): - username = dctCredentials["user"] + def get_connection_string_from_fields(self, strPW, is_password_overwritten: bool = False): # NOQA ARG002 password = quote_plus(strPW) - hostname = dctCredentials["host"] - port = dctCredentials["port"] - dbname = dctCredentials["dbname"] strConnect = ( - f"mssql+pyodbc://{username}:{password}@{hostname}:{port}/{dbname}?driver=ODBC+Driver+18+for+SQL+Server" + f"mssql+pyodbc://{self.username}:{password}@{self.host}:{self.port}/{self.dbname}?driver=ODBC+Driver+18+for+SQL+Server" ) - if "synapse" in hostname: + if "synapse" in self.host: strConnect += "&autocommit=True" 
return strConnect - def get_pre_connection_queries(self, dctCredentials): # noqa ARG002 + def get_pre_connection_queries(self): # ARG002 return [ "SET ANSI_DEFAULTS ON;", "SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;", diff --git a/testgen/common/database/flavor/redshift_flavor_service.py b/testgen/common/database/flavor/redshift_flavor_service.py index dd1a9b2..e3ed1a2 100644 --- a/testgen/common/database/flavor/redshift_flavor_service.py +++ b/testgen/common/database/flavor/redshift_flavor_service.py @@ -4,30 +4,19 @@ class RedshiftFlavorService(FlavorService): - def get_connection_string_head(self, dctCredentials, strPW): - strConnect = "{}://{}:{}@".format( - dctCredentials["flavor"], - dctCredentials["user"], - quote_plus(strPW), - ) + def get_connection_string_head(self, strPW): + strConnect = f"{self.flavor}://{self.username}:{quote_plus(strPW)}@" return strConnect - def get_connection_string_from_fields(self, dctCredentials, strPW): + def get_connection_string_from_fields(self, strPW, is_password_overwritten: bool = False): # NOQA ARG002 # STANDARD FORMAT: strConnect = 'flavor://username:password@host:port/database' - strConnect = "{}://{}:{}@{}:{}/{}".format( - dctCredentials["flavor"], - dctCredentials["user"], - quote_plus(strPW), - dctCredentials["host"], - dctCredentials["port"], - dctCredentials["dbname"], - ) + strConnect = f"{self.flavor}://{self.username}:{quote_plus(strPW)}@{self.host}:{self.port}/{self.dbname}" return strConnect - def get_pre_connection_queries(self, dctCredentials): + def get_pre_connection_queries(self): return [ - "SET SEARCH_PATH = '" + dctCredentials["dbschema"] + "'", + "SET SEARCH_PATH = '" + self.dbschema + "'", ] - def get_connect_args(self): + def get_connect_args(self, is_password_overwritten: bool = False): # NOQA ARG002 return {} diff --git a/testgen/common/database/flavor/snowflake_flavor_service.py b/testgen/common/database/flavor/snowflake_flavor_service.py index 1232a0e..c6d213b 100644 --- 
a/testgen/common/database/flavor/snowflake_flavor_service.py +++ b/testgen/common/database/flavor/snowflake_flavor_service.py @@ -1,21 +1,48 @@ from urllib.parse import quote_plus +from cryptography.hazmat.backends import default_backend +from cryptography.hazmat.primitives import serialization + from testgen.common.database.flavor.flavor_service import FlavorService class SnowflakeFlavorService(FlavorService): - def get_connection_string_head(self, dctCredentials, strPW): - strConnect = "snowflake://{}:{}@".format(dctCredentials["user"], quote_plus(strPW)) + + def get_connect_args(self, is_password_overwritten: bool = False): + connect_args = super().get_connect_args(is_password_overwritten) + + if self.connect_by_key and not is_password_overwritten: + # https://docs.snowflake.com/en/developer-guide/python-connector/sqlalchemy#key-pair-authentication-support + private_key_passphrase = self.private_key_passphrase.encode() if self.private_key_passphrase else None + private_key = serialization.load_pem_private_key( + self.private_key.encode(), + password=private_key_passphrase, + backend=default_backend(), + ) + + private_key_bytes = private_key.private_bytes( + encoding=serialization.Encoding.DER, + format=serialization.PrivateFormat.PKCS8, + encryption_algorithm=serialization.NoEncryption(), + ) + + connect_args.update({"private_key": private_key_bytes}) + return connect_args + + def get_connection_string_head(self, strPW): + if self.connect_by_key and not strPW: + strConnect = f"snowflake://{self.username}@" + else: + strConnect = f"snowflake://{self.username}:{quote_plus(strPW)}@" return strConnect - def get_connection_string_from_fields(self, dctCredentials, strPW): + def get_connection_string_from_fields(self, strPW, is_password_overwritten: bool = False): # SNOWFLAKE FORMAT: strConnect = 'flavor://username:password@host/database' # optionally + '/[schema]' + '?warehouse=xxx' # NOTE: Snowflake host should NOT include ".snowflakecomputing.com" def 
get_raw_host_name(host): endings = [ - ".azure.snowflakecomputing.com", ".snowflakecomputing.com", ] for ending in endings: @@ -24,13 +51,18 @@ def get_raw_host_name(host): return host[0:i] return host - raw_host = get_raw_host_name(dctCredentials["host"]) - strConnect = "snowflake://{}:{}@{}/{}/{}".format( - dctCredentials["user"], quote_plus(strPW), raw_host, dctCredentials["dbname"], dctCredentials["dbschema"] - ) + raw_host = get_raw_host_name(self.host) + host = raw_host + if self.port != "443": + host += ":" + self.port + + if self.connect_by_key and not is_password_overwritten: + strConnect = f"snowflake://{self.username}@{host}/{self.dbname}/{self.dbschema}" + else: + strConnect = f"snowflake://{self.username}:{quote_plus(strPW)}@{host}/{self.dbname}/{self.dbschema}" return strConnect - def get_pre_connection_queries(self, dctCredentials): # noqa ARG002 + def get_pre_connection_queries(self): # ARG002 return [ "ALTER SESSION SET MULTI_STATEMENT_COUNT = 0;", "ALTER SESSION SET WEEK_START = 7;", diff --git a/testgen/common/database/flavor/trino_flavor_service.py b/testgen/common/database/flavor/trino_flavor_service.py index 6200e35..788fcae 100644 --- a/testgen/common/database/flavor/trino_flavor_service.py +++ b/testgen/common/database/flavor/trino_flavor_service.py @@ -4,30 +4,18 @@ class TrinoFlavorService(FlavorService): - def get_connection_string_head(self, dctCredentials, strPW): - strConnect = "{}://{}:{}@".format( - dctCredentials["flavor"], - dctCredentials["user"], - quote_plus(strPW), - ) + def get_connection_string_head(self, strPW): + strConnect = f"{self.flavor}://{self.username}:{quote_plus(strPW)}@" return strConnect - def get_connection_string_from_fields(self, dctCredentials, strPW): + def get_connection_string_from_fields(self, strPW, is_password_overwritten: bool = False): # NOQA ARG002 # STANDARD FORMAT: strConnect = 'flavor://username:password@host:port/catalog' - strConnect = "{}://{}:{}@{}:{}/{}".format( - dctCredentials["flavor"], - 
dctCredentials["user"], - quote_plus(strPW), - dctCredentials["host"], - dctCredentials["port"], - dctCredentials["catalog"], # "postgresql" - ) - return strConnect + return f"{self.flavor}://{self.username}:{quote_plus(strPW)}@{self.host}:{self.port}/{self.catalog}" - def get_pre_connection_queries(self, dctCredentials): + def get_pre_connection_queries(self): return [ - "USE " + dctCredentials["catalog"] + "." + dctCredentials["dbschema"], + "USE " + self.catalog + "." + self.dbschema, ] - def get_connect_args(self): + def get_connect_args(self, is_password_overwritten: bool = False): # NOQA ARG002 return {} diff --git a/testgen/common/get_pipeline_parms.py b/testgen/common/get_pipeline_parms.py index faef766..c3f81d6 100644 --- a/testgen/common/get_pipeline_parms.py +++ b/testgen/common/get_pipeline_parms.py @@ -16,10 +16,7 @@ def RetrieveProfilingParms(strTableGroupsID): lstParms[0]["project_code"] == "" or lstParms[0]["connection_id"] == "" or lstParms[0]["sql_flavor"] == "" - or lstParms[0]["project_host"] == "" - or lstParms[0]["project_port"] == "" or lstParms[0]["project_user"] == "" - or lstParms[0]["project_db"] == "" or lstParms[0]["profile_use_sampling"] == "" or lstParms[0]["profile_sample_percent"] == "" or lstParms[0]["profile_sample_min_count"] == "" diff --git a/testgen/template/dbsetup/030_initialize_new_schema_structure.sql b/testgen/template/dbsetup/030_initialize_new_schema_structure.sql index cf7bd04..405b57e 100644 --- a/testgen/template/dbsetup/030_initialize_new_schema_structure.sql +++ b/testgen/template/dbsetup/030_initialize_new_schema_structure.sql @@ -60,7 +60,10 @@ CREATE TABLE connections ( max_threads INTEGER DEFAULT 4, max_query_chars INTEGER, url VARCHAR(200) default '', - connect_by_url BOOLEAN default FALSE + connect_by_url BOOLEAN default FALSE, + connect_by_key BOOLEAN DEFAULT FALSE, + private_key BYTEA, + private_key_passphrase BYTEA ); CREATE TABLE table_groups diff --git 
a/testgen/template/dbupgrade/0106_incremental_upgrade.sql b/testgen/template/dbupgrade/0106_incremental_upgrade.sql new file mode 100644 index 0000000..1322a3d --- /dev/null +++ b/testgen/template/dbupgrade/0106_incremental_upgrade.sql @@ -0,0 +1,5 @@ +SET SEARCH_PATH TO {SCHEMA_NAME}; + +ALTER TABLE connections ADD COLUMN connect_by_key BOOLEAN DEFAULT FALSE; +ALTER TABLE connections ADD COLUMN private_key BYTEA; +ALTER TABLE connections ADD COLUMN private_key_passphrase BYTEA; diff --git a/testgen/template/flavors/mssql/setup_profiling_tools/grant_execute_privileges_mssql.sql b/testgen/template/flavors/mssql/setup_profiling_tools/grant_execute_privileges_mssql.sql index ed438fb..22b4576 100644 --- a/testgen/template/flavors/mssql/setup_profiling_tools/grant_execute_privileges_mssql.sql +++ b/testgen/template/flavors/mssql/setup_profiling_tools/grant_execute_privileges_mssql.sql @@ -1,2 +1 @@ --- GRANT CONTROL, REFERENCES ON SCHEMA::{DATA_QC_SCHEMA} TO {DB_USER}; GRANT EXECUTE ON SCHEMA::{DATA_QC_SCHEMA} TO {DB_USER}; diff --git a/testgen/template/flavors/postgresql/setup_profiling_tools/create_functions_postgresql.sql b/testgen/template/flavors/postgresql/setup_profiling_tools/create_functions_postgresql.sql index c78c430..cff460f 100644 --- a/testgen/template/flavors/postgresql/setup_profiling_tools/create_functions_postgresql.sql +++ b/testgen/template/flavors/postgresql/setup_profiling_tools/create_functions_postgresql.sql @@ -1,24 +1,3 @@ -/* -CREATE OR REPLACE FUNCTION {DATA_QC_SCHEMA}.DATEDIFF(difftype character varying, firstdate timestamp without time zone, seconddate timestamp without time zone) -RETURNS BIGINT AS $$ -BEGIN - RETURN CASE - WHEN UPPER(difftype) IN ('DAY', 'DD', 'D') THEN - DATE(seconddate) - DATE(firstdate) - WHEN UPPER(difftype) IN ('WEEK','WK', 'W') THEN - (DATE(seconddate) - DATE(firstdate)) / 7 - WHEN UPPER(difftype) IN ('MON', 'MONTH', 'MM') THEN - (DATE_PART('year', seconddate) - DATE_PART('year', firstdate)) * 12 + 
(DATE_PART('month', seconddate) - DATE_PART('month', firstdate)) - WHEN UPPER(difftype) IN ('QUARTER', 'QTR', 'Q') THEN - ((DATE_PART('year', seconddate) - DATE_PART('year', firstdate)) * 4) + (DATE_PART('quarter', seconddate) - DATE_PART('quarter', firstdate)) - WHEN UPPER(difftype) IN ('YEAR', 'YY', 'Y') THEN - DATE_PART('year', seconddate) - DATE_PART('year', firstdate) - ELSE - NULL::BIGINT - END; -END; -$$ LANGUAGE plpgsql IMMUTABLE STRICT; -*/ CREATE OR REPLACE FUNCTION {DATA_QC_SCHEMA}.DATEDIFF(difftype character varying, firstdate timestamp without time zone, seconddate timestamp without time zone) RETURNS BIGINT AS $$ SELECT diff --git a/testgen/template/flavors/snowflake/setup_profiling_tools/grant_execute_privileges_snowflake.sql b/testgen/template/flavors/snowflake/setup_profiling_tools/grant_execute_privileges_snowflake.sql index 4b1d0fe..2a60aa7 100644 --- a/testgen/template/flavors/snowflake/setup_profiling_tools/grant_execute_privileges_snowflake.sql +++ b/testgen/template/flavors/snowflake/setup_profiling_tools/grant_execute_privileges_snowflake.sql @@ -1,4 +1,6 @@ CREATE ROLE IF NOT EXISTS dk_qc_role; GRANT ALL PRIVILEGES ON SCHEMA {DATA_QC_SCHEMA} TO ROLE dk_qc_role; -GRANT ROLE dk_qc_role TO USER {DB_USER}; +GRANT USAGE ON FUNCTION {DATA_QC_SCHEMA}.fndk_isnum(VARCHAR) TO dk_qc_role; +GRANT USAGE ON FUNCTION {DATA_QC_SCHEMA}.fndk_isdate(VARCHAR) TO dk_qc_role; +GRANT ROLE dk_qc_role TO USER {DB_USER}; \ No newline at end of file diff --git a/testgen/template/get_entities/get_connection.sql b/testgen/template/get_entities/get_connection.sql index ebfe713..b24c7ba 100644 --- a/testgen/template/get_entities/get_connection.sql +++ b/testgen/template/get_entities/get_connection.sql @@ -13,6 +13,9 @@ SELECT max_query_chars, project_qc_schema, url, - connect_by_url + connect_by_url, + connect_by_key, + private_key, + private_key_passphrase FROM connections WHERE connection_id = {CONNECTION_ID}; diff --git a/testgen/template/parms/parms_profiling.sql 
b/testgen/template/parms/parms_profiling.sql index 6309ad4..eabb737 100644 --- a/testgen/template/parms/parms_profiling.sql +++ b/testgen/template/parms/parms_profiling.sql @@ -3,6 +3,9 @@ SELECT cc.project_code, cc.sql_flavor, cc.url, cc.connect_by_url, + cc.connect_by_key, + cc.private_key, + cc.private_key_passphrase, cc.project_host, cc.project_port, cc.project_user, diff --git a/testgen/template/parms/parms_test_execution.sql b/testgen/template/parms/parms_test_execution.sql index ef779b4..3091c32 100644 --- a/testgen/template/parms/parms_test_execution.sql +++ b/testgen/template/parms/parms_test_execution.sql @@ -2,6 +2,9 @@ SELECT g.project_code, g.connection_id::varchar(50), cc.sql_flavor, cc.project_host, cc.project_port, cc.project_user, cc.project_db, tg.table_group_schema, cc.project_qc_schema, + cc.connect_by_key, + cc.private_key, + cc.private_key_passphrase, cc.max_threads, cc.max_query_chars, cc.url, cc.connect_by_url FROM test_suites g INNER JOIN connections cc ON (g.connection_id = cc.connection_id) diff --git a/testgen/template/parms/parms_test_gen.sql b/testgen/template/parms/parms_test_gen.sql index ece395e..5328d6b 100644 --- a/testgen/template/parms/parms_test_gen.sql +++ b/testgen/template/parms/parms_test_gen.sql @@ -3,6 +3,9 @@ SELECT tg.project_code, tg.connection_id, cc.project_host, cc.project_port, cc.project_user, + cc.connect_by_key, + cc.private_key, + cc.private_key_passphrase, cc.project_db, tg.table_group_schema, s.export_to_observability, diff --git a/testgen/ui/queries/connection_queries.py b/testgen/ui/queries/connection_queries.py index b7424b9..dc10bed 100644 --- a/testgen/ui/queries/connection_queries.py +++ b/testgen/ui/queries/connection_queries.py @@ -10,7 +10,7 @@ def get_by_id(connection_id): SELECT id::VARCHAR(50), project_code, connection_id, connection_name, sql_flavor, project_host, project_port, project_user, project_qc_schema, project_db, project_pw_encrypted, NULL as password, - max_threads, max_query_chars, 
url, connect_by_url + max_threads, max_query_chars, url, connect_by_url, connect_by_key, private_key, private_key_passphrase FROM {str_schema}.connections WHERE connection_id = '{connection_id}' """ @@ -23,7 +23,8 @@ def get_connections(project_code): SELECT id::VARCHAR(50), project_code, connection_id, connection_name, sql_flavor, project_host, project_port, project_user, project_qc_schema, project_db, project_pw_encrypted, NULL as password, - max_threads, max_query_chars, connect_by_url, url + max_threads, max_query_chars, connect_by_url, url, connect_by_key, private_key, + private_key_passphrase FROM {str_schema}.connections WHERE project_code = '{project_code}' ORDER BY connection_id @@ -37,7 +38,7 @@ def get_table_group_names_by_connection(schema: str, connection_ids: list[str]) return db.retrieve_data(str_sql) -def edit_connection(schema, connection, encrypted_password): +def edit_connection(schema, connection, encrypted_password, encrypted_private_key, encrypted_private_key_passphrase): sql = f"""UPDATE {schema}.connections SET project_code = '{connection["project_code"]}', sql_flavor = '{connection["sql_flavor"]}', @@ -47,35 +48,65 @@ def edit_connection(schema, connection, encrypted_password): project_db = '{connection["project_db"]}', project_qc_schema = '{connection["project_qc_schema"]}', connection_name = '{connection["connection_name"]}', - project_pw_encrypted = '{encrypted_password}', max_threads = '{connection["max_threads"]}', max_query_chars = '{connection["max_query_chars"]}', url = '{connection["url"]}', - connect_by_url = '{connection["connect_by_url"]}' - WHERE - connection_id = '{connection["connection_id"]}';""" + connect_by_key = '{connection["connect_by_key"]}', + connect_by_url = '{connection["connect_by_url"]}'""" + + if encrypted_password: + sql += f""", project_pw_encrypted = '{encrypted_password}' """ + + if encrypted_private_key: + sql += f""", private_key = '{encrypted_private_key}' """ + + if encrypted_private_key_passphrase: + 
sql += f""", private_key_passphrase = '{encrypted_private_key_passphrase}' """ + + sql += f""" WHERE connection_id = '{connection["connection_id"]}';""" db.execute_sql(sql) st.cache_data.clear() -def add_connection(schema, connection, encrypted_password): - sql = f"""INSERT INTO {schema}.connections - (project_code, sql_flavor, url, connect_by_url, project_host, project_port, project_user, project_db, project_qc_schema, - connection_name, project_pw_encrypted, max_threads, max_query_chars) - SELECT +def add_connection(schema, connection, encrypted_password, encrypted_private_key, encrypted_private_key_passphrase): + + sql_header = f"""INSERT INTO {schema}.connections + (project_code, sql_flavor, url, connect_by_url, connect_by_key, + project_host, project_port, project_user, project_db, project_qc_schema, + connection_name,""" + + sql_footer = f""" SELECT '{connection["project_code"]}' as project_code, '{connection["sql_flavor"]}' as sql_flavor, '{connection["url"]}' as url, - '{connection["connect_by_url"]}' as connect_by_url, + {connection["connect_by_url"]} as connect_by_url, + {connection["connect_by_key"]} as connect_by_key, '{connection["project_host"]}' as project_host, '{connection["project_port"]}' as project_port, '{connection["project_user"]}' as project_user, '{connection["project_db"]}' as project_db, '{connection["project_qc_schema"]}' as project_qc_schema, - '{connection["connection_name"]}' as connection_name, - '{encrypted_password}' as project_pw_encrypted, - '{connection["max_threads"]}' as max_threads, + '{connection["connection_name"]}' as connection_name, """ + + if encrypted_password: + sql_header += "project_pw_encrypted, " + sql_footer += f""" '{encrypted_password}' as project_pw_encrypted, """ + + if encrypted_private_key: + sql_header += "private_key, " + sql_footer += f""" '{encrypted_private_key}' as private_key, """ + + if encrypted_private_key_passphrase: + sql_header += "private_key_passphrase, " + sql_footer += f""" 
'{encrypted_private_key_passphrase}' as private_key_passphrase, """ + + sql_header += """max_threads, max_query_chars) """ + + sql_footer += f""" '{connection["max_threads"]}' as max_threads, '{connection["max_query_chars"]}' as max_query_chars;""" + + sql = sql_header + sql_footer + db.execute_sql(sql) st.cache_data.clear() diff --git a/testgen/ui/services/connection_service.py b/testgen/ui/services/connection_service.py index ae59a5b..faad168 100644 --- a/testgen/ui/services/connection_service.py +++ b/testgen/ui/services/connection_service.py @@ -16,43 +16,72 @@ def get_by_id(connection_id, hide_passwords: bool = True): connections_df = connection_queries.get_by_id(connection_id) + decrypt_connections(connections_df, hide_passwords) connection = connections_df.to_dict(orient="records")[0] - - if hide_passwords: - connection["password"] = "***" # noqa S105 - else: - encrypted_password = connection["project_pw_encrypted"] - password = DecryptText(encrypted_password) - connection["password"] = password - return connection def get_connections(project_code, hide_passwords: bool = False): connections = connection_queries.get_connections(project_code) + decrypt_connections(connections, hide_passwords) + return connections + + +def decrypt_connections(connections, hide_passwords: bool = False): for index, connection in connections.iterrows(): if hide_passwords: password = "***" # noqa S105 + private_key = "***" # S105 + private_key_passphrase = "***" # noqa S105 else: - encrypted_password = connection["project_pw_encrypted"] - password = DecryptText(encrypted_password) - connection["password"] = password + password = DecryptText(connection["project_pw_encrypted"]) if connection["project_pw_encrypted"] else None + private_key = DecryptText(connection["private_key"]) if connection["private_key"] else None + private_key_passphrase = DecryptText(connection["private_key_passphrase"]) if connection["private_key_passphrase"] else "" connections.at[index, "password"] = password 
- return connections + connections.at[index, "private_key"] = private_key + connections.at[index, "private_key_passphrase"] = private_key_passphrase + + +def encrypt_credentials(connection): + encrypted_password = EncryptText(connection["password"]) if connection["password"] else None + encrypted_private_key = EncryptText(connection["private_key"]) if connection["private_key"] else None + encrypted_private_key_passphrase = EncryptText(connection["private_key_passphrase"]) if connection["private_key_passphrase"] else None + return encrypted_password, encrypted_private_key, encrypted_private_key_passphrase def edit_connection(connection): empty_cache() - encrypted_password = EncryptText(connection["password"]) schema = st.session_state["dbschema"] - connection_queries.edit_connection(schema, connection, encrypted_password) + connection = pre_save_connection_process(connection) + encrypted_password, encrypted_private_key, encrypted_private_key_passphrase = encrypt_credentials(connection) + connection_queries.edit_connection(schema, connection, encrypted_password, encrypted_private_key, encrypted_private_key_passphrase) def add_connection(connection): empty_cache() - encrypted_password = EncryptText(connection["password"]) schema = st.session_state["dbschema"] - connection_queries.add_connection(schema, connection, encrypted_password) + connection = pre_save_connection_process(connection) + encrypted_password, encrypted_private_key, encrypted_private_key_passphrase = encrypt_credentials(connection) + connection_queries.add_connection(schema, connection, encrypted_password, encrypted_private_key, encrypted_private_key_passphrase) + + +def pre_save_connection_process(connection): + if connection["connect_by_url"]: + url = connection["url"] + if url: + url_sections = url.split("/") + if len(url_sections) > 0: + host_port = url_sections[0] + host_port_sections = host_port.split(":") + if len(host_port_sections) > 0: + connection["project_host"] = host_port_sections[0] + 
connection["project_port"] = "".join(host_port_sections[1:]) + else: + connection["project_host"] = host_port + connection["project_port"] = "" + if len(url_sections) > 1: + connection["project_db"] = url_sections[1] + return connection def delete_connections(connection_ids): @@ -96,6 +125,9 @@ def init_profiling_sql(project_code, connection, table_group_schema=None): sql_flavor = connection["sql_flavor"] url = connection["url"] connect_by_url = connection["connect_by_url"] + connect_by_key = connection["connect_by_key"] + private_key = connection["private_key"] + private_key_passphrase = connection["private_key_passphrase"] project_host = connection["project_host"] project_port = connection["project_port"] project_db = connection["project_db"] @@ -117,6 +149,9 @@ def init_profiling_sql(project_code, connection, table_group_schema=None): sql_flavor, url, connect_by_url, + connect_by_key, + private_key, + private_key_passphrase, connectname="PROJECT", password=password, ) @@ -154,10 +189,10 @@ def test_qc_connection(project_code, connection, init_profiling=True): return qc_results -def create_qc_schema(connection_id, create_qc_schema, db_user, db_password, skip_granting_privileges): +def create_qc_schema(connection_id, create_qc_schema, db_user, db_password, skip_granting_privileges, admin_private_key_passphrase=None, admin_private_key=None, user_role=None): dry_run = False empty_cache() - run_setup_profiling_tools(connection_id, dry_run, create_qc_schema, db_user, db_password, skip_granting_privileges) + run_setup_profiling_tools(connection_id, dry_run, create_qc_schema, db_user, db_password, skip_granting_privileges, admin_private_key_passphrase, admin_private_key, user_role) def form_overwritten_connection_url(connection): @@ -170,12 +205,16 @@ def form_overwritten_connection_url(connection): "port": connection["project_port"], "dbname": connection["project_db"], "url": None, - "connect_by_url": False, + "connect_by_url": None, + "connect_by_key": 
connection["connect_by_key"], + "private_key": None, + "private_key_passphrase": "", "dbschema": "", } db_type = get_db_type(flavor) flavor_service = get_flavor_service(db_type) - connection_string = flavor_service.get_connection_string(connection_credentials, "") + flavor_service.init(connection_credentials) + connection_string = flavor_service.get_connection_string("") return connection_string diff --git a/testgen/ui/services/database_service.py b/testgen/ui/services/database_service.py index 1e7f33f..fd2fac9 100644 --- a/testgen/ui/services/database_service.py +++ b/testgen/ui/services/database_service.py @@ -244,9 +244,7 @@ def apply_df_edits(df_original, df_edited, str_table, lst_id_columns, no_update_ return booStatus -def _start_target_db_engine(flavor, host, port, db_name, user, password, url, connect_by_url): - flavor_service = get_flavor_service(flavor) - +def _start_target_db_engine(flavor, host, port, db_name, user, password, url, connect_by_url, connect_by_key, private_key, private_key_passphrase): connection_params = { "flavor": flavor if flavor != "redshift" else "postgresql", "user": user, @@ -255,25 +253,30 @@ def _start_target_db_engine(flavor, host, port, db_name, user, password, url, co "dbname": db_name, "url": url, "connect_by_url": connect_by_url, + "connect_by_key": connect_by_key, + "private_key": private_key, + "private_key_passphrase": private_key_passphrase, "dbschema": None, } - connection_string = flavor_service.get_connection_string(connection_params, password) + flavor_service = get_flavor_service(flavor) + flavor_service.init(connection_params) + connection_string = flavor_service.get_connection_string(password) connect_args = {"connect_timeout": 3600} connect_args.update(flavor_service.get_connect_args()) - return create_engine(connection_string, connect_args=connect_args) -def retrieve_target_db_data(flavor, host, port, db_name, user, password, url, connect_by_url, sql_query, decrypt=False): +def retrieve_target_db_data(flavor, 
host, port, db_name, user, password, url, connect_by_url, connect_by_key, private_key, private_key_passphrase, sql_query, decrypt=False): if decrypt: password = DecryptText(password) - db_engine = _start_target_db_engine(flavor, host, port, db_name, user, password, url, connect_by_url) + db_engine = _start_target_db_engine(flavor, host, port, db_name, user, password, url, connect_by_url, connect_by_key, private_key, private_key_passphrase) with db_engine.connect() as connection: query_result = connection.execute(text(sql_query)) return query_result.fetchall() -def retrieve_target_db_df(flavor, host, port, db_name, user, password, sql_query, url, connect_by_url): - password = DecryptText(password) - db_engine = _start_target_db_engine(flavor, host, port, db_name, user, password, url, connect_by_url) +def retrieve_target_db_df(flavor, host, port, db_name, user, password, sql_query, url, connect_by_url, connect_by_key, private_key, private_key_passphrase): + if password: + password = DecryptText(password) + db_engine = _start_target_db_engine(flavor, host, port, db_name, user, password, url, connect_by_url, connect_by_key, private_key, private_key_passphrase) return pd.read_sql_query(text(sql_query), db_engine) diff --git a/testgen/ui/services/test_definition_service.py b/testgen/ui/services/test_definition_service.py index f64cf81..28d19b0 100644 --- a/testgen/ui/services/test_definition_service.py +++ b/testgen/ui/services/test_definition_service.py @@ -110,5 +110,8 @@ def validate_test(test_definition): connection["password"], connection["url"], connection["connect_by_url"], + connection["connect_by_key"], + connection["private_key"], + connection["private_key_passphrase"], sql_query, ) diff --git a/testgen/ui/views/connections.py b/testgen/ui/views/connections.py index 1a029b3..b6a15c0 100644 --- a/testgen/ui/views/connections.py +++ b/testgen/ui/views/connections.py @@ -1,17 +1,18 @@ -import time +import logging import typing import streamlit as st -import 
testgen.ui.services.database_service as db import testgen.ui.services.form_service as fm import testgen.ui.services.toolbar_service as tb -from testgen.common.database.database_service import empty_cache from testgen.ui.components import widgets as testgen from testgen.ui.navigation.menu import MenuItem from testgen.ui.navigation.page import Page -from testgen.ui.services import authentication_service, connection_service +from testgen.ui.services import connection_service from testgen.ui.session import session +from testgen.ui.views.connections_base import show_connection, show_create_qc_schema_modal + +LOG = logging.getLogger("testgen") class ConnectionsPage(Page): @@ -39,7 +40,11 @@ def render(self) -> None: enable_table_groups = connection["project_host"] and connection["project_db"] and connection["project_qc_schema"] - show_connection_form(connection, project_code) + form_container = st.expander("", expanded=True) + with form_container: + connection_modal = None + mode = "edit" + show_connection(connection_modal, connection, mode, project_code, show_header=False) if tool_bar.long_slots[-1].button( f":{'gray' if not enable_table_groups else 'green'}[Table Groups →]", @@ -57,15 +62,7 @@ def render(self) -> None: _, col2 = st.columns([70, 30]) if col2.button( - "Test Connection", - help="Verifies that the connection to the database is working", - use_container_width=True, - ): - status_container = st.empty() - verify_connection_works(connection, project_code, status_container) - - if col2.button( - "Create QC Utility schema...", + "Configure QC Utility Schema", help="Creates the required Utility schema and related functions in the target database", use_container_width=True, ): @@ -73,208 +70,3 @@ def render(self) -> None: if create_qc_schema_modal.is_open(): show_create_qc_schema_modal(create_qc_schema_modal, connection) - - -def show_create_qc_schema_modal(modal, selected_connection): - with modal.container(): - fm.render_modal_header("Create QC Utility Schema", 
None) - with st.form("Create QC Utility Schema", clear_on_submit=False): - skip_schema_creation = st.toggle("Skip schema creation -- create utility functions in existing QC Schema") - skip_granting_privileges = st.toggle("Skip granting privileges") - db_user = st.text_input(label="Admin db user", max_chars=40, placeholder="Optional Field") - db_password = st.text_input( - label="Admin db password", max_chars=40, type="password", placeholder="Optional Field" - ) - - submit = st.form_submit_button("Create Schema") - - if submit: - empty_cache() - _, bottom_right_column = st.columns([0.20, 0.80]) - operation_status = bottom_right_column.empty() - - operation_status.empty() - connection_id = selected_connection["connection_id"] - project_qc_schema = selected_connection["project_qc_schema"] - operation_status.info(f"Creating QC utility schema '{project_qc_schema}'...") - - create_qc_schema = not skip_schema_creation - try: - connection_service.create_qc_schema( - connection_id, - create_qc_schema, - db_user if db_user else None, - db_password if db_password else None, - skip_granting_privileges, - ) - operation_status.empty() - operation_status.success("Operation has finished successfully.") - - except Exception as e: - operation_status.empty() - operation_status.error("Error creating QC Utility schema.") - error_message = e.args[0] - st.text_area("Error Details", value=error_message) - - -def show_connection_form(connection, project_code): - with st.form("edit-connection", clear_on_submit=False): - flavor_options = ["redshift", "snowflake", "mssql", "postgresql"] - - left_column, right_column = st.columns([0.75, 0.25]) - toggle_left_column, _ = st.columns([0.25, 0.75]) - bottom_left_column, bottom_right_column = st.columns([0.25, 0.75]) - button_left_column, _, _ = st.columns([0.20, 0.20, 0.60]) - - connection_id = connection["connection_id"] - connection_name = connection["connection_name"] - sql_flavor_index = flavor_options.index(connection["sql_flavor"]) - 
project_port = connection["project_port"] - project_host = connection["project_host"] - project_db = connection["project_db"] - project_user = connection["project_user"] - url = connection["url"] - project_qc_schema = connection["project_qc_schema"] - password = connection["password"] - max_threads = connection["max_threads"] - max_query_chars = connection["max_query_chars"] - connect_by_url = connection["connect_by_url"] - - new_connection = { - "connection_id": connection_id, - "project_code": project_code, - "connection_name": left_column.text_input( - label="Connection Name", - max_chars=40, - value=connection_name, - help="Your name for this connection. Can be any text.", - ), - "sql_flavor": right_column.selectbox( - label="SQL Flavor", - options=flavor_options, - index=sql_flavor_index, - help="The type of database server that you will connect to. This determines TestGen's drivers and SQL dialect.", - ), - "project_port": right_column.text_input(label="Port", max_chars=5, value=project_port), - "project_host": left_column.text_input(label="Host", max_chars=250, value=project_host), - "project_db": left_column.text_input( - label="Database", - max_chars=100, - value=project_db, - help="The name of the database defined on your host where your schemas and tables is present.", - ), - "project_user": left_column.text_input( - label="User", - max_chars=50, - value=project_user, - help="Username to connect to your database.", - ), - "password": left_column.text_input( - label="Password", - max_chars=50, - type="password", - value=password, - help="Password to connect to your database.", - ), - "project_qc_schema": right_column.text_input( - label="QC Utility Schema", - max_chars=50, - value=project_qc_schema, - help="The name of the schema on your database that will contain TestGen's profiling functions.", - ), - "max_threads": right_column.number_input( - label="Max Threads (Advanced Tuning)", - min_value=1, - max_value=8, - value=max_threads, - help="Maximum 
number of concurrent threads that run tests. Default values should be retained unless test queries are failing.", - ), - "max_query_chars": right_column.number_input( - label="Max Expression Length (Advanced Tuning)", - min_value=500, - max_value=14000, - value=max_query_chars, - help="Some tests are consolidated into queries for maximum performance. Default values should be retained unless test queries are failing.", - ), - } - - left_column.markdown("

 
", unsafe_allow_html=True) - - new_connection["connect_by_url"] = toggle_left_column.toggle( - "Connect by URL", - value=connect_by_url, - help="If this switch is set to on, the connection string will be driven by the field below. Only user name and password will be passed per the relevant fields above.", - ) - - connection_string = connection_service.form_overwritten_connection_url(new_connection) - connection_string_beginning, connection_string_end = connection_string.split("@", 1) - connection_string_header = connection_string_beginning + "@" - - if not url: - url = connection_string_end - - new_connection["url"] = bottom_right_column.text_input( - label="URL Suffix", - max_chars=200, - value=url, - help="Provide a connection string directly. This will override connection parameters if the 'Connect by URL' switch is set.", - ) - - bottom_left_column.text_input(label="URL Prefix", value=connection_string_header, disabled=True) - - submit = button_left_column.form_submit_button( - "Save Changes", - disabled=authentication_service.current_user_has_read_role(), - ) - if submit: - if not new_connection["password"]: - st.error("Enter a valid password.") - else: - connection_service.edit_connection(new_connection) - st.success("Changes have been saved successfully.") - time.sleep(1) - - -def verify_connection_works(connection, project_code, connection_status_container): - empty_cache() - connection_status_container.empty() - connection_status_container.info("Testing the connection...") - - try: - sql_query = "select 1;" - results = db.retrieve_target_db_data( - connection["sql_flavor"], - connection["project_host"], - connection["project_port"], - connection["project_db"], - connection["project_user"], - connection["password"], - connection["url"], - connection["connect_by_url"], - sql_query, - ) - if len(results) == 1 and results[0][0] == 1: - qc_error_message = "The connection was successful, but there is an issue with the QC Utility Schema" - try: - qc_results = 
connection_service.test_qc_connection(project_code, connection) - if not all(qc_results): - error_message = f"QC Utility schema confirmation failed. details: {qc_results}" - connection_status_container.empty() - connection_status_container.error(qc_error_message) - st.text_area("Connection Error Details", value=error_message) - else: - connection_status_container.empty() - connection_status_container.success("The connection was successful.") - except Exception as e: - connection_status_container.empty() - connection_status_container.error(qc_error_message) - error_message = e.args[0] - st.text_area("Connection Error Details", value=error_message) - else: - connection_status_container.empty() - connection_status_container.error("Error completing a query to the database server.") - except Exception as e: - connection_status_container.empty() - connection_status_container.error("Error attempting the Connection.") - error_message = e.args[0] - st.text_area("Connection Error Details", value=error_message) diff --git a/testgen/ui/views/connections_base.py b/testgen/ui/views/connections_base.py new file mode 100644 index 0000000..5cc9c6e --- /dev/null +++ b/testgen/ui/views/connections_base.py @@ -0,0 +1,367 @@ +import os +import time + +import streamlit as st + +import testgen.ui.services.database_service as db +import testgen.ui.services.form_service as fm +from testgen.commands.run_setup_profiling_tools import get_setup_profiling_tools_queries +from testgen.common.database.database_service import empty_cache +from testgen.ui.services import authentication_service, connection_service + + +def show_create_qc_schema_modal(modal, selected): + with modal.container(): + fm.render_modal_header("Configure QC Utility Schema", None) + selected_connection = selected + connection_id = selected_connection["connection_id"] + project_qc_schema = selected_connection["project_qc_schema"] + sql_flavor = selected_connection["sql_flavor"] + user = selected_connection["project_user"] + + 
create_qc_schema = st.toggle("Create QC Utility Schema", value=True) + grant_privileges = st.toggle("Grant access privileges to TestGen user", value=True) + + user_role = None + + # TODO ALEX: This textbox may be needed if we want to grant permissions to user role + # if sql_flavor == "snowflake": + # user_role_textbox_label = f"Primary role for database user {user}" + # user_role = st.text_input(label=user_role_textbox_label, max_chars=100) + + admin_credentials_expander = st.expander("Admin credential options", expanded=True) + with admin_credentials_expander: + admin_connection_option_index = 0 + admin_connection_options = ["Do not use admin credentials", "Use admin credentials with Password"] + if sql_flavor == "snowflake": + admin_connection_options.append("Use admin credentials with Key-Pair") + + admin_connection_option = st.radio( + "Admin credential options", + label_visibility="hidden", + options=admin_connection_options, + index=admin_connection_option_index, + horizontal=True, + ) + + st.markdown("

 
", unsafe_allow_html=True) + + db_user = None + db_password = None + admin_private_key_passphrase = None + admin_private_key = None + if admin_connection_option == admin_connection_options[0]: + st.markdown(":orange[User created in the connection dialog will be used.]") + else: + db_user = st.text_input(label="Admin db user", max_chars=40) + if admin_connection_option == admin_connection_options[1]: + db_password = st.text_input( + label="Admin db password", max_chars=40, type="password" + ) + st.markdown(":orange[Note: Admin credentials are not stored, are only used for this operation.]") + + if len(admin_connection_options) > 2 and admin_connection_option == admin_connection_options[2]: + admin_private_key_passphrase = st.text_input( + label="Private Key Passphrase", + key="create-qc-schema-private-key-password", + type="password", + max_chars=200, + help="Passphrase used while creating the private Key (leave empty if not applicable)", + ) + + admin_uploaded_file = st.file_uploader("Upload private key (rsa_key.p8)", key="admin-uploaded-file") + if admin_uploaded_file: + admin_private_key = admin_uploaded_file.getvalue().decode("utf-8") + + st.markdown(":orange[Note: Admin credentials are not stored, are only used for this operation.]") + + submit = st.button("Update Configuration") + + if submit: + empty_cache() + script_expander = st.expander("Script Details") + + operation_status = st.empty() + operation_status.info(f"Configuring QC Utility Schema '{project_qc_schema}'...") + + try: + skip_granting_privileges = not grant_privileges + queries = get_setup_profiling_tools_queries(sql_flavor, create_qc_schema, skip_granting_privileges, project_qc_schema, user, user_role) + with script_expander: + st.code( + os.linesep.join(queries), + language="sql", + line_numbers=True) + + connection_service.create_qc_schema( + connection_id, + create_qc_schema, + db_user if db_user else None, + db_password if db_password else None, + skip_granting_privileges, + 
admin_private_key_passphrase=admin_private_key_passphrase, + admin_private_key=admin_private_key, + user_role=user_role, + ) + operation_status.empty() + operation_status.success("Operation has finished successfully.") + + except Exception as e: + operation_status.empty() + operation_status.error("Error configuring QC Utility Schema.") + error_message = e.args[0] + st.text_area("Error Details", value=error_message) + + +def show_connection(connection_modal, selected_connection, mode, project_code, show_header=True): + if show_header: + fm.render_modal_header("Add Connection" if mode == "add" else "Edit Connection", None) + flavor_options = ["redshift", "snowflake", "mssql", "postgresql"] + connection_options = ["Connect by Password", "Connect by Key-Pair"] + + left_column, right_column = st.columns([0.75, 0.25]) + mid_column = st.columns(1)[0] + toggle_left_column, toggle_right_column = st.columns([0.25, 0.75]) + bottom_left_column, bottom_right_column = st.columns([0.25, 0.75]) + button_left_column, button_right_column, button_remaining_column = st.columns([0.20, 0.20, 0.60]) + + connection_id = selected_connection["connection_id"] if mode == "edit" else None + connection_name = selected_connection["connection_name"] if mode == "edit" else "" + sql_flavor_index = flavor_options.index(selected_connection["sql_flavor"]) if mode == "edit" else 0 + project_port = selected_connection["project_port"] if mode == "edit" else "" + project_host = selected_connection["project_host"] if mode == "edit" else "" + project_db = selected_connection["project_db"] if mode == "edit" else "" + project_user = selected_connection["project_user"] if mode == "edit" else "" + url = selected_connection["url"] if mode == "edit" else "" + project_qc_schema = selected_connection["project_qc_schema"] if mode == "edit" else "qc" + password = selected_connection["password"] if mode == "edit" else "" + max_threads = selected_connection["max_threads"] if mode == "edit" else 4 + max_query_chars = 
selected_connection["max_query_chars"] if mode == "edit" else 10000 + connect_by_url = selected_connection["connect_by_url"] if mode == "edit" else False + connect_by_key = selected_connection["connect_by_key"] if mode == "edit" else False + connection_option_index = 1 if connect_by_key else 0 + private_key = selected_connection["private_key"] if mode == "edit" else None + private_key_passphrase = selected_connection["private_key_passphrase"] if mode == "edit" else "" + + new_connection = { + "connection_id": connection_id, + "project_code": project_code, + "private_key": private_key, + "private_key_passphrase": private_key_passphrase, + "password": password, + "url": url, + "max_threads": right_column.number_input( + label="Max Threads (Advanced Tuning)", + min_value=1, + max_value=8, + value=max_threads, + help="Maximum number of concurrent threads that run tests. Default values should be retained unless test queries are failing.", + ), + "max_query_chars": right_column.number_input( + label="Max Expression Length (Advanced Tuning)", + min_value=500, + max_value=14000, + value=max_query_chars, + help="Some tests are consolidated into queries for maximum performance. Default values should be retained unless test queries are failing.", + ), + "connection_name": left_column.text_input( + label="Connection Name", + max_chars=40, + value=connection_name, + help="Your name for this connection. Can be any text.", + ), + "sql_flavor": left_column.selectbox( + label="SQL Flavor", + options=flavor_options, + index=sql_flavor_index, + help="The type of database server that you will connect to. 
This determines TestGen's drivers and SQL dialect.", + ) + } + + if "disable_url_widgets" not in st.session_state: + st.session_state.disable_url_widgets = connect_by_url + + new_connection["project_port"] = right_column.text_input(label="Port", max_chars=5, value=project_port, disabled=st.session_state.disable_url_widgets) + new_connection["project_host"] = left_column.text_input(label="Host", max_chars=250, value=project_host, disabled=st.session_state.disable_url_widgets) + new_connection["project_db"] = left_column.text_input( + label="Database", + max_chars=100, + value=project_db, + help="The name of the database defined on your host where your schemas and tables is present.", + disabled=st.session_state.disable_url_widgets, + ) + + new_connection["project_user"] = left_column.text_input( + label="User", + max_chars=50, + value=project_user, + help="Username to connect to your database.", + ) + + new_connection["project_qc_schema"] = right_column.text_input( + label="QC Utility Schema", + max_chars=50, + value=project_qc_schema, + help="The name of the schema on your database that will contain TestGen's profiling functions.", + ) + + if new_connection["sql_flavor"] == "snowflake": + mid_column.divider() + + connection_option = mid_column.radio( + "Connection options", + options=connection_options, + index=connection_option_index, + horizontal=True, + help="Connection strategy", + ) + + new_connection["connect_by_key"] = connection_option == "Connect by Key-Pair" + password_column = mid_column + else: + new_connection["connect_by_key"] = False + password_column = left_column + + uploaded_file = None + + if new_connection["connect_by_key"]: + new_connection["private_key_passphrase"] = mid_column.text_input( + label="Private Key Passphrase", + type="password", + max_chars=200, + value=private_key_passphrase, + help="Passphrase used while creating the private Key (leave empty if not applicable)", + ) + + uploaded_file = mid_column.file_uploader("Upload private 
key (rsa_key.p8)") + else: + new_connection["password"] = password_column.text_input( + label="Password", + max_chars=50, + type="password", + value=password, + help="Password to connect to your database.", + ) + + mid_column.divider() + + url_override_help_text = "If this switch is set to on, the connection string will be driven by the field below. " + if new_connection["connect_by_key"]: + url_override_help_text += "Only user name will be passed per the relevant fields above." + else: + url_override_help_text += "Only user name and password will be passed per the relevant fields above." + + def on_connect_by_url_change(): + value = st.session_state.connect_by_url_toggle + st.session_state.disable_url_widgets = value + + new_connection["connect_by_url"] = toggle_left_column.toggle( + "URL override", + value=connect_by_url, + key="connect_by_url_toggle", + help=url_override_help_text, + on_change=on_connect_by_url_change + ) + + if new_connection["connect_by_url"]: + connection_string = connection_service.form_overwritten_connection_url(new_connection) + connection_string_beginning, connection_string_end = connection_string.split("@", 1) + connection_string_header = connection_string_beginning + "@" + connection_string_header = connection_string_header.replace("%3E", ">") + connection_string_header = connection_string_header.replace("%3C", "<") + + if not url: + url = connection_string_end + + new_connection["url"] = bottom_right_column.text_input( + label="URL Suffix", + max_chars=200, + value=url, + help="Provide a connection string directly. This will override connection parameters if the 'Connect by URL' switch is set.", + ) + + bottom_left_column.text_input(label="URL Prefix", value=connection_string_header, disabled=True) + + bottom_left_column.markdown("

 
", unsafe_allow_html=True) + + submit_button_text = "Save" if mode == "edit" else "Add Connection" + submit = button_left_column.button( + submit_button_text, disabled=authentication_service.current_user_has_read_role() + ) + + if submit: + if not new_connection["password"] and not new_connection["connect_by_key"]: + st.error("Enter a valid password.") + else: + if uploaded_file: + new_connection["private_key"] = uploaded_file.getvalue().decode("utf-8") + + if mode == "edit": + connection_service.edit_connection(new_connection) + else: + connection_service.add_connection(new_connection) + success_message = ( + "Changes have been saved successfully. " + if mode == "edit" + else "New connection added successfully. " + ) + st.success(success_message) + time.sleep(1) + if connection_modal: + connection_modal.close() + st.experimental_rerun() + + test_left_column, test_mid_column, test_right_column = st.columns([0.15, 0.15, 0.70]) + test_connection = button_right_column.button("Test Connection") + + connection_status = test_right_column.empty() + + if test_connection: + if mode == "add" and new_connection["connect_by_key"]: + connection_status.empty() + connection_status.error( + "Please add the connection before testing it (so that we can get your private key file).") + else: + empty_cache() + connection_status.empty() + connection_status.info("Testing the connection...") + try: + sql_query = "select 1;" + results = db.retrieve_target_db_data( + new_connection["sql_flavor"], + new_connection["project_host"], + new_connection["project_port"], + new_connection["project_db"], + new_connection["project_user"], + new_connection["password"], + new_connection["url"], + new_connection["connect_by_url"], + new_connection["connect_by_key"], + new_connection["private_key"], + new_connection["private_key_passphrase"], + sql_query, + ) + if len(results) == 1 and results[0][0] == 1: + qc_error_message = "The connection was successful, but there is an issue with the QC Utility 
Schema" + try: + qc_results = connection_service.test_qc_connection(project_code, new_connection) + if not all(qc_results): + error_message = f"QC Utility Schema confirmation failed. details: {qc_results}" + connection_status.empty() + connection_status.error(qc_error_message) + st.text_area("Connection Error Details", value=error_message) + else: + connection_status.empty() + connection_status.success("The connection was successful.") + except Exception as e: + connection_status.empty() + connection_status.error(qc_error_message) + error_message = e.args[0] + st.text_area("Connection Error Details", value=error_message) + else: + test_right_column.error("Error completing a query to the database server.") + except Exception as e: + connection_status.empty() + connection_status.error("Error attempting the Connection.") + error_message = e.args[0] + st.text_area("Connection Error Details", value=error_message) diff --git a/testgen/ui/views/profiling_anomalies.py b/testgen/ui/views/profiling_anomalies.py index 7075c1e..d6a207a 100644 --- a/testgen/ui/views/profiling_anomalies.py +++ b/testgen/ui/views/profiling_anomalies.py @@ -287,7 +287,7 @@ def get_bad_data(selected_row): str_sql = f""" SELECT t.lookup_query, tg.table_group_schema, c.project_qc_schema, c.sql_flavor, c.project_host, c.project_port, c.project_db, c.project_user, c.project_pw_encrypted, - c.url, c.connect_by_url + c.url, c.connect_by_url, c.connect_by_key, c.private_key, c.private_key_passphrase FROM {str_schema}.target_data_lookups t INNER JOIN {str_schema}.table_groups tg ON ('{selected_row["table_groups_id"]}'::UUID = tg.id) @@ -350,6 +350,9 @@ def replace_parms(str_query): str_sql, lst_query[0]["url"], lst_query[0]["connect_by_url"], + lst_query[0]["connect_by_key"], + lst_query[0]["private_key"], + lst_query[0]["private_key_passphrase"], ) if df.empty: return "ND", "Data that violates Hygiene Issue criteria is not present in the current dataset.", None diff --git 
a/testgen/ui/views/test_results.py b/testgen/ui/views/test_results.py index 33b9b24..897cb97 100644 --- a/testgen/ui/views/test_results.py +++ b/testgen/ui/views/test_results.py @@ -353,7 +353,8 @@ def do_source_data_lookup_uncached(str_schema, selected_row, sql_only=False): str_sql = f""" SELECT t.lookup_query, tg.table_group_schema, c.project_qc_schema, c.sql_flavor, c.project_host, c.project_port, c.project_db, c.project_user, c.project_pw_encrypted, - c.url, c.connect_by_url + c.url, c.connect_by_url, + c.connect_by_key, c.private_key, c.private_key_passphrase FROM {str_schema}.target_data_lookups t INNER JOIN {str_schema}.table_groups tg ON ('{selected_row["table_groups_id"]}'::UUID = tg.id) @@ -434,6 +435,9 @@ def replace_parms(df_test, str_query): str_sql, lst_query[0]["url"], lst_query[0]["connect_by_url"], + lst_query[0]["connect_by_key"], + lst_query[0]["private_key"], + lst_query[0]["private_key_passphrase"], ) if df.empty: return "ND", "Data that violates Test criteria is not present in the current dataset.", None @@ -453,7 +457,7 @@ def do_source_data_lookup_custom(selected_row): str_sql = f""" SELECT d.custom_query as lookup_query, tg.table_group_schema, c.project_qc_schema, c.sql_flavor, c.project_host, c.project_port, c.project_db, c.project_user, c.project_pw_encrypted, - c.url, c.connect_by_url + c.url, c.connect_by_url, c.connect_by_key, c.private_key, c.private_key_passphrase FROM {str_schema}.test_definitions d INNER JOIN {str_schema}.table_groups tg ON ('{selected_row["table_groups_id"]}'::UUID = tg.id) @@ -480,6 +484,9 @@ def do_source_data_lookup_custom(selected_row): str_sql, lst_query[0]["url"], lst_query[0]["connect_by_url"], + lst_query[0]["connect_by_key"], + lst_query[0]["private_key"], + lst_query[0]["private_key_passphrase"], ) if df.empty: return "ND", "Data that violates Test criteria is not present in the current dataset.", None From 5b9931cb6226f3105effd85fe533e1bf921b3b28 Mon Sep 17 00:00:00 2001 From: "Chip.Bloche" Date: Sun, 
21 Jul 2024 17:29:40 -0400 Subject: [PATCH 22/22] feat(cli/ui): scan for personally identifying information (pii) Scan, display and source data lookup for PII as Hygiene Issue --- testgen/commands/queries/profiling_query.py | 5 + testgen/commands/run_profiling_bridge.py | 2 + .../030_initialize_new_schema_structure.sql | 1 + .../050_populate_new_schema_metadata.sql | 9 +- .../dbupgrade/0107_incremental_upgrade.sql | 3 + .../profiling/functional_datatype.sql | 16 +-- testgen/template/profiling/pii_flag.sql | 133 ++++++++++++++++++ testgen/ui/queries/profiling_queries.py | 3 +- testgen/ui/queries/test_definition_queries.py | 1 + testgen/ui/views/profiling_anomalies.py | 11 +- testgen/ui/views/profiling_details.py | 2 +- testgen/ui/views/profiling_results.py | 6 +- testgen/ui/views/table_groups.py | 2 +- testgen/ui/views/test_definitions.py | 9 +- testgen/ui/views/test_results.py | 12 +- 15 files changed, 187 insertions(+), 28 deletions(-) create mode 100644 testgen/template/dbupgrade/0107_incremental_upgrade.sql create mode 100644 testgen/template/profiling/pii_flag.sql diff --git a/testgen/commands/queries/profiling_query.py b/testgen/commands/queries/profiling_query.py index b937195..db5ff1e 100644 --- a/testgen/commands/queries/profiling_query.py +++ b/testgen/commands/queries/profiling_query.py @@ -136,6 +136,11 @@ def GetFunctionalTableTypeUpdateQuery(self): strQ = self.ReplaceParms(read_template_sql_file("functional_tabletype_update.sql", sub_directory="profiling")) return strQ + def GetPIIFlagUpdateQuery(self): + # Runs on DK Postgres Server + strQ = self.ReplaceParms(read_template_sql_file("pii_flag.sql", sub_directory="profiling")) + return strQ + def GetAnomalyRefreshQuery(self): # Runs on DK Postgres Server strQ = self.ReplaceParms(read_template_sql_file("refresh_anomalies.sql", sub_directory="profiling")) diff --git a/testgen/commands/run_profiling_bridge.py b/testgen/commands/run_profiling_bridge.py index 430eae6..c141c76 100644 --- 
a/testgen/commands/run_profiling_bridge.py +++ b/testgen/commands/run_profiling_bridge.py @@ -449,6 +449,8 @@ def run_profiling_queries(strTableGroupsID, spinner=None): lstQueries.append(strQuery) strQuery = clsProfiling.GetFunctionalTableTypeUpdateQuery() lstQueries.append(strQuery) + strQuery = clsProfiling.GetPIIFlagUpdateQuery() + lstQueries.append(strQuery) lstQueries.extend(CompileAnomalyTestQueries(clsProfiling)) strQuery = clsProfiling.GetAnomalyRefreshQuery() lstQueries.append(strQuery) diff --git a/testgen/template/dbsetup/030_initialize_new_schema_structure.sql b/testgen/template/dbsetup/030_initialize_new_schema_structure.sql index 405b57e..ac31aa3 100644 --- a/testgen/template/dbsetup/030_initialize_new_schema_structure.sql +++ b/testgen/template/dbsetup/030_initialize_new_schema_structure.sql @@ -263,6 +263,7 @@ CREATE TABLE profile_results ( embedded_space_ct BIGINT, avg_embedded_spaces DOUBLE PRECISION, std_pattern_match VARCHAR(30), + pii_flag VARCHAR(50), functional_data_type VARCHAR(50), functional_table_type VARCHAR(50), sample_ratio FLOAT diff --git a/testgen/template/dbsetup/050_populate_new_schema_metadata.sql b/testgen/template/dbsetup/050_populate_new_schema_metadata.sql index 0b6869e..a9643cf 100644 --- a/testgen/template/dbsetup/050_populate_new_schema_metadata.sql +++ b/testgen/template/dbsetup/050_populate_new_schema_metadata.sql @@ -94,7 +94,8 @@ n controls over data ingested and to make values more efficient, consistent and 'p.includes_digit_ct::FLOAT/NULLIF(p.value_ct, 0)::FLOAT > 0.5 AND TRIM(fn_parsefreq(p.top_freq_values, 1, 1)) ~ ''(?i)^[0-9]+(\.[0-9]+)? ?(%|lb|oz|kg|g|mg|km|m|cm|mm|mi|ft|in)$''', '''Top Freq: '' || p.top_freq_values', 'Possible', 'Review your source data and ingestion process. 
Consider whether it might be better to parse the numeric and unit data and store in separate columns.'), - ('1027', 'Variant_Coded_Values', 'Variant', 'Variant Codings for Same Values', 'This column contains more than one common variants that represent a single value or state. This can occur when data is integrated from multiple sources with different standards, or when free entry is permitted without validation. The variations can cause confusion and error for downstream data users and multiple versions of the truth. ', 'p.distinct_value_ct <= 20', '''Variants Found: '' || intersect_list', 'Definite', 'Review your source data and ingestion process. Consider cleansing this data to standardize on a single set of definitive codes.'); + ('1027', 'Variant_Coded_Values', 'Variant', 'Variant Codings for Same Values', 'This column contains more than one common variants that represent a single value or state. This can occur when data is integrated from multiple sources with different standards, or when free entry is permitted without validation. The variations can cause confusion and error for downstream data users and multiple versions of the truth. ', 'p.distinct_value_ct <= 20', '''Variants Found: '' || intersect_list', 'Definite', 'Review your source data and ingestion process. Consider cleansing this data to standardize on a single set of definitive codes.'), + ('1100', 'Potential_PII', 'Column', 'Personally Identifiable Information', 'This column contains data that could be Personally Identifiable Information (PII)', 'p.pii_flag > ''''', '''Risk: '' || CASE LEFT(p.pii_flag, 1) WHEN ''A'' THEN ''HIGH'' WHEN ''B'' THEN ''MODERATE'' WHEN ''C'' THEN ''LOW'' END || '', PII Type: '' || SUBSTRING(p.pii_flag, 3)', 'Potential PII', 'PII may require steps to ensure data security and compliance with relevant privacy regulations and legal requirements. 
You may have to classify and inventory PII, implement appropriate access controls, encrypt data, and monitor for unauthorized access. Your organization might be required to update privacy policies and train staff on data protection practices. Note that PII that is lower-risk in isolation might be high-risk in conjunction with other data.'); TRUNCATE TABLE test_types; @@ -1191,7 +1192,11 @@ SELECT ''Latest Timeframe'' as missing_from, {COLUMN_NAME} FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE {SUBSET_CONDITION} AND {WINDOW_DATE_COLUMN} >= (SELECT MAX({WINDOW_DATE_COLUMN}) FROM {TARGET_SCHEMA}.{TABLE_NAME}) - {WINDOW_DAYS} -)') +)'), + ('1269', '1100', 'Profile Anomaly', 'Potential_PII', 'redshift', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;'), + ('1270', '1100', 'Profile Anomaly', 'Potential_PII', 'snowflake', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;'), + ('1271', '1100', 'Profile Anomaly', 'Potential_PII', 'mssql', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;'), + ('1272', '1100', 'Profile Anomaly', 'Potential_PII', 'postgresql', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;') ; diff --git a/testgen/template/dbupgrade/0107_incremental_upgrade.sql b/testgen/template/dbupgrade/0107_incremental_upgrade.sql new file mode 100644 index 0000000..6ee1056 --- /dev/null +++ b/testgen/template/dbupgrade/0107_incremental_upgrade.sql @@ -0,0 +1,3 @@ +SET SEARCH_PATH TO {SCHEMA_NAME}; + +ALTER TABLE profile_results ADD COLUMN pii_flag VARCHAR(50); diff --git a/testgen/template/profiling/functional_datatype.sql b/testgen/template/profiling/functional_datatype.sql index 
2389721..a74cfb4 100644 --- a/testgen/template/profiling/functional_datatype.sql +++ b/testgen/template/profiling/functional_datatype.sql @@ -263,7 +263,7 @@ INNER JOIN profile_results s AND c.position + 1 = s.position AND 'State' = s.functional_data_type) WHERE c.profile_run_id = '{PROFILE_RUN_ID}' - AND c.column_name SIMILAR TO '%c(|i)ty%' + AND LOWER(c.column_name) SIMILAR TO '%c(|i)ty%' AND c.functional_data_type NOT IN ('State', 'Zip') AND profile_results.id = c.id; @@ -283,19 +283,19 @@ UPDATE profile_results WHERE profile_run_id = '{PROFILE_RUN_ID}' AND avg_length <= 8 AND avg_embedded_spaces < 0.2 - AND (column_name SIMILAR TO '%f(|i)rst(_| |)n(|a)m%%' - OR column_name SIMILAR TO '%(middle|mdl)(_| |)n(|a)m%%' - OR column_name SIMILAR TO '%nick(_| |)n(|a)m%%'); + AND (LOWER(column_name) SIMILAR TO '%f(|i)rst(_| |)n(|a)m%%' + OR LOWER(column_name) SIMILAR TO '%(middle|mdl)(_| |)n(|a)m%%' + OR LOWER(column_name) SIMILAR TO '%nick(_| |)n(|a)m%%'); -- Assign Last Name UPDATE profile_results SET functional_data_type = 'Person Last Name' WHERE profile_run_id = '{PROFILE_RUN_ID}' - AND column_name SIMILAR TO '%l(|a)st(_| |)n(|a)m%' AND avg_length BETWEEN 5 and 8 AND avg_embedded_spaces < 0.2 - AND (column_name SIMILAR TO '%l(|a)st(_| |)n(|a)m%' - OR column_name SIMILAR TO '%sur(_| |)n(|a)m%'); + AND (LOWER(column_name) SIMILAR TO '%l(|a)st(_| |)n(|a)m%' + OR LOWER(column_name) SIMILAR TO '%maiden(_| |)n(|a)m%' + OR LOWER(column_name) SIMILAR TO '%sur(_| |)n(|a)m%'); UPDATE profile_results SET functional_data_type = 'Entity Name' @@ -413,7 +413,7 @@ SET functional_data_type = WHEN (max_value - min_value + 1 = distinct_value_ct) AND (fractional_sum IS NULL OR fractional_sum > 0) THEN 'Sequence' WHEN general_type='N' - AND column_name SIMILAR TO '%(no|num|number|nbr)' + AND LOWER(column_name) SIMILAR TO '%(no|num|number|nbr)' AND (column_type ILIKE '%int%' OR (RTRIM(SPLIT_PART(column_type, ',', 2), ')') > '0' diff --git a/testgen/template/profiling/pii_flag.sql 
b/testgen/template/profiling/pii_flag.sql new file mode 100644 index 0000000..587d187 --- /dev/null +++ b/testgen/template/profiling/pii_flag.sql @@ -0,0 +1,133 @@ +-- Primary Screen: Alpha +WITH screen + AS ( SELECT id AS profile_results_id, + table_name, column_name, + CASE + WHEN functional_data_type IN ('Person Full Name', 'Person Given Name', 'Person Last Name') THEN 'B/NAME/Individual' + + WHEN LOWER(column_name) SIMILAR TO '%(maiden|surname)%' THEN 'B/NAME/Individual' + + WHEN functional_data_type = 'Historical Date' + AND LOWER(column_name) SIMILAR TO '%(dob|birth)%' THEN 'B/DEMO/Birthdate' + + WHEN LOWER(column_name) + SIMILAR TO '%(nationality|race|ethnicity|gender|sex|marital)%' THEN 'B/DEMO/Demographic' + + WHEN LOWER(column_name) ILIKE '%med%record%' THEN 'A/DEMO/Medical' + + WHEN LOWER(column_name) SIMILAR TO '%(password|pwd|auth)%' THEN 'A/ID/Security' + + WHEN max_length < 10 + AND avg_embedded_spaces < 0.1 + AND (column_name ILIKE 'pin%' OR column_name ILIKE '%pin') THEN 'A/ID/Security' + + WHEN std_pattern_match = 'SSN' + AND LOWER(column_name) SIMILAR TO '%(ss|soc|sec)%' THEN 'A/ID/SSN' + + WHEN TRIM(fn_parsefreq(top_patterns, 1, 2)) + IN ('NNNNNNNNN', 'NNN-NN-NNNN', 'NNN NN NNNN') + AND LEFT(min_text, 1) = '9' + AND avg_length BETWEEN 8.8 AND 11.2 + AND LOWER(column_name) SIMILAR TO '%(tax|tin|fed)%' THEN 'A/ID/Tax' + + WHEN TRIM(fn_parsefreq(top_patterns, 1, 2)) + IN ('NNNNNNNNN', 'ANNNNNNNN') + AND avg_length BETWEEN 8.8 AND 9.2 + AND LOWER(column_name) SIMILAR TO '%(passp|pp)%' THEN 'A/ID/Passport' + + WHEN std_pattern_match = 'CREDIT_CARD' + AND LOWER(column_name) SIMILAR TO '%(credit|card|cc|acct|account)%' THEN 'A/ID/Credit' + + WHEN TRIM(fn_parsefreq(top_patterns, 1, 2)) + ILIKE '[Aa]{6}[A-Za-z0-9]{2}N{0,3}' + AND TRIM(fn_parsefreq(top_patterns, 2, 2)) + ILIKE '[Aa]{6}[A-Za-z0-9]{2}N{0,3}' + AND avg_length BETWEEN 7.8 AND 11.2 + AND LOWER(column_name) SIMILAR TO '%(swift|bic)%' THEN 'A/ID/Bank' + + WHEN max_length <= 34 + AND 
UPPER(LEFT(TRIM(fn_parsefreq(top_patterns, 1, 2)), 2)) + = 'AA' + AND (column_name ILIKE 'iban%' OR column_name ILIKE '%iban') THEN 'A/ID/Bank' + + WHEN avg_length BETWEEN 5 AND 20 + AND LOWER(column_name) SIMILAR TO '%(bank|checking|saving|debit)%' THEN 'A/ID/Bank' + + WHEN avg_embedded_spaces < 0.5 + AND avg_length < 20 + AND (LOWER(column_name) SIMILAR TO '%(dr|op)%lic%' + OR LOWER(column_name) SIMILAR TO '%(driver|license|operator)%') THEN 'A/ID/License' + + WHEN LOWER(column_name) IN ('patient_id', 'pat_id') THEN 'A/ID/Medical' + + WHEN LOWER(column_name) IN ('member_id') THEN 'B/ID/Commercial' + + END AS pii_flag + + FROM profile_results p + WHERE profile_run_id = '{PROFILE_RUN_ID}' + AND general_type = 'A' ) +UPDATE profile_results + SET pii_flag = screen.pii_flag + FROM screen + WHERE screen.pii_flag > '' + AND profile_results.id = screen.profile_results_id; + +-- Secondary Screen - Alpha + WITH table_pii_counts + AS ( SELECT table_name, COUNT(pii_flag) AS pii_ct + FROM profile_results + WHERE profile_run_id = '{PROFILE_RUN_ID}' + GROUP BY table_name ), + screen + AS ( SELECT id AS profile_results_id, + p.table_name, p.column_name, + CASE + WHEN functional_data_type = 'Email' THEN 'B/CONTACT/Email' + WHEN functional_data_type IN ('Address', 'City', 'State', 'Zip') + THEN 'B/CONTACT/Address' + WHEN functional_data_type = 'Phone' + THEN 'B/CONTACT/Phone' + + WHEN LOWER(column_name) SIMILAR TO '%(insur|health|med|patient)%' + THEN 'A/DEMO/Medical' + + WHEN LOWER(column_name) SIMILAR TO '%(vehicle|vin|auto|car)%' + AND avg_length BETWEEN 16 AND 18 + AND max_length < 20 + AND TRIM(fn_parsefreq(top_patterns, 1, 2)) + = 'AAANAAAAANNNNNNNN' THEN 'B/ID/Auto' + + WHEN LOWER(column_name) SIMILAR TO + '%(voice|fingerprint|retina|auth|biometric|iris|face_recog)%' + THEN 'A/ID/Security' + + WHEN LOWER(column_name) = 'dna' + OR LOWER(column_name) ILIKE '%\_dna' + OR LOWER(column_name) ILIKE 'dna\_%' + THEN 'A/DEMO/Demographic' + + WHEN column_name ILIKE '%rout%' + AND 
avg_length BETWEEN 8.8 AND 11.2 + AND TRIM(fn_parsefreq(top_patterns, 1, 2)) + IN ('NNNNNNNNN', 'NNNN-NNNN-N') THEN 'C/ID/Bank' + + WHEN LOWER(column_name) SIMILAR TO '%(salary|income|wage)%' + THEN 'B/DEMO/Financial' + + WHEN LOWER(column_name) SIMILAR TO '%(user_id|userid)%' + THEN 'C/ID/Security' + + END AS pii_flag + FROM profile_results p + INNER JOIN table_pii_counts t + ON (p.table_name = t.table_name) + WHERE p.profile_run_id = '{PROFILE_RUN_ID}' + AND p.general_type = 'A' + AND p.pii_flag IS NULL + AND t.pii_ct > 1 ) +UPDATE profile_results + SET pii_flag = screen.pii_flag + FROM screen + WHERE screen.pii_flag > '' + AND profile_results.id = screen.profile_results_id; diff --git a/testgen/ui/queries/profiling_queries.py b/testgen/ui/queries/profiling_queries.py index 06b149e..d33a7e0 100644 --- a/testgen/ui/queries/profiling_queries.py +++ b/testgen/ui/queries/profiling_queries.py @@ -98,7 +98,8 @@ def get_profiling_detail(str_profile_run_id, str_table_name, str_column_name): WHEN 'B' THEN 'Boolean' ELSE 'N/A' END as general_type, - functional_table_type, functional_data_type, + functional_table_type as semantic_table_type, + functional_data_type as semantic_data_type, datatype_suggestion, CASE WHEN s.column_name IS NOT NULL THEN 'Yes' END as anomalies, -- Shared counts diff --git a/testgen/ui/queries/test_definition_queries.py b/testgen/ui/queries/test_definition_queries.py index 6e27e11..3073a90 100644 --- a/testgen/ui/queries/test_definition_queries.py +++ b/testgen/ui/queries/test_definition_queries.py @@ -35,6 +35,7 @@ def get_test_definitions(schema, project_code, test_suite, table_name, column_na CASE WHEN d.test_active = 'Y' THEN 'Yes' ELSE 'No' END as test_active_display, d.lock_refresh, CASE WHEN d.lock_refresh = 'Y' THEN 'Yes' ELSE 'No' END as lock_refresh_display, + t.test_scope, d.test_description, d.profiling_as_of_date, d.last_manual_update, diff --git a/testgen/ui/views/profiling_anomalies.py b/testgen/ui/views/profiling_anomalies.py index 
d6a207a..cf42b01 100644 --- a/testgen/ui/views/profiling_anomalies.py +++ b/testgen/ui/views/profiling_anomalies.py @@ -57,8 +57,8 @@ def render(self) -> None: with tool_bar.long_slots[1]: # Likelihood selection - optional filter - lst_status_options = ["All Likelihoods", "Definite", "Likely", "Possible"] - str_likelihood = st.selectbox("Issue Likelihood", lst_status_options) + lst_status_options = ["All Likelihoods", "Definite", "Likely", "Possible", "Potential PII"] + str_likelihood = st.selectbox("Issue Class", lst_status_options) with tool_bar.short_slots[0]: str_help = "Toggle on to perform actions on multiple Hygiene Issues" @@ -206,7 +206,10 @@ def get_db_table_group_choices(str_project_code): @st.cache_data(show_spinner="Retrieving Data") def get_profiling_anomalies(str_profile_run_id, str_likelihood): str_schema = st.session_state["dbschema"] - str_criteria = f" AND t.issue_likelihood = '{str_likelihood}'" if str_likelihood != "All Likelihoods" else "" + if str_likelihood == "All Likelihoods": + str_criteria = " AND t.issue_likelihood <> 'Potential PII'" + else: + str_criteria = f" AND t.issue_likelihood = '{str_likelihood}'" # Define the query -- first visible column must be first, because will hold the multi-select box str_sql = f""" SELECT r.table_name, r.column_name, r.schema_name, @@ -216,6 +219,8 @@ def get_profiling_anomalies(str_profile_run_id, str_likelihood): WHEN t.issue_likelihood = 'Possible' THEN 'Possible: speculative test that often identifies problems' WHEN t.issue_likelihood = 'Likely' THEN 'Likely: typically indicates a data problem' WHEN t.issue_likelihood = 'Definite' THEN 'Definite: indicates a highly-likely data problem' + WHEN t.issue_likelihood = 'Potential PII' + THEN 'Potential PII: may require privacy policies, standards and procedures for access, storage and transmission.' 
END as likelihood_explanation, t.anomaly_description, r.detail, t.suggested_action, r.anomaly_id, r.table_groups_id::VARCHAR, r.id::VARCHAR, p.profiling_starttime diff --git a/testgen/ui/views/profiling_details.py b/testgen/ui/views/profiling_details.py index b75983c..4c5728d 100644 --- a/testgen/ui/views/profiling_details.py +++ b/testgen/ui/views/profiling_details.py @@ -72,7 +72,7 @@ def write_column_header(selected_row, form_data_width): "schema_name", "general_type", "column_type", - "functional_data_type", + "semantic_data_type", "datatype_suggestion", ] fm.render_html_list(selected_row, lst_columns, str_header, form_data_width) diff --git a/testgen/ui/views/profiling_results.py b/testgen/ui/views/profiling_results.py index bfcb561..71e2188 100644 --- a/testgen/ui/views/profiling_results.py +++ b/testgen/ui/views/profiling_results.py @@ -102,7 +102,7 @@ def render(self) -> None: "table_name", "column_name", "column_type", - "functional_data_type", + "semantic_data_type", "anomalies", ] @@ -121,8 +121,8 @@ def render(self) -> None: "position", "column_type", "general_type", - "functional_table_type", - "functional_data_type", + "semantic_table_type", + "semantic_data_type", "datatype_suggestion", "anomalies", "record_ct", diff --git a/testgen/ui/views/table_groups.py b/testgen/ui/views/table_groups.py index 962b903..80c9c3a 100644 --- a/testgen/ui/views/table_groups.py +++ b/testgen/ui/views/table_groups.py @@ -189,7 +189,7 @@ def show_record_detail(selected, profile_cli_command_modal, profile_command_moda st.write("

", unsafe_allow_html=True) _, button_column = st.columns([0.3, 0.7]) with button_column: - if st.button("Run Profile Command", help="Runs the run-profile command", use_container_width=True): + if st.button("Run Profiling", help="Performs profiling on the Table Group", use_container_width=True): profile_command_modal.open() if st.button( "Show Run Profile CLI Command", help="Shows the run-profile CLI command", use_container_width=True diff --git a/testgen/ui/views/test_definitions.py b/testgen/ui/views/test_definitions.py index a96da3b..d2d8a78 100644 --- a/testgen/ui/views/test_definitions.py +++ b/testgen/ui/views/test_definitions.py @@ -837,10 +837,11 @@ def show_test_defs_grid( ) _, col_profile_button = right_column.columns([0.7, 0.3]) - view_profiling_modal( - col_profile_button, selected_row["table_name"], selected_row["column_name"], - str_table_groups_id=str_table_groups_id - ) + if selected_row["test_scope"] == "column": + view_profiling_modal( + col_profile_button, selected_row["table_name"], selected_row["column_name"], + str_table_groups_id=str_table_groups_id + ) with right_column: st.write(generate_test_defs_help(row_selected["test_type"])) diff --git a/testgen/ui/views/test_results.py b/testgen/ui/views/test_results.py index 897cb97..44c1ed2 100644 --- a/testgen/ui/views/test_results.py +++ b/testgen/ui/views/test_results.py @@ -180,7 +180,8 @@ def get_test_results_uncached(str_schema, str_run_id, str_sel_test_status): ) SELECT r.table_name, p.project_name, ts.test_suite, tg.table_groups_name, cn.connection_name, cn.project_host, cn.sql_flavor, - tt.dq_dimension, r.schema_name, r.column_names, r.test_time::DATE as test_date, r.test_type, tt.id as test_type_id, + tt.dq_dimension, tt.test_scope, + r.schema_name, r.column_names, r.test_time::DATE as test_date, r.test_type, tt.id as test_type_id, tt.test_name_short, tt.test_name_long, r.test_description, tt.measure_uom, tt.measure_uom_description, c.test_operator, r.threshold_value::NUMERIC(16, 5), 
r.result_measure::NUMERIC(16, 5), r.result_status, CASE @@ -671,10 +672,11 @@ def show_result_detail(str_run_id, str_sel_test_status, do_multi_select, export_ with pg_col2: v_col1, v_col2, v_col3 = st.columns([0.33, 0.33, 0.33]) view_edit_test(v_col1, selected_row["test_definition_id_current"]) - view_profiling_modal( - v_col2, selected_row["table_name"], selected_row["column_names"], - str_table_groups_id=selected_row["table_groups_id"] - ) + if selected_row["test_scope"] == "column": + view_profiling_modal( + v_col2, selected_row["table_name"], selected_row["column_names"], + str_table_groups_id=selected_row["table_groups_id"] + ) view_bad_data(v_col3, selected_row) with pg_col1: