From f372b2facda73efb073583eaf3b8d624e6283e82 Mon Sep 17 00:00:00 2001
From: Agnes Kiss
Date: Mon, 14 Oct 2024 12:35:00 +0100
Subject: [PATCH] Rework filter bots macro

Move the IAB spiders-and-robots check into the filter_bots macros, guarded
by the snowplow__enable_iab variable, and drop the per-model copies of that
check from the core web vitals events_this_run models.

- Every adapter variant keeps rows whose iab__spider_or_robot flag is null.
- The IAB column is prefixed with table_alias when one is passed, the same
  way useragent already is; without an alias the generated SQL is unchanged.
- The BigQuery integration-test staging model casts spider_or_robot to
  boolean.
---
 .../bigquery/snowplow_unified_events_stg.sql       |  2 +-
 macros/filter_bots.sql                             | 14 ++++++++++++++
 .../snowplow_unified_web_vital_events_this_run.sql |  8 --------
 .../snowplow_unified_web_vital_events_this_run.sql |  8 --------
 .../snowplow_unified_web_vital_events_this_run.sql |  4 ----
 5 files changed, 15 insertions(+), 21 deletions(-)

diff --git a/integration_tests/models/source/bigquery/snowplow_unified_events_stg.sql b/integration_tests/models/source/bigquery/snowplow_unified_events_stg.sql
index 8e9aafa8..5ecdcf34 100644
--- a/integration_tests/models/source/bigquery/snowplow_unified_events_stg.sql
+++ b/integration_tests/models/source/bigquery/snowplow_unified_events_stg.sql
@@ -494,7 +494,7 @@ You may obtain a copy of the Snowplow Personal and Academic License Version 1.0
           select as struct JSON_EXTRACT_scalar(json_array,'$.category') as category,
                           JSON_EXTRACT_scalar(json_array,'$.primary_impact') as primary_impact,
                           JSON_EXTRACT_scalar(json_array,'$.reason') as reason,
-                          JSON_EXTRACT_scalar(json_array,'$.spider_or_robot') as spider_or_robot
+                          cast(JSON_EXTRACT_scalar(json_array,'$.spider_or_robot') as boolean) as spider_or_robot
           from unnest(contexts_com_iab_snowplow_spiders_and_robots_1_0_0) as json_array
           ) as contexts_com_iab_snowplow_spiders_and_robots_1_0_0
 
diff --git a/macros/filter_bots.sql b/macros/filter_bots.sql
index 05343867..fc3e6557 100644
--- a/macros/filter_bots.sql
+++ b/macros/filter_bots.sql
@@ -10,17 +10,31 @@ You may obtain a copy of the Snowplow Personal and Academic License Version 1.0
 {%- endmacro -%}
 
 {% macro default__filter_bots(table_alias = none) %}
+  {% if var('snowplow__enable_iab', false) %}
+    {# a null flag counts as "not a bot": the result can be null due to server anonymization #}
+    and coalesce({% if table_alias %}{{table_alias~'.'}}{% endif %}iab__spider_or_robot, False) = False
+  {% endif %}
   and {% if table_alias %}{{table_alias~'.'}}{% endif %}useragent not similar to '%(bot|crawl|slurp|spider|archiv|spinn|sniff|seo|audit|survey|pingdom|worm|capture|(browser|screen)shots|analyz|index|thumb|check|facebook|PingdomBot|PhantomJS|YandexBot|Twitterbot|a_archiver|facebookexternalhit|Bingbot|BingPreview|Googlebot|Baiduspider|360(Spider|User-agent)|semalt)%'
 {% endmacro %}
 
 {% macro bigquery__filter_bots(table_alias = none) %}
+  {% if var('snowplow__enable_iab', false) %}{# a null IAB flag counts as "not a bot" #}
+    and coalesce({% if table_alias %}{{table_alias~'.'}}{% endif %}iab__spider_or_robot, False) = False
+  {% endif %}
   and not regexp_contains({% if table_alias %}{{table_alias~'.'}}{% endif %}useragent, '(bot|crawl|slurp|spider|archiv|spinn|sniff|seo|audit|survey|pingdom|worm|capture|(browser|screen)shots|analyz|index|thumb|check|facebook|PingdomBot|PhantomJS|YandexBot|Twitterbot|a_archiver|facebookexternalhit|Bingbot|BingPreview|Googlebot|Baiduspider|360(Spider|User-agent)|semalt)')
 {% endmacro %}
 
 {% macro spark__filter_bots(table_alias = none) %}
+  {% if var('snowplow__enable_iab', false) %}
+    {# same null-tolerant check written without coalesce, as the coalesce based one resulted in a Spark error #}
+    and (not {% if table_alias %}{{table_alias~'.'}}{% endif %}iab__spider_or_robot = True or {% if table_alias %}{{table_alias~'.'}}{% endif %}iab__spider_or_robot is null)
+  {% endif %}
   and not rlike({% if table_alias %}{{table_alias~'.'}}{% endif %}useragent, '.*(bot|crawl|slurp|spider|archiv|spinn|sniff|seo|audit|survey|pingdom|worm|capture|(browser|screen)shots|analyz|index|thumb|check|facebook|PingdomBot|PhantomJS|YandexBot|Twitterbot|a_archiver|facebookexternalhit|Bingbot|BingPreview|Googlebot|Baiduspider|360(Spider|User-agent)|semalt).*')
 {% endmacro %}
 
 {% macro snowflake__filter_bots(table_alias = none) %}
+  {% if var('snowplow__enable_iab', false) %}{# a null IAB flag counts as "not a bot" #}
+    and coalesce({% if table_alias %}{{table_alias~'.'}}{% endif %}iab__spider_or_robot, False) = False
+  {% endif %}
   and not rlike({% if table_alias %}{{table_alias~'.'}}{% endif %}useragent, '.*(bot|crawl|slurp|spider|archiv|spinn|sniff|seo|audit|survey|pingdom|worm|capture|(browser|screen)shots|analyz|index|thumb|check|facebook|PingdomBot|PhantomJS|YandexBot|Twitterbot|a_archiver|facebookexternalhit|Bingbot|BingPreview|Googlebot|Baiduspider|360(Spider|User-agent)|semalt).*')
 {% endmacro %}
diff --git a/models/optional_modules/core_web_vitals/scratch/bigquery/snowplow_unified_web_vital_events_this_run.sql b/models/optional_modules/core_web_vitals/scratch/bigquery/snowplow_unified_web_vital_events_this_run.sql
index 35d998f9..cfcc9b70 100644
--- a/models/optional_modules/core_web_vitals/scratch/bigquery/snowplow_unified_web_vital_events_this_run.sql
+++ b/models/optional_modules/core_web_vitals/scratch/bigquery/snowplow_unified_web_vital_events_this_run.sql
@@ -70,14 +70,6 @@ with prep as (
 
 
     -- exclude bot traffic
-    {% if var('snowplow__enable_iab', false) %}
-      and not {{ snowplow_utils.get_field(column_name = 'contexts_com_iab_snowplow_spiders_and_robots_1_0_0',
-                                          field_name = 'spider_or_robot',
-                                          table_alias = 'e',
-                                          type = 'boolean',
-                                          array_index = 0)}} = True
-    {% endif %}
-
     {{ snowplow_unified.filter_bots() }}
 
 )
diff --git a/models/optional_modules/core_web_vitals/scratch/databricks/snowplow_unified_web_vital_events_this_run.sql b/models/optional_modules/core_web_vitals/scratch/databricks/snowplow_unified_web_vital_events_this_run.sql
index d3dc4b98..20cf52d1 100644
--- a/models/optional_modules/core_web_vitals/scratch/databricks/snowplow_unified_web_vital_events_this_run.sql
+++ b/models/optional_modules/core_web_vitals/scratch/databricks/snowplow_unified_web_vital_events_this_run.sql
@@ -70,14 +70,6 @@ with prep as (
 
 
     -- exclude bot traffic
-    {% if var('snowplow__enable_iab', false) %}
-      and not {{ snowplow_utils.get_field(column_name = 'contexts_com_iab_snowplow_spiders_and_robots_1',
-                                          field_name = 'spider_or_robot',
-                                          table_alias = 'e',
-                                          type = 'boolean',
-                                          array_index = 0)}} = True
-    {% endif %}
-
     {{ snowplow_unified.filter_bots() }}
 
 )
diff --git a/models/optional_modules/core_web_vitals/scratch/default/snowplow_unified_web_vital_events_this_run.sql b/models/optional_modules/core_web_vitals/scratch/default/snowplow_unified_web_vital_events_this_run.sql
index 695123bd..c62a3ee7 100644
--- a/models/optional_modules/core_web_vitals/scratch/default/snowplow_unified_web_vital_events_this_run.sql
+++ b/models/optional_modules/core_web_vitals/scratch/default/snowplow_unified_web_vital_events_this_run.sql
@@ -69,10 +69,6 @@ with prep as (
 
 
     -- exclude bot traffic
-    {% if var('snowplow__enable_iab', false) %}
-      and not e.iab__spider_or_robot = True
-    {% endif %}
-
     {{ snowplow_unified.filter_bots() }}
 
 )