Skip to content

Commit

Permalink
Rework filter bots macro
Browse files Browse the repository at this point in the history
  • Loading branch information
agnessnowplow committed Oct 17, 2024
1 parent 1223a5f commit 16be271
Show file tree
Hide file tree
Showing 5 changed files with 16 additions and 21 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -494,7 +494,7 @@ You may obtain a copy of the Snowplow Personal and Academic License Version 1.0
select as struct JSON_EXTRACT_scalar(json_array,'$.category') as category,
JSON_EXTRACT_scalar(json_array,'$.primary_impact') as primary_impact,
JSON_EXTRACT_scalar(json_array,'$.reason') as reason,
JSON_EXTRACT_scalar(json_array,'$.spider_or_robot') as spider_or_robot
cast(JSON_EXTRACT_scalar(json_array,'$.spider_or_robot') as boolean) as spider_or_robot

from unnest(contexts_com_iab_snowplow_spiders_and_robots_1_0_0) as json_array
) as contexts_com_iab_snowplow_spiders_and_robots_1_0_0
Expand Down
15 changes: 15 additions & 0 deletions macros/filter_bots.sql
Original file line number Diff line number Diff line change
Expand Up @@ -10,17 +10,32 @@ You may obtain a copy of the Snowplow Personal and Academic License Version 1.0
{%- endmacro -%}

{% macro default__filter_bots(table_alias = none) %}
  {#
    Default implementation of filter_bots.

    Renders one or two `and ...` predicates, so the call must be placed after
    an existing condition in a WHERE clause.

    Args:
      table_alias: optional alias; when truthy it is prepended (followed by a
                   dot) to the `useragent` column reference.

    NOTE(review): table_alias is only applied to `useragent`;
    `iab__spider_or_robot` is always referenced unqualified - confirm this is
    intended.
  #}
  {% set useragent_column = (table_alias ~ '.' if table_alias else '') ~ 'useragent' %}
  {% set iab_enabled = var('snowplow__enable_iab', false) %}
  {% if iab_enabled %}
    {# a null flag (e.g. caused by server anonymization) is treated as "not a bot", so such rows are kept #}
    and coalesce(iab__spider_or_robot, False ) = False
  {% endif %}
  {# drop rows whose user agent contains any of the known bot / crawler markers #}
  and {{ useragent_column }} not similar to '%(bot|crawl|slurp|spider|archiv|spinn|sniff|seo|audit|survey|pingdom|worm|capture|(browser|screen)shots|analyz|index|thumb|check|facebook|PingdomBot|PhantomJS|YandexBot|Twitterbot|a_archiver|facebookexternalhit|Bingbot|BingPreview|Googlebot|Baiduspider|360(Spider|User-agent)|semalt)%'
{% endmacro %}

{% macro bigquery__filter_bots(table_alias = none) %}
  {#
    BigQuery implementation of filter_bots.

    Renders one or two `and ...` predicates, so the call must be placed after
    an existing condition in a WHERE clause.

    Args:
      table_alias: optional alias; when truthy it is prepended (followed by a
                   dot) to the `useragent` column reference.

    NOTE(review): table_alias is only applied to `useragent`;
    `iab__spider_or_robot` is always referenced unqualified - confirm this is
    intended.
  #}
  {% set useragent_column = (table_alias ~ '.' if table_alias else '') ~ 'useragent' %}
  {% set iab_enabled = var('snowplow__enable_iab', false) %}
  {% if iab_enabled %}
    {# coalesce makes a null flag count as "not a bot", so such rows are kept #}
    and coalesce(iab__spider_or_robot, False ) = False
  {% endif %}
  {# drop rows whose user agent contains any of the known bot / crawler markers #}
  and not regexp_contains({{ useragent_column }}, '(bot|crawl|slurp|spider|archiv|spinn|sniff|seo|audit|survey|pingdom|worm|capture|(browser|screen)shots|analyz|index|thumb|check|facebook|PingdomBot|PhantomJS|YandexBot|Twitterbot|a_archiver|facebookexternalhit|Bingbot|BingPreview|Googlebot|Baiduspider|360(Spider|User-agent)|semalt)')
{% endmacro %}

{% macro spark__filter_bots(table_alias = none) %}
  {#
    Spark implementation of filter_bots.

    Renders one or two `and ...` predicates, so the call must be placed after
    an existing condition in a WHERE clause.

    Args:
      table_alias: optional alias; when truthy it is prepended (followed by a
                   dot) to the `useragent` column reference.

    NOTE(review): table_alias is only applied to `useragent`;
    `iab__spider_or_robot` is always referenced unqualified - confirm this is
    intended.
  #}
  {% if var('snowplow__enable_iab', false) %}
    {# had to add different syntax as the coalesce based one resulted in a Spark error #}
    {# the leading `and` is required: like every other predicate emitted here, this one is appended to the caller's existing conditions #}
    and (not iab__spider_or_robot = True
      or iab__spider_or_robot is null)
  {% endif %}
  {# drop rows whose user agent contains any of the known bot / crawler markers #}
  and not rlike({% if table_alias %}{{table_alias~'.'}}{% endif %}useragent, '.*(bot|crawl|slurp|spider|archiv|spinn|sniff|seo|audit|survey|pingdom|worm|capture|(browser|screen)shots|analyz|index|thumb|check|facebook|PingdomBot|PhantomJS|YandexBot|Twitterbot|a_archiver|facebookexternalhit|Bingbot|BingPreview|Googlebot|Baiduspider|360(Spider|User-agent)|semalt).*')
{% endmacro %}

{% macro snowflake__filter_bots(table_alias = none) %}
  {#
    Snowflake implementation of filter_bots.

    Renders one or two `and ...` predicates, so the call must be placed after
    an existing condition in a WHERE clause.

    Args:
      table_alias: optional alias; when truthy it is prepended (followed by a
                   dot) to the `useragent` column reference.

    NOTE(review): table_alias is only applied to `useragent`;
    `iab__spider_or_robot` is always referenced unqualified - confirm this is
    intended.
  #}
  {% set useragent_column = (table_alias ~ '.' if table_alias else '') ~ 'useragent' %}
  {% set iab_enabled = var('snowplow__enable_iab', false) %}
  {% if iab_enabled %}
    {# coalesce makes a null flag count as "not a bot", so such rows are kept #}
    and coalesce(iab__spider_or_robot, False ) = False
  {% endif %}
  {# drop rows whose user agent matches any of the known bot / crawler markers #}
  and not rlike({{ useragent_column }}, '.*(bot|crawl|slurp|spider|archiv|spinn|sniff|seo|audit|survey|pingdom|worm|capture|(browser|screen)shots|analyz|index|thumb|check|facebook|PingdomBot|PhantomJS|YandexBot|Twitterbot|a_archiver|facebookexternalhit|Bingbot|BingPreview|Googlebot|Baiduspider|360(Spider|User-agent)|semalt).*')
{% endmacro %}
Original file line number Diff line number Diff line change
Expand Up @@ -70,14 +70,6 @@ with prep as (

-- exclude bot traffic

{% if var('snowplow__enable_iab', false) %}
and not {{ snowplow_utils.get_field(column_name = 'contexts_com_iab_snowplow_spiders_and_robots_1_0_0',
field_name = 'spider_or_robot',
table_alias = 'e',
type = 'boolean',
array_index = 0)}} = True
{% endif %}

{{ snowplow_unified.filter_bots() }}

)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,14 +70,6 @@ with prep as (

-- exclude bot traffic

{% if var('snowplow__enable_iab', false) %}
and not {{ snowplow_utils.get_field(column_name = 'contexts_com_iab_snowplow_spiders_and_robots_1',
field_name = 'spider_or_robot',
table_alias = 'e',
type = 'boolean',
array_index = 0)}} = True
{% endif %}

{{ snowplow_unified.filter_bots() }}

)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -69,10 +69,6 @@ with prep as (

-- exclude bot traffic

{% if var('snowplow__enable_iab', false) %}
and not e.iab__spider_or_robot = True
{% endif %}

{{ snowplow_unified.filter_bots() }}

)
Expand Down

0 comments on commit 16be271

Please sign in to comment.