diff --git a/sql/moz-fx-data-marketing-prod/ga/blogs_sessions/view.sql b/sql/moz-fx-data-marketing-prod/ga/blogs_sessions/view.sql
new file mode 100644
index 0000000000..5d75f71472
--- /dev/null
+++ b/sql/moz-fx-data-marketing-prod/ga/blogs_sessions/view.sql
@@ -0,0 +1,7 @@
+CREATE OR REPLACE VIEW
+  `moz-fx-data-marketing-prod.ga.blogs_sessions`
+AS
+SELECT
+  *
+FROM
+  `moz-fx-data-marketing-prod.ga_derived.blogs_sessions_v2`
diff --git a/sql/moz-fx-data-marketing-prod/ga_derived/blogs_sessions_v2/metadata.yaml b/sql/moz-fx-data-marketing-prod/ga_derived/blogs_sessions_v2/metadata.yaml
new file mode 100644
index 0000000000..1cd154de75
--- /dev/null
+++ b/sql/moz-fx-data-marketing-prod/ga_derived/blogs_sessions_v2/metadata.yaml
@@ -0,0 +1,21 @@
+friendly_name: Blogs Sessions V2
+description: |-
+  Intermediate table containing normalized sessions for blog.mozilla.org, sourced from Google Analytics 4 (GA4)
+owners:
+- kwindau@mozilla.com
+labels:
+  incremental: true
+  owner1: kwindau@mozilla.com
+scheduling:
+  dag_name: bqetl_google_analytics_derived_ga4
+bigquery:
+  time_partitioning:
+    type: day
+    field: date
+    require_partition_filter: true
+    expiration_days: null
+  clustering:
+    fields:
+    - country
+references: {}
+deprecated: false
diff --git a/sql/moz-fx-data-marketing-prod/ga_derived/blogs_sessions_v2/query.sql b/sql/moz-fx-data-marketing-prod/ga_derived/blogs_sessions_v2/query.sql
new file mode 100644
index 0000000000..7530fa9599
--- /dev/null
+++ b/sql/moz-fx-data-marketing-prod/ga_derived/blogs_sessions_v2/query.sql
@@ -0,0 +1,202 @@
+--Get all page views with the page location, and a flag for whether it was an entrance or not to the session
+WITH all_page_views AS (
+  SELECT
+    PARSE_DATE('%Y%m%d', event_date) AS `date`,
+    event_timestamp,
+    --Session key: user_pseudo_id plus the ga_session_id event param; NULL if either part is NULL
+    user_pseudo_id || '-' || CAST(
+      (
+        SELECT
+          `value`
+        FROM
+          UNNEST(event_params)
+        WHERE
+          key = 'ga_session_id'
+        LIMIT
+          1
+      ).int_value AS STRING
+    ) AS visit_identifier,
+    device.category AS device_category,
+    device.operating_system AS operating_system,
+    device.web_info.browser AS browser,
+    device.language AS `language`,
+    geo.country AS country,
+    collected_traffic_source.manual_source AS source,
+    collected_traffic_source.manual_medium AS medium,
+    collected_traffic_source.manual_campaign_name AS campaign,
+    collected_traffic_source.manual_content AS content,
+    (
+      SELECT
+        `value`
+      FROM
+        UNNEST(event_params)
+      WHERE
+        key = 'page_location'
+      LIMIT
+        1
+    ).string_value AS page_location,
+    (
+      SELECT
+        `value`
+      FROM
+        UNNEST(event_params)
+      WHERE
+        key = 'entrances'
+      LIMIT
+        1
+    ).int_value AS is_entrance
+  FROM
+    `moz-fx-data-marketing-prod.analytics_314399816.events_*`
+  WHERE
+    --Read only the events table whose suffix matches the @submission_date parameter
+    _TABLE_SUFFIX = FORMAT_DATE('%Y%m%d', @submission_date)
+    AND event_name = 'page_view'
+),
+--Filter to entrance pages only, and then keep only the earliest entrance page per session
+--Theoretically Google should only send 1 entrance per session, but occasionally more than 1 arrives
+entrance_page_views_only AS (
+  SELECT
+    `date`,
+    visit_identifier,
+    device_category,
+    operating_system,
+    browser,
+    `language`,
+    country,
+    source,
+    medium,
+    campaign,
+    content,
+    --Strip the query string, then the site prefix, so that only the path remains
+    --The dots are escaped so that they match literal dots only
+    REGEXP_REPLACE(
+      SPLIT(page_location, '?')[SAFE_OFFSET(0)],
+      r'^https://blog\.mozilla\.org',
+      ''
+    ) AS page_path
+  FROM
+    all_page_views
+  WHERE
+    is_entrance = 1
+  QUALIFY
+    ROW_NUMBER() OVER (PARTITION BY visit_identifier ORDER BY event_timestamp ASC) = 1
+),
+--Take the blog and sub-blog from the first two path segments, and count sessions
+--When the prefix above was stripped, page_path starts with '/', so offset 0 is the empty string before it
+staging AS (
+  SELECT
+    epvo.date,
+    epvo.visit_identifier,
+    epvo.device_category,
+    epvo.operating_system,
+    epvo.browser,
+    epvo.language,
+    epvo.country,
+    epvo.source,
+    epvo.medium,
+    epvo.campaign,
+    epvo.content,
+    SPLIT(epvo.page_path, '/')[SAFE_OFFSET(1)] AS blog,
+    SPLIT(epvo.page_path, '/')[SAFE_OFFSET(2)] AS subblog,
+    COUNT(DISTINCT epvo.visit_identifier) AS sessions
+  FROM
+    entrance_page_views_only AS epvo
+  GROUP BY
+    epvo.date,
+    epvo.visit_identifier,
+    epvo.device_category,
+    epvo.operating_system,
+    epvo.browser,
+    epvo.language,
+    epvo.country,
+    epvo.source,
+    epvo.medium,
+    epvo.campaign,
+    epvo.content,
+    blog,
+    subblog
+)
+--In the CASE expressions below, blog and subblog are the raw path segments from staging,
+--not the normalized output columns of the same name
+SELECT
+  `date`,
+  visit_identifier,
+  device_category,
+  operating_system,
+  browser,
+  `language`,
+  country,
+  source,
+  medium,
+  campaign,
+  content,
+  CASE
+    WHEN blog LIKE 'press%'
+      THEN 'press'
+    WHEN blog = 'firefox'
+      THEN 'The Firefox Frontier'
+    WHEN blog = 'netPolicy'
+      THEN 'Open Policy & Advocacy'
+    WHEN LOWER(blog) = 'internetcitizen'
+      THEN 'Internet Citizen'
+    WHEN blog = 'futurereleases'
+      THEN 'Future Releases'
+    WHEN blog = 'careers'
+      THEN 'Careers'
+    WHEN blog = 'opendesign'
+      THEN 'Open Design'
+    WHEN blog = ''
+      THEN 'Blog Home Page'
+    WHEN LOWER(blog) IN (
+        'blog',
+        'addons',
+        'security',
+        'opendesign',
+        'nnethercote',
+        'thunderbird',
+        'community',
+        'l10n',
+        'theden',
+        'webrtc',
+        'berlin',
+        'webdev',
+        'services',
+        'tanvi',
+        'laguaridadefirefox',
+        'ux',
+        'fxtesteng',
+        'foundation-archive',
+        'nfroyd',
+        'sumo',
+        'javascript',
+        'page',
+        'data'
+      )
+      THEN LOWER(blog)
+    ELSE 'other'
+  END AS blog,
+  CASE
+    WHEN blog = 'firefox'
+      AND subblog IN ('ru', 'pt-br', 'pl', 'it', 'id', 'fr', 'es', 'de')
+      THEN subblog
+    WHEN blog IN (
+        'press-de',
+        'press-fr',
+        'press-es',
+        'press-uk',
+        'press-pl',
+        'press-it',
+        'press-br',
+        'press-nl'
+      )
+      THEN blog
+    --Compared case-insensitively so that it agrees with the Internet Citizen branch of the blog column
+    WHEN LOWER(blog) = 'internetcitizen'
+      AND subblog IN ('de', 'fr')
+      THEN subblog
+    --Everything else, including firefox and press entrances without a listed locale
+    ELSE 'Main'
+  END AS subblog,
+  `sessions`
+FROM
+  staging
diff --git a/sql/moz-fx-data-marketing-prod/ga_derived/blogs_sessions_v2/schema.yaml b/sql/moz-fx-data-marketing-prod/ga_derived/blogs_sessions_v2/schema.yaml
new file mode 100644
index 0000000000..5eb432e34b
--- /dev/null
+++ b/sql/moz-fx-data-marketing-prod/ga_derived/blogs_sessions_v2/schema.yaml
@@ -0,0 +1,57 @@
+fields:
+- mode: NULLABLE
+  name: date
+  type: DATE
+  description: Date of the visit
+- mode: NULLABLE
+  name: visit_identifier
+  type: STRING
+  description: Visit Identifier - Uniquely identifies a visit; concatenation of user_pseudo_id and ga_session_id
+- mode: NULLABLE
+  name: device_category
+  type: STRING
+  description: Device Category - The device category the visitor used to visit the site
+- mode: NULLABLE
+  name: operating_system
+  type: STRING
+  description: Operating System - The operating system the visitor used to visit the site
+- mode: NULLABLE
+  name: browser
+  type: STRING
+  description: Browser - The browser the visiting device was using when it visited the site
+- mode: NULLABLE
+  name: language
+  type: STRING
+  description: Language - The language the visiting device was using when it visited the site
+- mode: NULLABLE
+  name: country
+  type: STRING
+  description: Country - The country from which events were reported, based on IP address
+- mode: NULLABLE
+  name: source
+  type: STRING
+  description: Source - Referring partner domain
+- mode: NULLABLE
+  name: medium
+  type: STRING
+  description: Medium - Category of the source, such as 'organic' for a search engine
+- mode: NULLABLE
+  name: campaign
+  type: STRING
+  description: Campaign - Identifier for the marketing campaign
+- mode: NULLABLE
+  name: content
+  type: STRING
+  description: Content - Indicates the particular link within a campaign
+- mode: NULLABLE
+  name: blog
+  type: STRING
+  description: Blog - Normalized name of the blog the session entered on, taken from the first path segment of the entrance page (other if unrecognized)
+- mode: NULLABLE
+  name: subblog
+  type: STRING
+  description: Sub-Blog - Locale edition of the blog where a recognized one applies, taken from the entrance page path (Main otherwise)
+- mode: NULLABLE
+  name: sessions
+  type: INT64
+  description: Number of Sessions