DENG-2492 Create new GA4 derived table: blogs_sessions_v2 (#5018)

* DENG-2492 initial commit for new table blogs_sessions_v2

* DENG-2492 wrap keywords with backticks
This commit is contained in:
Katie Windau 2024-02-12 10:54:41 -06:00 committed by GitHub
Parent 63a4d72197
Commit ee8de94705
No known key found for this signature
GPG key ID: B5690EEEBB952194
4 changed files with 290 additions and 0 deletions

Просмотреть файл

@ -0,0 +1,7 @@
-- View `ga.blogs_sessions`: exposes every column of the derived table
-- `ga_derived.blogs_sessions_v2` unchanged.
CREATE OR REPLACE VIEW
  `moz-fx-data-marketing-prod.ga.blogs_sessions`
AS
SELECT
  *
FROM
  `moz-fx-data-marketing-prod.ga_derived.blogs_sessions_v2`

Просмотреть файл

@ -0,0 +1,21 @@
# Table metadata for the GA4-sourced blog sessions table (blogs_sessions_v2).
friendly_name: Blogs Sessions V2
description: |-
Intermediate table containing normalized sessions for blog.mozilla.org, sourced from Google Analytics 4 (GA4)
owners:
- kwindau@mozilla.com
labels:
# The query loads a single day per run (it filters on @submission_date).
incremental: true
owner1: kwindau@mozilla.com
# Scheduling: name of the DAG this query is attached to.
scheduling:
dag_name: bqetl_google_analytics_derived_ga4
# BigQuery table layout: daily partitions on the `date` column, and a partition
# filter is required when querying.
bigquery:
time_partitioning:
type: day
field: date
require_partition_filter: true
# NOTE(review): null presumably means partitions never expire - confirm.
expiration_days: null
# Rows within each partition are clustered by country.
clustering:
fields:
- country
references: {}
deprecated: false

Просмотреть файл

@ -0,0 +1,205 @@
-- Builds the blog.mozilla.org sessions for @submission_date from the GA4 export.
-- Pipeline:
--   1. all_page_views            every page_view event of the day, with its page URL and entrance flag
--   2. entrance_page_views_only  the first entrance page view of each session, plus its URL path
--   3. entrance_pages_with_blog  blog / sub-blog slugs taken from that path
--   4. staging                   session count per visit and dimension combination
--   5. final SELECT              slugs mapped to reporting names
--
-- Get all page views with the page location, and a flag for whether it was an entrance or not to the session
WITH all_page_views AS (
  SELECT
    PARSE_DATE('%Y%m%d', event_date) AS `date`,
    event_timestamp,
    -- Session key: user_pseudo_id and the ga_session_id event parameter joined by '-'
    user_pseudo_id || '-' || CAST(
      (
        SELECT
          `value`
        FROM
          UNNEST(event_params)
        WHERE
          key = 'ga_session_id'
        LIMIT
          1
      ).int_value AS STRING
    ) AS visit_identifier,
    device.category AS device_category,
    device.operating_system AS operating_system,
    device.web_info.browser AS browser,
    device.language AS `language`,
    geo.country AS country,
    collected_traffic_source.manual_source AS source,
    collected_traffic_source.manual_medium AS medium,
    collected_traffic_source.manual_campaign_name AS campaign,
    collected_traffic_source.manual_content AS content,
    (
      SELECT
        `value`
      FROM
        UNNEST(event_params)
      WHERE
        key = 'page_location'
      LIMIT
        1
    ).string_value AS page_location,
    (
      SELECT
        `value`
      FROM
        UNNEST(event_params)
      WHERE
        key = 'entrances'
      LIMIT
        1
    ).int_value AS is_entrance
  FROM
    `moz-fx-data-marketing-prod.analytics_314399816.events_*`
  WHERE
    -- Only the daily export table for the submission date is read
    _TABLE_SUFFIX = FORMAT_DATE('%Y%m%d', @submission_date)
    AND event_name = 'page_view'
),
-- Filter to entrance pages only, and keep a single entrance page per session.
-- Google should send only one entrance per session, but occasionally there is more than one,
-- so the earliest one (by event_timestamp) wins.
entrance_page_views_only AS (
  SELECT
    `date`,
    visit_identifier,
    device_category,
    operating_system,
    browser,
    `language`,
    country,
    source,
    medium,
    campaign,
    content,
    -- Drop the query string, then the scheme and host, leaving only the URL path.
    -- The dots are escaped (raw string) so they match a literal '.' instead of any character.
    REGEXP_REPLACE(
      SPLIT(page_location, '?')[SAFE_OFFSET(0)],
      r'^https://blog\.mozilla\.org',
      ''
    ) AS page_path
  FROM
    all_page_views
  WHERE
    is_entrance = 1
  QUALIFY
    ROW_NUMBER() OVER (PARTITION BY visit_identifier ORDER BY event_timestamp ASC) = 1
),
-- Derive the blog and sub-blog slugs once from the already-normalized path.
-- The path starts with '/', so element 0 of the split is empty, element 1 is the blog
-- and element 2 is the sub-blog (NULL when the path has no such segment).
entrance_pages_with_blog AS (
  SELECT
    `date`,
    visit_identifier,
    device_category,
    operating_system,
    browser,
    `language`,
    country,
    source,
    medium,
    campaign,
    content,
    -- Lower-cased so that every comparison further down is case-insensitive
    -- (for example both 'netPolicy' and 'netpolicy' are recognized).
    LOWER(SPLIT(page_path, '/')[SAFE_OFFSET(1)]) AS blog,
    SPLIT(page_path, '/')[SAFE_OFFSET(2)] AS subblog
  FROM
    entrance_page_views_only
),
-- One row per visit and dimension combination with its session count
staging AS (
  SELECT
    epvo.date,
    epvo.visit_identifier,
    epvo.device_category,
    epvo.operating_system,
    epvo.browser,
    epvo.language,
    epvo.country,
    epvo.source,
    epvo.medium,
    epvo.campaign,
    epvo.content,
    epvo.blog,
    epvo.subblog,
    COUNT(DISTINCT epvo.visit_identifier) AS sessions
  FROM
    entrance_pages_with_blog AS epvo
  GROUP BY
    epvo.date,
    epvo.visit_identifier,
    epvo.device_category,
    epvo.operating_system,
    epvo.browser,
    epvo.language,
    epvo.country,
    epvo.source,
    epvo.medium,
    epvo.campaign,
    epvo.content,
    epvo.blog,
    epvo.subblog
)
SELECT
  `date`,
  visit_identifier,
  device_category,
  operating_system,
  browser,
  `language`,
  country,
  source,
  medium,
  campaign,
  content,
  -- Map the (lower-cased) blog slug to its reporting name; unknown or missing slugs become 'other'
  CASE
    WHEN blog LIKE 'press%'
      THEN 'press'
    WHEN blog = 'firefox'
      THEN 'The Firefox Frontier'
    WHEN blog = 'netpolicy'
      THEN 'Open Policy & Advocacy'
    WHEN blog = 'internetcitizen'
      THEN 'Internet Citizen'
    WHEN blog = 'futurereleases'
      THEN 'Future Releases'
    WHEN blog = 'careers'
      THEN 'Careers'
    WHEN blog = 'opendesign'
      THEN 'Open Design'
    WHEN blog = ''
      THEN 'Blog Home Page'
    -- Blogs reported under their own slug
    WHEN blog IN (
        'blog',
        'addons',
        'security',
        'nnethercote',
        'thunderbird',
        'community',
        'l10n',
        'theden',
        'webrtc',
        'berlin',
        'webdev',
        'services',
        'tanvi',
        'laguaridadefirefox',
        'ux',
        'fxtesteng',
        'foundation-archive',
        'nfroyd',
        'sumo',
        'javascript',
        'page',
        'data'
      )
      THEN blog
    ELSE 'other'
  END AS blog,
  -- Sub-blog is the locale edition where one is recognized, otherwise 'Main'
  CASE
    WHEN blog = 'firefox'
      AND subblog IN ('ru', 'pt-br', 'pl', 'it', 'id', 'fr', 'es', 'de')
      THEN subblog
    -- Regional press blogs carry their locale in the blog slug itself
    WHEN blog IN (
        'press-de',
        'press-fr',
        'press-es',
        'press-uk',
        'press-pl',
        'press-it',
        'press-br',
        'press-nl'
      )
      THEN blog
    WHEN blog = 'internetcitizen'
      AND subblog IN ('de', 'fr')
      THEN subblog
    ELSE 'Main'
  END AS subblog,
  `sessions`
FROM
  staging

Просмотреть файл

@ -0,0 +1,57 @@
fields:
- mode: NULLABLE
name: date
type: DATE
description: Date of the visit
- mode: NULLABLE
name: visit_identifier
type: STRING
description: Visit Identifier - Uniquely identifies a visit; concatenation of user_pseudo_id and ga_session_id
- mode: NULLABLE
name: device_category
type: STRING
description: Device Category - The device category the visitor used to visit the site
- mode: NULLABLE
name: operating_system
type: STRING
description: Operating System - The operating system the visitor used to visit the site
- mode: NULLABLE
name: browser
type: STRING
description: Browser - The browser the visiting device was using when it visited the site
- mode: NULLABLE
name: language
type: STRING
description: Language - The language the visiting device was using when it visited the site
- mode: NULLABLE
name: country
type: STRING
description: Country - The country from which events were reported, based on IP address
- mode: NULLABLE
name: source
type: STRING
description: Source - Referring partner domain
- mode: NULLABLE
name: medium
type: STRING
description: Medium - Category of the source, such as 'organic' for a search engine
- mode: NULLABLE
name: campaign
type: STRING
description: Campaign - Identifier for the marketing campaign
- mode: NULLABLE
name: content
type: STRING
description: Content - Indicates the particular link within a campaign
- mode: NULLABLE
name: blog
type: STRING
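description: Blog - Normalized name of the blog on which the session started, derived from the first path segment of the entrance page URL; 'other' when the segment is not a recognized blog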
- mode: NULLABLE
name: subblog
type: STRING
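description: Sub-Blog - Locale or regional edition of the blog, taken from the entrance page URL (for example 'de' or 'press-fr'); 'Main' when no recognized edition is present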
- mode: NULLABLE
name: sessions
type: INT64
description: Number of Sessions