diff --git a/sql/moz-fx-data-marketing-prod/google_search_console/search_impressions_by_page/metadata.yaml b/sql/moz-fx-data-marketing-prod/google_search_console/search_impressions_by_page/metadata.yaml index dad7eb1f15..aacadf9a26 100644 --- a/sql/moz-fx-data-marketing-prod/google_search_console/search_impressions_by_page/metadata.yaml +++ b/sql/moz-fx-data-marketing-prod/google_search_console/search_impressions_by_page/metadata.yaml @@ -3,15 +3,20 @@ description: |- Google Search impressions aggregated by page for the following domains: * addons.mozilla.org * blog.mozilla.org + * developer.mozilla.org * getpocket.com * support.mozilla.org * www.mozilla.org - Anonymized search queries, and Discover and Google News search impressions are included from August 2023 onward. + For the developer.mozilla.org domain: + * Records from 2024-04-10 onward are from source data exported directly to BigQuery by Google. + * Records before 2024-04-10 are from source data synced to BigQuery by Fivetran. - Source data from before August 2023 was synced to BigQuery by Fivetran. - Source data from August 2023 onward was exported directly to BigQuery by Google. - See https://bugzilla.mozilla.org/show_bug.cgi?id=1764960#c34. + For the other domains: + * Records from 2023-08-01 onward are from source data exported directly to BigQuery by Google. + * Records before 2023-08-01 are from source data synced to BigQuery by Fivetran. + + Anonymized search queries, and Discover and Google News search impressions are only included if the source data was exported directly to BigQuery by Google. 
owners: - srose@mozilla.com labels: diff --git a/sql/moz-fx-data-marketing-prod/google_search_console/search_impressions_by_page/schema.yaml b/sql/moz-fx-data-marketing-prod/google_search_console/search_impressions_by_page/schema.yaml index 3460ab744c..a857303659 100644 --- a/sql/moz-fx-data-marketing-prod/google_search_console/search_impressions_by_page/schema.yaml +++ b/sql/moz-fx-data-marketing-prod/google_search_console/search_impressions_by_page/schema.yaml @@ -31,11 +31,41 @@ fields: description: |- The path part of the page URL. This will be null for anonymized Discover impressions. -- name: page_path_segment_1 +- name: localized_site_code type: STRING mode: NULLABLE description: |- - The first segment of the page URL path, which is often a locale like `en-US` or `de`. + Localized site code such as `en-US` or `de` found in the first segment of the page URL path (if any). + This will be null for anonymized Discover impressions. +- name: localized_site + type: STRING + mode: NULLABLE + description: |- + Localized site description based on `localized_site_language` and `localized_site_country` (if any). + This will be null for anonymized Discover impressions. +- name: localized_site_language_code + type: STRING + mode: NULLABLE + description: |- + Localized site language code in ISO-639-alpha-2 format found in the first segment of the page URL path (if any). + This will be null for anonymized Discover impressions. +- name: localized_site_language + type: STRING + mode: NULLABLE + description: |- + Localized site language based on `localized_site_language_code` (if any). + This will be null for anonymized Discover impressions. +- name: localized_site_country_code + type: STRING + mode: NULLABLE + description: |- + Localized site country code in ISO-3166-1-alpha-2 format found in the first segment of the page URL path (if any). + This will be null for anonymized Discover impressions. 
+- name: localized_site_country + type: STRING + mode: NULLABLE + description: |- + Localized site country based on `localized_site_country_code` (if any). This will be null for anonymized Discover impressions. - name: query type: STRING @@ -43,6 +73,16 @@ fields: description: |- The search query. This will be null for anonymized search impressions, and all Discover and Google News search impressions. +- name: query_type + type: STRING + mode: NULLABLE + description: |- + Type of search query: + * Anonymized: Query was redacted by Google to protect the users' privacy. + * Brand: Query contained one or more Mozilla brand keywords. + * Non-Brand: Query didn't contain any Mozilla brand keywords. + * Unknown: Query couldn't be classified. + This will be null for all Discover and Google News search impressions. - name: is_anonymized type: BOOLEAN mode: NULLABLE @@ -55,7 +95,7 @@ fields: mode: NULLABLE description: |- Whether Google Search considers the page to be providing a good page experience. - This will be null prior to August 2023. + This will be null when the source data wasn't exported directly to BigQuery by Google. - name: search_type type: STRING mode: NULLABLE @@ -72,12 +112,18 @@ fields: mode: NULLABLE description: |- How the search result appeared (e.g. normal result, translated result, video). - This will be null prior to August 2023. -- name: country_code + This will be null when the source data wasn't exported directly to BigQuery by Google. +- name: user_country_code type: STRING mode: NULLABLE description: |- - Country from which the search was made, in ISO-3166-1-Alpha-3 format. + Country from which the user was searching, in ISO-3166-1-alpha-3 format. + This will be null for anonymized Discover impressions. +- name: user_country + type: STRING + mode: NULLABLE + description: |- + Country from which the user was searching. This will be null for anonymized Discover impressions. 
- name: device_type type: STRING diff --git a/sql/moz-fx-data-marketing-prod/google_search_console/search_impressions_by_page/view.sql b/sql/moz-fx-data-marketing-prod/google_search_console/search_impressions_by_page/view.sql index 105fcc5fb3..4387498029 100644 --- a/sql/moz-fx-data-marketing-prod/google_search_console/search_impressions_by_page/view.sql +++ b/sql/moz-fx-data-marketing-prod/google_search_console/search_impressions_by_page/view.sql @@ -1,48 +1,131 @@ CREATE OR REPLACE VIEW `moz-fx-data-marketing-prod.google_search_console.search_impressions_by_page` AS +WITH search_impressions_union AS ( + SELECT + `date`, + site_url, + site_domain_name, + page_url, + page_domain_name, + page_path, + localized_site_code, + localized_site_language_code, + localized_site_country_code, + query, + FALSE AS is_anonymized, + CAST(NULL AS BOOLEAN) AS has_good_page_experience, + search_type, + CAST(NULL AS STRING) AS search_appearance, + user_country_code, + device_type, + impressions, + clicks, + average_position + FROM + `moz-fx-data-marketing-prod.google_search_console_derived.search_impressions_by_page_v1` + WHERE + CASE + WHEN site_domain_name IN ( + 'addons.mozilla.org', + 'blog.mozilla.org', + 'getpocket.com', + 'support.mozilla.org', + 'www.mozilla.org' + ) + THEN `date` < '2023-08-01' + WHEN site_domain_name = 'developer.mozilla.org' + THEN `date` < '2024-04-10' + ELSE FALSE + END + UNION ALL + SELECT + `date`, + site_url, + site_domain_name, + page_url, + page_domain_name, + page_path, + localized_site_code, + localized_site_language_code, + localized_site_country_code, + query, + is_anonymized, + has_good_page_experience, + search_type, + search_appearance, + user_country_code, + device_type, + impressions, + clicks, + average_position + FROM + `moz-fx-data-marketing-prod.google_search_console_derived.search_impressions_by_page_v2` + WHERE + CASE + WHEN site_domain_name IN ( + 'addons.mozilla.org', + 'blog.mozilla.org', + 'getpocket.com', + 'support.mozilla.org', 
+ 'www.mozilla.org' + ) + THEN `date` >= '2023-08-01' + ELSE TRUE + END +) SELECT - `date`, - site_url, - site_domain_name, - page_url, - page_domain_name, - page_path, - page_path_segment_1, - query, - FALSE AS is_anonymized, - CAST(NULL AS BOOLEAN) AS has_good_page_experience, - search_type, - CAST(NULL AS STRING) AS search_appearance, - country_code, - device_type, - impressions, - clicks, - average_position + search_impressions.date, + search_impressions.site_url, + search_impressions.site_domain_name, + search_impressions.page_url, + search_impressions.page_domain_name, + search_impressions.page_path, + search_impressions.localized_site_code, + CONCAT( + COALESCE(localized_site_language.name, search_impressions.localized_site_language_code), + COALESCE( + CONCAT( + ' - ', + COALESCE(localized_site_country.name, search_impressions.localized_site_country_code) + ), + '' + ) + ) AS localized_site, + search_impressions.localized_site_language_code, + COALESCE( + localized_site_language.name, + search_impressions.localized_site_language_code + ) AS localized_site_language, + search_impressions.localized_site_country_code, + COALESCE( + localized_site_country.name, + search_impressions.localized_site_country_code + ) AS localized_site_country, + search_impressions.query, + mozfun.google_search_console.classify_site_query( + search_impressions.site_domain_name, + search_impressions.query, + search_impressions.search_type + ) AS query_type, + search_impressions.is_anonymized, + search_impressions.has_good_page_experience, + search_impressions.search_type, + search_impressions.search_appearance, + search_impressions.user_country_code, + COALESCE(user_country.name, search_impressions.user_country_code) AS user_country, + search_impressions.device_type, + search_impressions.impressions, + search_impressions.clicks, + search_impressions.average_position FROM - `moz-fx-data-marketing-prod.google_search_console_derived.search_impressions_by_page_v1` -WHERE - `date` < 
'2023-08-01' -UNION ALL -SELECT - `date`, - site_url, - site_domain_name, - page_url, - page_domain_name, - page_path, - page_path_segment_1, - query, - is_anonymized, - has_good_page_experience, - search_type, - search_appearance, - country_code, - device_type, - impressions, - clicks, - average_position -FROM - `moz-fx-data-marketing-prod.google_search_console_derived.search_impressions_by_page_v2` -WHERE - `date` >= '2023-08-01' + search_impressions_union AS search_impressions +LEFT JOIN + `moz-fx-data-shared-prod.static.language_codes_v1` AS localized_site_language + ON search_impressions.localized_site_language_code = localized_site_language.code_2 +LEFT JOIN + `moz-fx-data-shared-prod.static.country_codes_v1` AS localized_site_country + ON search_impressions.localized_site_country_code = localized_site_country.code +LEFT JOIN + `moz-fx-data-shared-prod.static.country_codes_v1` AS user_country + ON search_impressions.user_country_code = user_country.code_3 diff --git a/sql/moz-fx-data-marketing-prod/google_search_console/search_impressions_by_site/metadata.yaml b/sql/moz-fx-data-marketing-prod/google_search_console/search_impressions_by_site/metadata.yaml index dfeb4320ad..6ce1416d64 100644 --- a/sql/moz-fx-data-marketing-prod/google_search_console/search_impressions_by_site/metadata.yaml +++ b/sql/moz-fx-data-marketing-prod/google_search_console/search_impressions_by_site/metadata.yaml @@ -3,15 +3,20 @@ description: |- Google Search impressions aggregated by site for the following domains: * addons.mozilla.org * blog.mozilla.org + * developer.mozilla.org * getpocket.com * support.mozilla.org * www.mozilla.org - Anonymized search queries are included from August 2023 onward. + For the developer.mozilla.org domain: + * Records from 2024-04-10 onward are from source data exported directly to BigQuery by Google. + * Records before 2024-04-10 are from source data synced to BigQuery by Fivetran. 
- Source data from before August 2023 was synced to BigQuery by Fivetran. - Source data from August 2023 onward was exported directly to BigQuery by Google. - See https://bugzilla.mozilla.org/show_bug.cgi?id=1764960#c34. + For the other domains: + * Records from 2023-08-01 onward are from source data exported directly to BigQuery by Google. + * Records before 2023-08-01 are from source data synced to BigQuery by Fivetran. + + Anonymized search queries are only included if the source data was exported directly to BigQuery by Google. owners: - srose@mozilla.com labels: diff --git a/sql/moz-fx-data-marketing-prod/google_search_console/search_impressions_by_site/schema.yaml b/sql/moz-fx-data-marketing-prod/google_search_console/search_impressions_by_site/schema.yaml index 87cf36cf48..6a813d2808 100644 --- a/sql/moz-fx-data-marketing-prod/google_search_console/search_impressions_by_site/schema.yaml +++ b/sql/moz-fx-data-marketing-prod/google_search_console/search_impressions_by_site/schema.yaml @@ -17,6 +17,15 @@ fields: type: STRING mode: NULLABLE description: The search query. +- name: query_type + type: STRING + mode: NULLABLE + description: |- + Type of search query: + * Anonymized: Query was redacted by Google to protect the users' privacy. + * Brand: Query contained one or more Mozilla brand keywords. + * Non-Brand: Query didn't contain any Mozilla brand keywords. + * Unknown: Query couldn't be classified. - name: is_anonymized type: BOOLEAN mode: NULLABLE @@ -32,10 +41,14 @@ fields: * Image: In Google Search's "Images" tab. * Video: In Google Search's "Videos" tab. * News: In Google Search's "News" tab. -- name: country_code +- name: user_country_code type: STRING mode: NULLABLE - description: Country from which the search was made, in ISO-3166-1-Alpha-3 format. + description: Country from which the user was searching, in ISO-3166-1-alpha-3 format. +- name: user_country + type: STRING + mode: NULLABLE + description: Country from which the user was searching. 
- name: device_type type: STRING mode: NULLABLE diff --git a/sql/moz-fx-data-marketing-prod/google_search_console/search_impressions_by_site/view.sql b/sql/moz-fx-data-marketing-prod/google_search_console/search_impressions_by_site/view.sql index f4fee8277f..f0ac1d2a2f 100644 --- a/sql/moz-fx-data-marketing-prod/google_search_console/search_impressions_by_site/view.sql +++ b/sql/moz-fx-data-marketing-prod/google_search_console/search_impressions_by_site/view.sql @@ -1,36 +1,83 @@ CREATE OR REPLACE VIEW `moz-fx-data-marketing-prod.google_search_console.search_impressions_by_site` AS +WITH search_impressions_union AS ( + SELECT + `date`, + site_url, + site_domain_name, + query, + FALSE AS is_anonymized, + search_type, + user_country_code, + device_type, + impressions, + clicks, + average_top_position + FROM + `moz-fx-data-marketing-prod.google_search_console_derived.search_impressions_by_site_v1` + WHERE + CASE + WHEN site_domain_name IN ( + 'addons.mozilla.org', + 'blog.mozilla.org', + 'getpocket.com', + 'support.mozilla.org', + 'www.mozilla.org' + ) + THEN `date` < '2023-08-01' + WHEN site_domain_name = 'developer.mozilla.org' + THEN `date` < '2024-04-10' + ELSE FALSE + END + UNION ALL + SELECT + `date`, + site_url, + site_domain_name, + query, + is_anonymized, + search_type, + user_country_code, + device_type, + impressions, + clicks, + average_top_position + FROM + `moz-fx-data-marketing-prod.google_search_console_derived.search_impressions_by_site_v2` + WHERE + CASE + WHEN site_domain_name IN ( + 'addons.mozilla.org', + 'blog.mozilla.org', + 'getpocket.com', + 'support.mozilla.org', + 'www.mozilla.org' + ) + THEN `date` >= '2023-08-01' + ELSE TRUE + END +) SELECT - `date`, - site_url, - site_domain_name, - query, - FALSE AS is_anonymized, - search_type, - country_code, - device_type, - impressions, - clicks, - average_top_position + search_impressions.`date`, + search_impressions.site_url, + search_impressions.site_domain_name, + search_impressions.query, + 
mozfun.google_search_console.classify_site_query( + search_impressions.site_domain_name, + search_impressions.query, + search_impressions.search_type + ) AS query_type, + search_impressions.is_anonymized, + search_impressions.search_type, + search_impressions.user_country_code, + COALESCE(user_country.name, search_impressions.user_country_code) AS user_country, + search_impressions.device_type, + search_impressions.impressions, + search_impressions.clicks, + search_impressions.average_top_position FROM - `moz-fx-data-marketing-prod.google_search_console_derived.search_impressions_by_site_v1` -WHERE - `date` < '2023-08-01' -UNION ALL -SELECT - `date`, - site_url, - site_domain_name, - query, - is_anonymized, - search_type, - country_code, - device_type, - impressions, - clicks, - average_top_position -FROM - `moz-fx-data-marketing-prod.google_search_console_derived.search_impressions_by_site_v2` -WHERE - `date` >= '2023-08-01' + search_impressions_union AS search_impressions +LEFT JOIN + `moz-fx-data-shared-prod.static.country_codes_v1` AS user_country + ON search_impressions.user_country_code = user_country.code_3 diff --git a/sql/moz-fx-data-marketing-prod/google_search_console_derived/search_impressions_by_page_v1/metadata.yaml b/sql/moz-fx-data-marketing-prod/google_search_console_derived/search_impressions_by_page_v1/metadata.yaml index 03637637d6..21dbb3f1d4 100644 --- a/sql/moz-fx-data-marketing-prod/google_search_console_derived/search_impressions_by_page_v1/metadata.yaml +++ b/sql/moz-fx-data-marketing-prod/google_search_console_derived/search_impressions_by_page_v1/metadata.yaml @@ -3,23 +3,28 @@ description: |- Google Search impressions aggregated by page, synced by Fivetran to BigQuery for the following domains: * addons.mozilla.org * blog.mozilla.org + * developer.mozilla.org * getpocket.com * support.mozilla.org * www.mozilla.org Anonymized search queries aren't included. - We stopped syncing Google Search Console data with Fivetran in August 2023. 
+ For the developer.mozilla.org domain, we stopped syncing Google Search Console data with Fivetran in May 2024. + See https://bugzilla.mozilla.org/show_bug.cgi?id=1890816. + + For the other domains, we stopped syncing Google Search Console data with Fivetran in August 2023. See https://bugzilla.mozilla.org/show_bug.cgi?id=1764960#c44. owners: - srose@mozilla.com labels: incremental: true owner1: srose -# Not scheduled because we stopped syncing Google Search Console data with Fivetran in August 2023. -# See https://bugzilla.mozilla.org/show_bug.cgi?id=1764960#c44. -#scheduling: -# date_partition_parameter: date +scheduling: + # Not scheduled because we stopped syncing Google Search Console data with Fivetran. + # See https://bugzilla.mozilla.org/show_bug.cgi?id=1764960#c44 and https://bugzilla.mozilla.org/show_bug.cgi?id=1890816. + #dag_name: bqetl_google_search_console + date_partition_parameter: date bigquery: time_partitioning: type: day diff --git a/sql/moz-fx-data-marketing-prod/google_search_console_derived/search_impressions_by_page_v1/query.sql b/sql/moz-fx-data-marketing-prod/google_search_console_derived/search_impressions_by_page_v1/query.sql index 5223eff169..a83111292b 100644 --- a/sql/moz-fx-data-marketing-prod/google_search_console_derived/search_impressions_by_page_v1/query.sql +++ b/sql/moz-fx-data-marketing-prod/google_search_console_derived/search_impressions_by_page_v1/query.sql @@ -1,12 +1,13 @@ -{% set fivetran_gsc_dataset_ids = [ - 'moz-fx-data-bq-fivetran.google_search_console_addons', - 'moz-fx-data-bq-fivetran.google_search_console_blog', - 'moz-fx-data-bq-fivetran.google_search_console_pocket', - 'moz-fx-data-bq-fivetran.google_search_console_support', - 'moz-fx-data-bq-fivetran.google_search_console_www', +{% set fivetran_gsc_datasets = [ + {'id': 'moz-fx-data-bq-fivetran.google_search_console_addons', 'query_column': 'keyword'}, + {'id': 'moz-fx-data-bq-fivetran.google_search_console_blog', 'query_column': 'keyword'}, + {'id': 
'moz-fx-data-bq-fivetran.google_search_console_mdn', 'query_column': 'query'}, + {'id': 'moz-fx-data-bq-fivetran.google_search_console_pocket', 'query_column': 'keyword'}, + {'id': 'moz-fx-data-bq-fivetran.google_search_console_support', 'query_column': 'keyword'}, + {'id': 'moz-fx-data-bq-fivetran.google_search_console_www', 'query_column': 'keyword'}, ] %} WITH keyword_page_report_union AS ( - {% for fivetran_gsc_dataset_id in fivetran_gsc_dataset_ids %} + {% for fivetran_gsc_dataset in fivetran_gsc_datasets %} {% if not loop.first %} UNION ALL {% endif %} @@ -14,7 +15,7 @@ WITH keyword_page_report_union AS ( `date`, site, page, - keyword, + `{{ fivetran_gsc_dataset['query_column'] }}` AS query, search_type, country, device, @@ -22,20 +23,22 @@ WITH keyword_page_report_union AS ( clicks, position FROM - `{{ fivetran_gsc_dataset_id }}.keyword_page_report` + `{{ fivetran_gsc_dataset['id'] }}.keyword_page_report` {% endfor %} ) SELECT `date`, site AS site_url, - REGEXP_EXTRACT(site, r'^(?:https?://|sc-domain:)([^/]+)') AS site_domain_name, + mozfun.google_search_console.extract_url_domain_name(site) AS site_domain_name, page AS page_url, - REGEXP_EXTRACT(page, r'^https?://([^/]+)') AS page_domain_name, - REGEXP_EXTRACT(page, r'^https?://(?:[^/]+)([^\?#]*)') AS page_path, - REGEXP_EXTRACT(page, r'^https?://(?:[^/]+)/*([^/\?#]*)') AS page_path_segment_1, - keyword AS query, + mozfun.google_search_console.extract_url_domain_name(page) AS page_domain_name, + mozfun.google_search_console.extract_url_path(page) AS page_path, + mozfun.google_search_console.extract_url_locale(page) AS localized_site_code, + mozfun.google_search_console.extract_url_language_code(page) AS localized_site_language_code, + mozfun.google_search_console.extract_url_country_code(page) AS localized_site_country_code, + query, INITCAP(search_type) AS search_type, - UPPER(country) AS country_code, + UPPER(country) AS user_country_code, INITCAP(device) AS device_type, CAST(impressions AS INTEGER) AS 
impressions, CAST(clicks AS INTEGER) AS clicks, diff --git a/sql/moz-fx-data-marketing-prod/google_search_console_derived/search_impressions_by_page_v1/schema.yaml b/sql/moz-fx-data-marketing-prod/google_search_console_derived/search_impressions_by_page_v1/schema.yaml index 3eac422c22..ee7cba2e66 100644 --- a/sql/moz-fx-data-marketing-prod/google_search_console_derived/search_impressions_by_page_v1/schema.yaml +++ b/sql/moz-fx-data-marketing-prod/google_search_console_derived/search_impressions_by_page_v1/schema.yaml @@ -25,10 +25,18 @@ fields: type: STRING mode: NULLABLE description: The path part of the page URL. -- name: page_path_segment_1 +- name: localized_site_code type: STRING mode: NULLABLE - description: The first segment of the page URL path, which is often a locale like `en-US` or `de`. + description: Localized site code such as `en-US` or `de` found in the first segment of the page URL path (if any). +- name: localized_site_language_code + type: STRING + mode: NULLABLE + description: Localized site language code in ISO-639-alpha-2 format found in the first segment of the page URL path (if any). +- name: localized_site_country_code + type: STRING + mode: NULLABLE + description: Localized site country code in ISO-3166-1-alpha-2 format found in the first segment of the page URL path (if any). - name: query type: STRING mode: NULLABLE @@ -42,10 +50,10 @@ fields: * Image: In Google Search's "Images" tab. * Video: In Google Search's "Videos" tab. * News: In Google Search's "News" tab. -- name: country_code +- name: user_country_code type: STRING mode: NULLABLE - description: Country from which the search was made, in ISO-3166-1-Alpha-3 format. + description: Country from which the user was searching, in ISO-3166-1-alpha-3 format. 
- name: device_type type: STRING mode: NULLABLE diff --git a/sql/moz-fx-data-marketing-prod/google_search_console_derived/search_impressions_by_page_v2/metadata.yaml b/sql/moz-fx-data-marketing-prod/google_search_console_derived/search_impressions_by_page_v2/metadata.yaml index 6918e20a8c..e004cf1292 100644 --- a/sql/moz-fx-data-marketing-prod/google_search_console_derived/search_impressions_by_page_v2/metadata.yaml +++ b/sql/moz-fx-data-marketing-prod/google_search_console_derived/search_impressions_by_page_v2/metadata.yaml @@ -3,13 +3,17 @@ description: |- Google Search impressions aggregated by page, exported directly to BigQuery for the following domains: * addons.mozilla.org * blog.mozilla.org + * developer.mozilla.org * getpocket.com * support.mozilla.org * www.mozilla.org Anonymized search queries are included. - We started exporting Google Search Console data directly to BigQuery in July 2023. + For the developer.mozilla.org domain, we started exporting Google Search Console data directly to BigQuery in April 2024. + See https://bugzilla.mozilla.org/show_bug.cgi?id=1890816. + + For the other domains, we started exporting Google Search Console data directly to BigQuery in July 2023. See https://bugzilla.mozilla.org/show_bug.cgi?id=1764960#c34. 
owners: - srose@mozilla.com @@ -31,6 +35,9 @@ scheduling: - task_id: wait_for_google_search_console_getpocket_url_impressions table_id: moz-fx-data-marketing-prod.searchconsole_getpocket.searchdata_url_impression partition_id: '{{ data_interval_start.subtract(days=1) | ds_nodash }}' + - task_id: wait_for_google_search_console_mdn_url_impressions + table_id: moz-fx-data-marketing-prod.searchconsole_mdn.searchdata_url_impression + partition_id: '{{ data_interval_start.subtract(days=1) | ds_nodash }}' - task_id: wait_for_google_search_console_support_url_impressions table_id: moz-fx-data-marketing-prod.searchconsole_support.searchdata_url_impression partition_id: '{{ data_interval_start.subtract(days=1) | ds_nodash }}' diff --git a/sql/moz-fx-data-marketing-prod/google_search_console_derived/search_impressions_by_page_v2/query.sql b/sql/moz-fx-data-marketing-prod/google_search_console_derived/search_impressions_by_page_v2/query.sql index 8301e3de3f..b0bfb7e5c1 100644 --- a/sql/moz-fx-data-marketing-prod/google_search_console_derived/search_impressions_by_page_v2/query.sql +++ b/sql/moz-fx-data-marketing-prod/google_search_console_derived/search_impressions_by_page_v2/query.sql @@ -2,6 +2,7 @@ 'moz-fx-data-marketing-prod.searchconsole_addons', 'moz-fx-data-marketing-prod.searchconsole_blog', 'moz-fx-data-marketing-prod.searchconsole_getpocket', + 'moz-fx-data-marketing-prod.searchconsole_mdn', 'moz-fx-data-marketing-prod.searchconsole_support', 'moz-fx-data-marketing-prod.searchconsole_www', ] %} @@ -61,11 +62,13 @@ WITH searchdata_url_impression_union AS ( SELECT data_date AS `date`, site_url, - REGEXP_EXTRACT(site_url, r'^(?:https?://|sc-domain:)([^/]+)') AS site_domain_name, + mozfun.google_search_console.extract_url_domain_name(site_url) AS site_domain_name, url AS page_url, - REGEXP_EXTRACT(url, r'^https?://([^/]+)') AS page_domain_name, - REGEXP_EXTRACT(url, r'^https?://(?:[^/]+)([^\?#]*)') AS page_path, - REGEXP_EXTRACT(url, r'^https?://(?:[^/]+)/*([^/\?#]*)') 
AS page_path_segment_1, + mozfun.google_search_console.extract_url_domain_name(url) AS page_domain_name, + mozfun.google_search_console.extract_url_path(url) AS page_path, + mozfun.google_search_console.extract_url_locale(url) AS localized_site_code, + mozfun.google_search_console.extract_url_language_code(url) AS localized_site_language_code, + mozfun.google_search_console.extract_url_country_code(url) AS localized_site_country_code, query, (is_anonymized_query OR is_anonymized_discover) AS is_anonymized, is_page_experience AS has_good_page_experience, @@ -77,7 +80,7 @@ SELECT {% endfor %} ELSE 'Normal result' END AS search_appearance, - UPPER(country) AS country_code, + UPPER(country) AS user_country_code, INITCAP(device) AS device_type, impressions, clicks, diff --git a/sql/moz-fx-data-marketing-prod/google_search_console_derived/search_impressions_by_page_v2/schema.yaml b/sql/moz-fx-data-marketing-prod/google_search_console_derived/search_impressions_by_page_v2/schema.yaml index f6e9fe9857..ead6165663 100644 --- a/sql/moz-fx-data-marketing-prod/google_search_console_derived/search_impressions_by_page_v2/schema.yaml +++ b/sql/moz-fx-data-marketing-prod/google_search_console_derived/search_impressions_by_page_v2/schema.yaml @@ -31,11 +31,23 @@ fields: description: |- The path part of the page URL. This will be null for anonymized Discover impressions. -- name: page_path_segment_1 +- name: localized_site_code type: STRING mode: NULLABLE description: |- - The first segment of the page URL path, which is often a locale like `en-US` or `de`. + Localized site code such as `en-US` or `de` found in the first segment of the page URL path (if any). + This will be null for anonymized Discover impressions. +- name: localized_site_language_code + type: STRING + mode: NULLABLE + description: |- + Localized site language code in ISO-639-alpha-2 format found in the first segment of the page URL path (if any). + This will be null for anonymized Discover impressions. 
+- name: localized_site_country_code + type: STRING + mode: NULLABLE + description: |- + Localized site country code in ISO-3166-1-alpha-2 format found in the first segment of the page URL path (if any). This will be null for anonymized Discover impressions. - name: query type: STRING @@ -69,11 +81,11 @@ fields: type: STRING mode: NULLABLE description: How the search result appeared (e.g. normal result, translated result, video). -- name: country_code +- name: user_country_code type: STRING mode: NULLABLE description: |- - Country from which the search was made, in ISO-3166-1-Alpha-3 format. + Country from which the user was searching, in ISO-3166-1-alpha-3 format. This will be null for anonymized Discover impressions. - name: device_type type: STRING diff --git a/sql/moz-fx-data-marketing-prod/google_search_console_derived/search_impressions_by_site_v1/metadata.yaml b/sql/moz-fx-data-marketing-prod/google_search_console_derived/search_impressions_by_site_v1/metadata.yaml index 64297ac0cd..b3f53abbbf 100644 --- a/sql/moz-fx-data-marketing-prod/google_search_console_derived/search_impressions_by_site_v1/metadata.yaml +++ b/sql/moz-fx-data-marketing-prod/google_search_console_derived/search_impressions_by_site_v1/metadata.yaml @@ -3,23 +3,28 @@ description: |- Google Search impressions aggregated by site, synced by Fivetran to BigQuery for the following domains: * addons.mozilla.org * blog.mozilla.org + * developer.mozilla.org * getpocket.com * support.mozilla.org * www.mozilla.org Anonymized search queries aren't included. - We stopped syncing Google Search Console data with Fivetran in August 2023. + For the developer.mozilla.org domain, we stopped syncing Google Search Console data with Fivetran in May 2024. + See https://bugzilla.mozilla.org/show_bug.cgi?id=1890816. + + For the other domains, we stopped syncing Google Search Console data with Fivetran in August 2023. See https://bugzilla.mozilla.org/show_bug.cgi?id=1764960#c44. 
owners: - srose@mozilla.com labels: incremental: true owner1: srose -# Not scheduled because we stopped syncing Google Search Console data with Fivetran in August 2023. -# See https://bugzilla.mozilla.org/show_bug.cgi?id=1764960#c44. -#scheduling: -# date_partition_parameter: date +scheduling: + # Not scheduled because we stopped syncing Google Search Console data with Fivetran. + # See https://bugzilla.mozilla.org/show_bug.cgi?id=1764960#c44 and https://bugzilla.mozilla.org/show_bug.cgi?id=1890816. + #dag_name: bqetl_google_search_console + date_partition_parameter: date bigquery: time_partitioning: type: day diff --git a/sql/moz-fx-data-marketing-prod/google_search_console_derived/search_impressions_by_site_v1/query.sql b/sql/moz-fx-data-marketing-prod/google_search_console_derived/search_impressions_by_site_v1/query.sql index 0733e70ab5..087d956589 100644 --- a/sql/moz-fx-data-marketing-prod/google_search_console_derived/search_impressions_by_site_v1/query.sql +++ b/sql/moz-fx-data-marketing-prod/google_search_console_derived/search_impressions_by_site_v1/query.sql @@ -1,19 +1,20 @@ -{% set fivetran_gsc_dataset_ids = [ - 'moz-fx-data-bq-fivetran.google_search_console_addons', - 'moz-fx-data-bq-fivetran.google_search_console_blog', - 'moz-fx-data-bq-fivetran.google_search_console_pocket', - 'moz-fx-data-bq-fivetran.google_search_console_support', - 'moz-fx-data-bq-fivetran.google_search_console_www', +{% set fivetran_gsc_datasets = [ + {'id': 'moz-fx-data-bq-fivetran.google_search_console_addons', 'query_column': 'keyword'}, + {'id': 'moz-fx-data-bq-fivetran.google_search_console_blog', 'query_column': 'keyword'}, + {'id': 'moz-fx-data-bq-fivetran.google_search_console_mdn', 'query_column': 'query'}, + {'id': 'moz-fx-data-bq-fivetran.google_search_console_pocket', 'query_column': 'keyword'}, + {'id': 'moz-fx-data-bq-fivetran.google_search_console_support', 'query_column': 'keyword'}, + {'id': 'moz-fx-data-bq-fivetran.google_search_console_www', 'query_column': 
'keyword'}, ] %} WITH keyword_site_report_by_site_union AS ( - {% for fivetran_gsc_dataset_id in fivetran_gsc_dataset_ids %} + {% for fivetran_gsc_dataset in fivetran_gsc_datasets %} {% if not loop.first %} UNION ALL {% endif %} SELECT `date`, site, - keyword, + `{{ fivetran_gsc_dataset['query_column'] }}` AS query, search_type, country, device, @@ -21,16 +22,16 @@ WITH keyword_site_report_by_site_union AS ( clicks, position FROM - `{{ fivetran_gsc_dataset_id }}.keyword_site_report_by_site` + `{{ fivetran_gsc_dataset['id'] }}.keyword_site_report_by_site` {% endfor %} ) SELECT `date`, site AS site_url, - REGEXP_EXTRACT(site, r'^(?:https?://|sc-domain:)([^/]+)') AS site_domain_name, - keyword AS query, + mozfun.google_search_console.extract_url_domain_name(site) AS site_domain_name, + query, INITCAP(search_type) AS search_type, - UPPER(country) AS country_code, + UPPER(country) AS user_country_code, INITCAP(device) AS device_type, CAST(impressions AS INTEGER) AS impressions, CAST(clicks AS INTEGER) AS clicks, diff --git a/sql/moz-fx-data-marketing-prod/google_search_console_derived/search_impressions_by_site_v1/schema.yaml b/sql/moz-fx-data-marketing-prod/google_search_console_derived/search_impressions_by_site_v1/schema.yaml index 1a436a6f05..37141a6e70 100644 --- a/sql/moz-fx-data-marketing-prod/google_search_console_derived/search_impressions_by_site_v1/schema.yaml +++ b/sql/moz-fx-data-marketing-prod/google_search_console_derived/search_impressions_by_site_v1/schema.yaml @@ -26,10 +26,10 @@ fields: * Image: In Google Search's "Images" tab. * Video: In Google Search's "Videos" tab. * News: In Google Search's "News" tab. -- name: country_code +- name: user_country_code type: STRING mode: NULLABLE - description: Country from which the search was made, in ISO-3166-1-Alpha-3 format. + description: Country from which the user was searching, in ISO-3166-1-alpha-3 format. 
- name: device_type type: STRING mode: NULLABLE diff --git a/sql/moz-fx-data-marketing-prod/google_search_console_derived/search_impressions_by_site_v2/metadata.yaml b/sql/moz-fx-data-marketing-prod/google_search_console_derived/search_impressions_by_site_v2/metadata.yaml index 1e7c061543..4e0e2cdaa8 100644 --- a/sql/moz-fx-data-marketing-prod/google_search_console_derived/search_impressions_by_site_v2/metadata.yaml +++ b/sql/moz-fx-data-marketing-prod/google_search_console_derived/search_impressions_by_site_v2/metadata.yaml @@ -3,13 +3,17 @@ description: |- Google Search impressions aggregated by site, exported directly to BigQuery for the following domains: * addons.mozilla.org * blog.mozilla.org + * developer.mozilla.org * getpocket.com * support.mozilla.org * www.mozilla.org Anonymized search queries are included. - We started exporting Google Search Console data directly to BigQuery in July 2023. + For the developer.mozilla.org domain, we started exporting Google Search Console data directly to BigQuery in April 2024. + See https://bugzilla.mozilla.org/show_bug.cgi?id=1890816. + + For the other domains, we started exporting Google Search Console data directly to BigQuery in July 2023. See https://bugzilla.mozilla.org/show_bug.cgi?id=1764960#c34. 
owners: - srose@mozilla.com @@ -31,6 +35,9 @@ scheduling: - task_id: wait_for_google_search_console_getpocket_site_impressions table_id: moz-fx-data-marketing-prod.searchconsole_getpocket.searchdata_site_impression partition_id: '{{ data_interval_start.subtract(days=1) | ds_nodash }}' + - task_id: wait_for_google_search_console_mdn_site_impressions + table_id: moz-fx-data-marketing-prod.searchconsole_mdn.searchdata_site_impression + partition_id: '{{ data_interval_start.subtract(days=1) | ds_nodash }}' - task_id: wait_for_google_search_console_support_site_impressions table_id: moz-fx-data-marketing-prod.searchconsole_support.searchdata_site_impression partition_id: '{{ data_interval_start.subtract(days=1) | ds_nodash }}' diff --git a/sql/moz-fx-data-marketing-prod/google_search_console_derived/search_impressions_by_site_v2/query.sql b/sql/moz-fx-data-marketing-prod/google_search_console_derived/search_impressions_by_site_v2/query.sql index d6e5f02c8d..fd9a79ba70 100644 --- a/sql/moz-fx-data-marketing-prod/google_search_console_derived/search_impressions_by_site_v2/query.sql +++ b/sql/moz-fx-data-marketing-prod/google_search_console_derived/search_impressions_by_site_v2/query.sql @@ -2,6 +2,7 @@ 'moz-fx-data-marketing-prod.searchconsole_addons', 'moz-fx-data-marketing-prod.searchconsole_blog', 'moz-fx-data-marketing-prod.searchconsole_getpocket', + 'moz-fx-data-marketing-prod.searchconsole_mdn', 'moz-fx-data-marketing-prod.searchconsole_support', 'moz-fx-data-marketing-prod.searchconsole_www', ] %} @@ -28,11 +29,11 @@ WITH searchdata_site_impression_union AS ( SELECT data_date AS `date`, site_url, - REGEXP_EXTRACT(site_url, r'^(?:https?://|sc-domain:)([^/]+)') AS site_domain_name, + mozfun.google_search_console.extract_url_domain_name(site_url) AS site_domain_name, query, is_anonymized_query AS is_anonymized, INITCAP(REPLACE(search_type, '_', ' ')) AS search_type, - UPPER(country) AS country_code, + UPPER(country) AS user_country_code, INITCAP(device) AS 
device_type, impressions, clicks, diff --git a/sql/moz-fx-data-marketing-prod/google_search_console_derived/search_impressions_by_site_v2/schema.yaml b/sql/moz-fx-data-marketing-prod/google_search_console_derived/search_impressions_by_site_v2/schema.yaml index 87cf36cf48..ea8256301d 100644 --- a/sql/moz-fx-data-marketing-prod/google_search_console_derived/search_impressions_by_site_v2/schema.yaml +++ b/sql/moz-fx-data-marketing-prod/google_search_console_derived/search_impressions_by_site_v2/schema.yaml @@ -32,10 +32,10 @@ fields: * Image: In Google Search's "Images" tab. * Video: In Google Search's "Videos" tab. * News: In Google Search's "News" tab. -- name: country_code +- name: user_country_code type: STRING mode: NULLABLE - description: Country from which the search was made, in ISO-3166-1-Alpha-3 format. + description: Country from which the user was searching, in ISO-3166-1-alpha-3 format. - name: device_type type: STRING mode: NULLABLE diff --git a/sql/mozfun/google_search_console/README.md b/sql/mozfun/google_search_console/README.md new file mode 100644 index 0000000000..713e7f0d94 --- /dev/null +++ b/sql/mozfun/google_search_console/README.md @@ -0,0 +1,3 @@ +# google_search_console + +Functions for use with Google Search Console data. diff --git a/sql/mozfun/google_search_console/classify_site_query/README.md b/sql/mozfun/google_search_console/classify_site_query/README.md new file mode 100644 index 0000000000..e69de29bb2 diff --git a/sql/mozfun/google_search_console/classify_site_query/metadata.yaml b/sql/mozfun/google_search_console/classify_site_query/metadata.yaml new file mode 100644 index 0000000000..a02b010d56 --- /dev/null +++ b/sql/mozfun/google_search_console/classify_site_query/metadata.yaml @@ -0,0 +1,3 @@ +friendly_name: Classify Site Query +description: >- + Classify a Google search query for a site as "Anonymized", "Brand", "Non-Brand", or "Unknown". 
diff --git a/sql/mozfun/google_search_console/classify_site_query/udf.sql b/sql/mozfun/google_search_console/classify_site_query/udf.sql
new file mode 100644
index 0000000000..5f0b0662de
--- /dev/null
+++ b/sql/mozfun/google_search_console/classify_site_query/udf.sql
@@ -0,0 +1,137 @@
+CREATE OR REPLACE FUNCTION google_search_console.classify_site_query(
+  site_domain_name STRING, -- Domain the impression is for; only 'www.mozilla.org' has brand patterns.
+  query STRING, -- Search query text; NULL is classified as 'Anonymized'.
+  search_type STRING -- 'Discover' and 'Google News' always classify as NULL.
+)
+RETURNS STRING AS ( -- One of 'Anonymized', 'Brand', 'Non-Brand', 'Unknown', or NULL.
+  CASE
+    -- Discover and Google News search impressions never have `query` values.
+    WHEN search_type IN ('Discover', 'Google News')
+      THEN NULL
+    WHEN query IS NULL -- Checked after the search types above, so those stay NULL.
+      THEN 'Anonymized'
+    WHEN site_domain_name = 'www.mozilla.org' -- Other domains fall through to 'Unknown'.
+      THEN IF(
+          REGEXP_CONTAINS(
+            query,
+            ARRAY_TO_STRING( -- Joins the patterns with `|` into one alternation regex.
+              [
+                r'\bff\b',
+                r'\bm.z',
+                r'f.ref.x',
+                r'fier',
+                r'fire',
+                r'firf',
+                r'focus',
+                r'fokku',
+                r'fox',
+                r'il+a\b',
+                r'nightly',
+                r'quantum',
+                r'μοζ+ιλ+α', -- moz+il+a (Greek)
+                r'μοτζιλα', -- motzila (Greek)
+                r'лиса', -- fox (Cyrillic)
+                r'мази', -- mazi (Cyrillic)
+                r'мазі', -- mazi (Cyrillic)
+                r'моз', -- moz (Cyrillic)
+                r'муз', -- muz (Cyrillic)
+                r'фаер', -- fire (Cyrillic)
+                r'фаир', -- fair (Cyrillic)
+                r'файер', -- fire (Cyrillic)
+                r'файр', -- fire (Cyrillic)
+                r'фире', -- fire (Cyrillic)
+                r'фокс', -- fox (Cyrillic)
+                r'фох', -- fox (Cyrillic)
+                r'כןרקכםס', -- "firefox" typed on a Hebrew keyboard layout (Hebrew)
+                r'מוזילה', -- mozilla (Hebrew)
+                r'פיירפוקס', -- firefox (Hebrew)
+                r'فاکس', -- fax (Arabic)
+                r'فاير', -- fire (Arabic)
+                r'فایر', -- fire (Arabic)
+                r'فكس', -- fx (Arabic)
+                r'فوكس', -- fox (Arabic)
+                r'فير', -- fir (Arabic)
+                r'موزلا', -- mozilla (Arabic)
+                r'موزيلا', -- mozilla (Arabic)
+                r'موزیلا', -- mozilla (Arabic)
+                r'फायरफक्स', -- firefox (Indic)
+                r'फायरफॉक्स', -- firefox (Indic)
+                r'फ़ायरफ़ॉक्स', -- firefox (Indic)
+                r'मोजिला', -- mozilla (Indic)
+                r'मोज़िला', -- mozilla (Indic)
+                r'ফায়ারফক্স', -- firefox (Indic)
+                r'মজিলা', -- mozilla (Indic)
+                r'মোজিলা', -- mozilla (Indic)
+                r'ฟายฟอก', -- bleach (Thai)
+                r'ฟายฟ๊อก', -- firefox (Thai)
+                r'ไฟ ฟอก', -- fire bleach (Thai)
+                r'ไฟฟ็อก', -- fire fox (Thai)
+                r'ไฟฟ๊อก', -- fire fox (Thai)
+                r'ไฟฟอก', -- purifying light (Thai)
+                r'ไฟร์ฟอกซ์', -- firefox (Thai)
+                r'ไฟล์ฟอก', -- bleaching file (Thai)
+                r'ไฟลฟอก', -- fire bleach (Thai)
+                r'모질라', -- mozilla (Korean)
+                r'파이어', -- fire (Korean)
+                r'폭스', -- fox (Korean)
+                r'화이어', -- fire (Korean)
+                r'ふぁいあ', -- faia (Japanese)
+                r'ファイア', -- fire (Japanese)
+                r'ファイや', -- faiya (Japanese)
+                r'ファイヤ', -- fire (Japanese)
+                r'ふぁいやー', -- fire (Japanese)
+                r'ふぃれふぉ', -- firefox (Japanese)
+                r'ふぉっくす', -- fox (Japanese)
+                r'フォックス', -- fox (Japanese)
+                r'モジラ', -- mozilla (Japanese)
+                r'火孤', -- firefox (Chinese)
+                r'火狐', -- firefox (Chinese)
+                r'狐狸' -- fox (Chinese)
+              ],
+              '|'
+            )
+          ),
+          'Brand', -- The query matched at least one brand pattern.
+          'Non-Brand'
+        )
+    ELSE 'Unknown'
+  END
+);
+
+SELECT
+  assert.equals(
+    google_search_console.classify_site_query('www.mozilla.org', 'mozilla', 'Discover'),
+    CAST(NULL AS STRING)
+  ),
+  assert.equals(
+    google_search_console.classify_site_query('www.mozilla.org', 'mozilla', 'Google News'),
+    CAST(NULL AS STRING)
+  ),
+  assert.equals(
+    google_search_console.classify_site_query('www.mozilla.org', NULL, 'Discover'),
+    CAST(NULL AS STRING)
+  ),
+  assert.equals(
+    google_search_console.classify_site_query('www.mozilla.org', NULL, 'Google News'),
+    CAST(NULL AS STRING)
+  ),
+  assert.equals(
+    google_search_console.classify_site_query('www.mozilla.org', NULL, 'Web'),
+    'Anonymized'
+  ),
+  assert.equals(
+    google_search_console.classify_site_query('www.mozilla.org', 'mozilla', 'Web'),
+    'Brand'
+  ),
+  assert.equals(
+    google_search_console.classify_site_query('www.mozilla.org', 'firefox', 'Web'),
+    'Brand'
+  ),
+  assert.equals(
+    google_search_console.classify_site_query('www.mozilla.org', 'browser', 'Web'),
+    'Non-Brand'
+  ),
+  assert.equals(
+    google_search_console.classify_site_query('addons.mozilla.org', 'mozilla', 'Web'),
+    'Unknown'
+  ),
diff --git a/sql/mozfun/google_search_console/extract_url_country_code/README.md
b/sql/mozfun/google_search_console/extract_url_country_code/README.md
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/sql/mozfun/google_search_console/extract_url_country_code/metadata.yaml b/sql/mozfun/google_search_console/extract_url_country_code/metadata.yaml
new file mode 100644
index 0000000000..b00afc2b2f
--- /dev/null
+++ b/sql/mozfun/google_search_console/extract_url_country_code/metadata.yaml
@@ -0,0 +1,3 @@
+friendly_name: Extract URL Country Code
+description: >-
+  Extract the country code from a URL if it's present.
diff --git a/sql/mozfun/google_search_console/extract_url_country_code/udf.sql b/sql/mozfun/google_search_console/extract_url_country_code/udf.sql
new file mode 100644
index 0000000000..5327fbca86
--- /dev/null
+++ b/sql/mozfun/google_search_console/extract_url_country_code/udf.sql
@@ -0,0 +1,30 @@
+-- Extract the country code from the locale found by `extract_url_locale`.
+-- The code is upper-cased (e.g. `en-us` gives `US`). NULL is returned when the URL
+-- has no locale, or when the locale has no country part (e.g. `es`).
+CREATE OR REPLACE FUNCTION google_search_console.extract_url_country_code(url STRING)
+RETURNS STRING AS (
+  -- Zero-based offset 1 is the part after the hyphen in a locale such as `en-US`.
+  UPPER(SPLIT(google_search_console.extract_url_locale(url), '-')[SAFE_OFFSET(1)])
+);
+
+SELECT
+  assert.equals(
+    google_search_console.extract_url_country_code('https://www.mozilla.org/en-US/firefox/'),
+    'US'
+  ),
+  assert.equals(
+    google_search_console.extract_url_country_code('https://www.mozilla.org/en-us/firefox/'),
+    'US'
+  ),
+  assert.equals(
+    google_search_console.extract_url_country_code('https://support.mozilla.org/es/'),
+    CAST(NULL AS STRING)
+  ),
+  assert.equals(
+    google_search_console.extract_url_country_code('https://blog.mozilla.org/ux/'),
+    CAST(NULL AS STRING)
+  ),
+  assert.equals(
+    google_search_console.extract_url_country_code('https://www.mozilla.org/'),
+    CAST(NULL AS STRING)
+  ),
diff --git a/sql/mozfun/google_search_console/extract_url_domain_name/README.md b/sql/mozfun/google_search_console/extract_url_domain_name/README.md
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/sql/mozfun/google_search_console/extract_url_domain_name/metadata.yaml
b/sql/mozfun/google_search_console/extract_url_domain_name/metadata.yaml
new file mode 100644
index 0000000000..13f6b6a4d3
--- /dev/null
+++ b/sql/mozfun/google_search_console/extract_url_domain_name/metadata.yaml
@@ -0,0 +1,3 @@
+friendly_name: Extract URL Domain Name
+description: >-
+  Extract the domain name from a URL.
diff --git a/sql/mozfun/google_search_console/extract_url_domain_name/udf.sql b/sql/mozfun/google_search_console/extract_url_domain_name/udf.sql
new file mode 100644
index 0000000000..a3fec77a5e
--- /dev/null
+++ b/sql/mozfun/google_search_console/extract_url_domain_name/udf.sql
@@ -0,0 +1,31 @@
+-- Extract the domain name from an `http://`/`https://` URL or from a
+-- `sc-domain:<domain>` value.
+-- Returns NULL if the value doesn't start with one of those prefixes.
+CREATE OR REPLACE FUNCTION google_search_console.extract_url_domain_name(url STRING)
+RETURNS STRING AS (
+  -- The domain ends at the first `/`, `?`, or `#`, so a query string or fragment that
+  -- directly follows the host (e.g. `https://www.mozilla.org?foo`) isn't included.
+  REGEXP_EXTRACT(url, r'^(?:https?://|sc-domain:)([^/?#]+)')
+);
+
+SELECT
+  assert.equals(
+    google_search_console.extract_url_domain_name('https://www.mozilla.org/'),
+    'www.mozilla.org'
+  ),
+  assert.equals(
+    google_search_console.extract_url_domain_name('sc-domain:addons.mozilla.org'),
+    'addons.mozilla.org'
+  ),
+  assert.equals(
+    google_search_console.extract_url_domain_name('https://www.mozilla.org?foo'),
+    'www.mozilla.org'
+  ),
+  assert.equals(
+    google_search_console.extract_url_domain_name('https://www.mozilla.org#foo'),
+    'www.mozilla.org'
+  ),
+  assert.equals(
+    google_search_console.extract_url_domain_name('www.mozilla.org'),
+    CAST(NULL AS STRING)
+  ),
diff --git a/sql/mozfun/google_search_console/extract_url_language_code/README.md b/sql/mozfun/google_search_console/extract_url_language_code/README.md
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/sql/mozfun/google_search_console/extract_url_language_code/metadata.yaml b/sql/mozfun/google_search_console/extract_url_language_code/metadata.yaml
new file mode 100644
index 0000000000..b74285c0a2
--- /dev/null
+++ b/sql/mozfun/google_search_console/extract_url_language_code/metadata.yaml
@@ -0,0 +1,3 @@
+friendly_name: Extract URL Language Code
+description: >-
+  Extract the language code from a URL if it's present.
diff --git a/sql/mozfun/google_search_console/extract_url_language_code/udf.sql b/sql/mozfun/google_search_console/extract_url_language_code/udf.sql
new file mode 100644
index 0000000000..a7197c3da5
--- /dev/null
+++ b/sql/mozfun/google_search_console/extract_url_language_code/udf.sql
@@ -0,0 +1,26 @@
+-- Extract the language code from the locale found by `extract_url_locale`.
+-- The code is lower-cased. NULL is returned when the URL has no locale.
+CREATE OR REPLACE FUNCTION google_search_console.extract_url_language_code(url STRING)
+RETURNS STRING AS (
+  -- Zero-based offset 0 is the whole locale when it has no hyphen (e.g. `es`),
+  -- otherwise the part before the hyphen (e.g. `en` in `en-US`).
+  LOWER(SPLIT(google_search_console.extract_url_locale(url), '-')[SAFE_OFFSET(0)])
+);
+
+SELECT
+  assert.equals(
+    google_search_console.extract_url_language_code('https://www.mozilla.org/en-US/firefox/'),
+    'en'
+  ),
+  assert.equals(
+    google_search_console.extract_url_language_code('https://support.mozilla.org/es/'),
+    'es'
+  ),
+  assert.equals(
+    google_search_console.extract_url_language_code('https://blog.mozilla.org/ux/'),
+    CAST(NULL AS STRING)
+  ),
+  assert.equals(
+    google_search_console.extract_url_language_code('https://www.mozilla.org/'),
+    CAST(NULL AS STRING)
+  ),
diff --git a/sql/mozfun/google_search_console/extract_url_locale/README.md b/sql/mozfun/google_search_console/extract_url_locale/README.md
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/sql/mozfun/google_search_console/extract_url_locale/metadata.yaml b/sql/mozfun/google_search_console/extract_url_locale/metadata.yaml
new file mode 100644
index 0000000000..ec92ca90b6
--- /dev/null
+++ b/sql/mozfun/google_search_console/extract_url_locale/metadata.yaml
@@ -0,0 +1,3 @@
+friendly_name: Extract URL Locale
+description: >-
+  Extract the locale from a URL if it's present.
diff --git a/sql/mozfun/google_search_console/extract_url_locale/udf.sql b/sql/mozfun/google_search_console/extract_url_locale/udf.sql
new file mode 100644
index 0000000000..b8516e3595
--- /dev/null
+++ b/sql/mozfun/google_search_console/extract_url_locale/udf.sql
@@ -0,0 +1,220 @@
+-- Extract the locale from the first path segment of a URL, if that segment looks like one.
+-- A segment counts as a locale when it is two letters, a hyphen, and two letters (e.g. `en-US`),
+-- or when it is one of the two-letter language codes listed below (compared case-insensitively).
+-- The segment is returned with its original casing; NULL is returned otherwise.
+CREATE OR REPLACE FUNCTION google_search_console.extract_url_locale(url STRING)
+RETURNS STRING AS (
+  CASE
+    WHEN REGEXP_CONTAINS(
+        google_search_console.extract_url_path_segment(url, 1),
+        r'^[a-zA-Z]{2}-[a-zA-Z]{2}$'
+      )
+      OR LOWER(google_search_console.extract_url_path_segment(url, 1)) IN (
+        'aa',
+        'ab',
+        'ae',
+        'af',
+        'ak',
+        'am',
+        'an',
+        'ar',
+        'as',
+        'av',
+        'ay',
+        'az',
+        'ba',
+        'be',
+        'bg',
+        'bh',
+        'bi',
+        'bm',
+        'bn',
+        'bo',
+        'br',
+        'bs',
+        'ca',
+        'ce',
+        'ch',
+        'co',
+        'cr',
+        'cs',
+        'cu',
+        'cv',
+        'cy',
+        'da',
+        'de',
+        'dv',
+        'dz',
+        'ee',
+        'el',
+        'en',
+        'eo',
+        'es',
+        'et',
+        'eu',
+        'fa',
+        'ff',
+        'fi',
+        'fj',
+        'fo',
+        'fr',
+        'fy',
+        'ga',
+        'gd',
+        'gl',
+        'gn',
+        'gu',
+        'gv',
+        'ha',
+        'he',
+        'hi',
+        'ho',
+        'hr',
+        'ht',
+        'hu',
+        'hy',
+        'hz',
+        'ia',
+        'id',
+        'ie',
+        'ig',
+        'ii',
+        'ik',
+        'io',
+        'is',
+        'it',
+        'iu',
+        'ja',
+        'jv',
+        'ka',
+        'kg',
+        'ki',
+        'kj',
+        'kk',
+        'kl',
+        'km',
+        'kn',
+        'ko',
+        'kr',
+        'ks',
+        'ku',
+        'kv',
+        'kw',
+        'ky',
+        'la',
+        'lb',
+        'lg',
+        'li',
+        'ln',
+        'lo',
+        'lt',
+        'lu',
+        'lv',
+        'mg',
+        'mh',
+        'mi',
+        'mk',
+        'ml',
+        'mn',
+        'mr',
+        'ms',
+        'mt',
+        'my',
+        'na',
+        'nb',
+        'nd',
+        'ne',
+        'ng',
+        'nl',
+        'nn',
+        'no',
+        'nr',
+        'nv',
+        'ny',
+        'oc',
+        'oj',
+        'om',
+        'or',
+        'os',
+        'pa',
+        'pi',
+        'pl',
+        'ps',
+        'pt',
+        'qu',
+        'rm',
+        'rn',
+        'ro',
+        'ru',
+        'rw',
+        'sa',
+        'sc',
+        'sd',
+        'se',
+        'sg',
+        'si',
+        'sk',
+        'sl',
+        'sm',
+        'sn',
+        'so',
+        'sq',
+        'sr',
+        'ss',
+        'st',
+        'su',
+        'sv',
+        'sw',
+        'ta',
+        'te',
+        'tg',
+        'th',
+        'ti',
+        'tk',
+        'tl',
+        'tn',
+        'to',
+        'tr',
+        'ts',
+        'tt',
+        'tw',
+        'ty',
+        'ug',
+        'uk',
+        'ur',
+        'uz',
+        've',
+        'vi',
+        'vo',
+        'wa',
+        'wo',
+        'xh',
+        'yi',
+        'yo',
+        'za',
+        'zh',
+        'zu'
+      )
+      THEN google_search_console.extract_url_path_segment(url, 1)
+    ELSE NULL
+  END
+);
+
+SELECT
+  assert.equals(
+    google_search_console.extract_url_locale('https://www.mozilla.org/en-US/firefox/'),
+    'en-US'
+  ),
+  assert.equals(
+    google_search_console.extract_url_locale('https://www.mozilla.org/en-us/firefox/'),
+    'en-us'
+  ),
+  assert.equals(google_search_console.extract_url_locale('https://support.mozilla.org/es/'), 'es'),
+  assert.equals(
+    google_search_console.extract_url_locale('https://blog.mozilla.org/ux/'),
+    CAST(NULL AS STRING)
+  ),
+  assert.equals(
+    google_search_console.extract_url_locale('https://www.mozilla.org/'),
+    CAST(NULL AS STRING)
+  ),
diff --git a/sql/mozfun/google_search_console/extract_url_path/README.md b/sql/mozfun/google_search_console/extract_url_path/README.md
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/sql/mozfun/google_search_console/extract_url_path/metadata.yaml b/sql/mozfun/google_search_console/extract_url_path/metadata.yaml
new file mode 100644
index 0000000000..1499c60d11
--- /dev/null
+++ b/sql/mozfun/google_search_console/extract_url_path/metadata.yaml
@@ -0,0 +1,3 @@
+friendly_name: Extract URL Path
+description: >-
+  Extract the path from a URL.
diff --git a/sql/mozfun/google_search_console/extract_url_path/udf.sql b/sql/mozfun/google_search_console/extract_url_path/udf.sql
new file mode 100644
index 0000000000..ff1b20906e
--- /dev/null
+++ b/sql/mozfun/google_search_console/extract_url_path/udf.sql
@@ -0,0 +1,33 @@
+-- Extract the path from an `http://` or `https://` URL, excluding any query string or fragment.
+-- Returns '' when the URL has no path, and NULL when it doesn't start with `http://`/`https://`.
+CREATE OR REPLACE FUNCTION google_search_console.extract_url_path(url STRING)
+RETURNS STRING AS (
+  -- The host ends at the first `/`, `?`, or `#`. Stopping only at `/` would let a `/` inside a
+  -- query string or fragment that directly follows the host be mistaken for the start of the
+  -- path (e.g. `https://www.mozilla.org?next=/en-US/`).
+  REGEXP_EXTRACT(url, r'^https?://[^/?#]+([^?#]*)')
+);
+
+SELECT
+  assert.equals(google_search_console.extract_url_path('https://www.mozilla.org'), ''),
+  assert.equals(google_search_console.extract_url_path('https://www.mozilla.org/'), '/'),
+  assert.equals(
+    google_search_console.extract_url_path('https://www.mozilla.org/en-US/firefox/'),
+    '/en-US/firefox/'
+  ),
+  assert.equals(
+    google_search_console.extract_url_path('https://www.mozilla.org/en-US/firefox/?foo'),
+    '/en-US/firefox/'
+  ),
+  assert.equals(
+    google_search_console.extract_url_path('https://www.mozilla.org/en-US/firefox/#foo'),
+    '/en-US/firefox/'
+  ),
+  assert.equals(
+    google_search_console.extract_url_path('https://www.mozilla.org?next=/en-US/'),
+    ''
+  ),
+  assert.equals(
+    google_search_console.extract_url_path('https://www.mozilla.org#/en-US/'),
+    ''
+  ),
diff --git a/sql/mozfun/google_search_console/extract_url_path_segment/README.md b/sql/mozfun/google_search_console/extract_url_path_segment/README.md
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/sql/mozfun/google_search_console/extract_url_path_segment/metadata.yaml b/sql/mozfun/google_search_console/extract_url_path_segment/metadata.yaml
new file mode 100644
index 0000000000..0cabda848b
--- /dev/null
+++ b/sql/mozfun/google_search_console/extract_url_path_segment/metadata.yaml
@@ -0,0 +1,3 @@
+friendly_name: Extract URL Path Segment
+description: >-
+  Extract a particular path segment from a URL.
diff --git a/sql/mozfun/google_search_console/extract_url_path_segment/udf.sql b/sql/mozfun/google_search_console/extract_url_path_segment/udf.sql
new file mode 100644
index 0000000000..1cc3cf71b1
--- /dev/null
+++ b/sql/mozfun/google_search_console/extract_url_path_segment/udf.sql
@@ -0,0 +1,29 @@
+-- Extract one `/`-separated segment of a URL's path, where 1 is the first segment.
+-- Returns NULL when the requested segment doesn't exist or is empty.
+CREATE OR REPLACE FUNCTION google_search_console.extract_url_path_segment(
+  url STRING,
+  segment_number INTEGER
+)
+RETURNS STRING AS (
+  -- `segment_number` is used as a zero-based offset into the split path; an empty string at
+  -- that offset is reported as NULL.
+  CASE
+    WHEN SPLIT(google_search_console.extract_url_path(url), '/')[SAFE_OFFSET(segment_number)] = ''
+      THEN NULL
+    ELSE SPLIT(google_search_console.extract_url_path(url), '/')[SAFE_OFFSET(segment_number)]
+  END
+);
+
+SELECT
+  assert.equals(
+    google_search_console.extract_url_path_segment('https://www.mozilla.org/en-US/firefox/', 1),
+    'en-US'
+  ),
+  assert.equals(
+    google_search_console.extract_url_path_segment('https://www.mozilla.org/en-US/firefox/', 2),
+    'firefox'
+  ),
+  assert.equals(
+    google_search_console.extract_url_path_segment('https://www.mozilla.org/en-US/firefox/', 3),
+    CAST(NULL AS STRING)
+  ),