Bug 1601139 - Add query to sample documents per doctype (#570)
* Bug 1601139 - Add query to sample documents per doctype * Add docstring, fix formatting, and update column name
This commit is contained in:
Родитель
28e23c7ea3
Коммит
b938356d48
|
@ -29,6 +29,7 @@ DERIVED_DATASETS_DRY_RUN_URL = (
|
|||
|
||||
SKIP = {
|
||||
# Access Denied
|
||||
"sql/monitoring/document_sample_nonprod_v1/query.sql",
|
||||
"sql/monitoring/schema_error_counts_v1/view.sql",
|
||||
"sql/monitoring/structured_error_counts_v1/view.sql",
|
||||
"sql/telemetry/fxa_content_events_v1/query.sql",
|
||||
|
|
|
@ -0,0 +1,70 @@
|
|||
-- Stratified document sample drawn from the payload_bytes_decoded and
-- payload_bytes_error tables. Intended for exercising the expected behavior
-- of the ingestion pipeline and for validating changes when schemas are
-- updated.
--
-- Strata are (document_namespace, document_type, document_version,
-- document_decoded). Each stratum contributes at most 1000 randomly chosen
-- documents from the trailing day; strata whose sample holds 10 or fewer
-- documents are dropped.
WITH decoded_docs AS (
  -- Rows from the decoded tables carry an empty error_message, which is what
  -- flags them as decoded when the strata are built below.
  SELECT
    metadata.document_namespace AS document_namespace,
    metadata.document_type AS document_type,
    metadata.document_version AS document_version,
    '' AS error_message,
    payload
  FROM
    `moz-fx-data-shar-nonprod-efed.payload_bytes_decoded.*`
  WHERE
    submission_timestamp > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1 DAY)
),
errored_docs AS (
  -- Rows from the error tables, restricted to the ParsePayload error type.
  SELECT
    document_namespace,
    document_type,
    document_version,
    error_message,
    payload
  FROM
    `moz-fx-data-shar-nonprod-efed.payload_bytes_error.*`
  WHERE
    error_type = 'ParsePayload'
    AND submission_timestamp > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1 DAY)
),
all_docs AS (
  -- Both inputs expose the same five columns in the same order.
  SELECT
    document_namespace,
    document_type,
    document_version,
    error_message,
    payload
  FROM
    decoded_docs
  UNION ALL
  SELECT
    document_namespace,
    document_type,
    document_version,
    error_message,
    payload
  FROM
    errored_docs
),
sampled AS (
  -- One row per stratum, holding the randomly ordered sample as an array of
  -- whole-row structs.
  SELECT
    document_namespace,
    document_type,
    document_version,
    BYTE_LENGTH(error_message) = 0 AS document_decoded,
    ARRAY_AGG(doc ORDER BY RAND() LIMIT 1000) AS samples
  FROM
    all_docs AS doc
  GROUP BY
    document_namespace,
    document_type,
    document_version,
    document_decoded
  HAVING
    ARRAY_LENGTH(samples) > 10
)
-- Flatten each stratum back into one row per sampled document. n_samples is
-- the size of the stratum's sample, so it never exceeds the 1000 limit above.
SELECT
  CURRENT_TIMESTAMP() AS submission_timestamp,
  ARRAY_LENGTH(stratum.samples) AS n_samples,
  stratum.document_decoded,
  sample.*
FROM
  sampled AS stratum
CROSS JOIN
  UNNEST(stratum.samples) AS sample
|
|
@ -0,0 +1,70 @@
|
|||
-- Stratified document sample drawn from the payload_bytes_decoded and
-- payload_bytes_error tables. Intended for exercising the expected behavior
-- of the ingestion pipeline and for validating changes when schemas are
-- updated.
--
-- Strata are (document_namespace, document_type, document_version,
-- document_decoded). Each stratum contributes at most 1000 randomly chosen
-- documents from the trailing day; strata whose sample holds 10 or fewer
-- documents are dropped.
WITH decoded_docs AS (
  -- Rows from the decoded tables carry an empty error_message, which is what
  -- flags them as decoded when the strata are built below.
  SELECT
    metadata.document_namespace AS document_namespace,
    metadata.document_type AS document_type,
    metadata.document_version AS document_version,
    '' AS error_message,
    payload
  FROM
    `moz-fx-data-shar-nonprod-efed.payload_bytes_decoded.*`
  WHERE
    submission_timestamp > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1 DAY)
),
errored_docs AS (
  -- Rows from the error tables, restricted to the ParsePayload error type.
  SELECT
    document_namespace,
    document_type,
    document_version,
    error_message,
    payload
  FROM
    `moz-fx-data-shar-nonprod-efed.payload_bytes_error.*`
  WHERE
    error_type = 'ParsePayload'
    AND submission_timestamp > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1 DAY)
),
all_docs AS (
  -- Both inputs expose the same five columns in the same order.
  SELECT
    document_namespace,
    document_type,
    document_version,
    error_message,
    payload
  FROM
    decoded_docs
  UNION ALL
  SELECT
    document_namespace,
    document_type,
    document_version,
    error_message,
    payload
  FROM
    errored_docs
),
sampled AS (
  -- One row per stratum, holding the randomly ordered sample as an array of
  -- whole-row structs.
  SELECT
    document_namespace,
    document_type,
    document_version,
    BYTE_LENGTH(error_message) = 0 AS document_decoded,
    ARRAY_AGG(doc ORDER BY RAND() LIMIT 1000) AS samples
  FROM
    all_docs AS doc
  GROUP BY
    document_namespace,
    document_type,
    document_version,
    document_decoded
  HAVING
    ARRAY_LENGTH(samples) > 10
)
-- Flatten each stratum back into one row per sampled document. n_samples is
-- the size of the stratum's sample, so it never exceeds the 1000 limit above.
SELECT
  CURRENT_TIMESTAMP() AS submission_timestamp,
  ARRAY_LENGTH(stratum.samples) AS n_samples,
  stratum.document_decoded,
  sample.*
FROM
  sampled AS stratum
CROSS JOIN
  UNNEST(stratum.samples) AS sample
|
Загрузка…
Ссылка в новой задаче