Migrate UDFs to new format
This commit is contained in:
Родитель
04014b483e
Коммит
2b29d24f59
|
@ -0,0 +1,78 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
"""Migrate legacy UDFs to the new UDF structure."""
|
||||
|
||||
from argparse import ArgumentParser
|
||||
import os
|
||||
from pathlib import Path
|
||||
import re
|
||||
import shutil
|
||||
import string
|
||||
import yaml
|
||||
|
||||
|
||||
# Directories that are scanned for legacy UDFs when none are given explicitly.
UDF_DIRS = ("udf/", "udf_js/")

# Matches source comments: group 1 captures a /* ... */ block comment, the
# last group captures a // line comment.
DESCRIPTION_RE = re.compile(r"(/\*([^*]|[\r\n]|(\*+([^*/]|[\r\n])))*\*+/)|(//.*)")

# Command line interface of the migration script.
parser = ArgumentParser(description=__doc__)
parser.add_argument("--udf", help="Migrate the specified UDF.")
parser.add_argument(
    "--udf-dirs",
    "--udf_dirs",
    default=UDF_DIRS,
    nargs="+",
    help="Directories containing UDFs to migrate",
)
|
||||
|
||||
|
||||
def migrate_udf(udf_file):
    """Migrate a specific UDF to the new format.

    The legacy ``<name>.sql`` file is moved to ``<name>/udf.sql`` next to its
    original location, and ``<name>/metadata.yaml`` is written with a friendly
    name derived from the file name and a description taken from the first
    comment found in the UDF source.

    :param udf_file: path (``str`` or ``Path``) of the legacy UDF file
    """
    udf_file = Path(udf_file)
    print(f"Migrate {udf_file}")

    # ``stem`` drops only the trailing suffix. ``name.replace(".sql", "")``
    # would also remove a ".sql" in the middle of the file name and would keep
    # the full name of a non-.sql file, making the target directory collide
    # with the file itself.
    udf_name = udf_file.stem
    friendly_name = string.capwords(udf_name.replace("_", " "))

    # read explicitly as UTF-8 so the result does not depend on the locale
    udf_content = udf_file.read_text(encoding="utf-8")

    description = ""
    # only the first comment in the file is considered
    first_comment = DESCRIPTION_RE.search(udf_content)
    if first_comment is not None:
        # Group 1 is the /* ... */ alternative. If the first comment is a
        # // line comment, group 1 did not participate and the description
        # intentionally stays empty.
        description = (first_comment.group(1) or "").replace("/*", "")
        description = description.replace("*/", "").strip()
        description = description.replace("\n", " ").strip()

    # move files to directory
    migrated_udf_dir = udf_file.parent / udf_name
    migrated_udf_dir.mkdir(parents=True, exist_ok=True)
    shutil.move(str(udf_file), str(migrated_udf_dir / "udf.sql"))

    # create metadata file
    metadata_file = migrated_udf_dir / "metadata.yaml"
    metadata = {"friendly_name": friendly_name, "description": description}
    metadata_file.write_text(yaml.dump(metadata))
|
||||
|
||||
|
||||
def main():
    """Run the UDF migration.

    Migrates the single file given via ``--udf`` if present; otherwise walks
    every existing directory passed via ``--udf-dirs`` and migrates each
    legacy ``.sql`` file found in it.
    """
    args = parser.parse_args()

    if args.udf:
        # migrate a single UDF
        migrate_udf(args.udf)
        return

    # iterate through udfs and migrate one by one
    for udf_dir in args.udf_dirs:
        if not os.path.isdir(udf_dir):
            continue

        for root, _, files in os.walk(udf_dir):
            for udf_file in files:
                # Skip only files literally named "udf.sql" (already
                # migrated). A suffix check would wrongly skip legacy files
                # such as "my_udf.sql" as well.
                if udf_file.endswith(".sql") and udf_file != "udf.sql":
                    migrate_udf(os.path.join(root, udf_file))


if __name__ == "__main__":
    main()
|
|
@ -0,0 +1,2 @@
|
|||
description: ''
|
||||
friendly_name: Active N Weeks Ago
|
|
@ -0,0 +1,4 @@
|
|||
description: Given a map representing activity for STRING `key`s, this function
|
||||
returns an array of which `key`s were active for the time period in question. start_offset
|
||||
should be at most 0. n_bits should be at most the remaining bits.
|
||||
friendly_name: Active Values From Days Seen Map
|
|
@ -0,0 +1,10 @@
|
|||
description: 'This function specifically windows searches into calendar-month windows.
|
||||
This means groups are not necessarily directly comparable, since different months
|
||||
have different numbers of days. On the first of each month, a new month is appended,
|
||||
and the first month is dropped. If the date is not the first of the month, the
|
||||
new entry is added to the last element in the array. For example, if we were adding
|
||||
12 to [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]: On the first of the month, the result
|
||||
would be [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 12] On any other day of the month,
|
||||
the result would be [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 24] This happens for every
|
||||
aggregate (searches, ad clicks, etc.)'
|
||||
friendly_name: Add Monthly Engine Searches
|
|
@ -0,0 +1,5 @@
|
|||
description: Adds together two engine searches structs. Each engine searches struct
|
||||
has a MAP[engine -> search_counts_struct]. We want to add together the prev
|
||||
and curr's values for a certain engine. This allows us to be flexible with the
|
||||
number of engines we're using.
|
||||
friendly_name: Add Monthly Searches
|
|
@ -0,0 +1,3 @@
|
|||
description: Return sums of each search type grouped by the index. Results are ordered
|
||||
by index.
|
||||
friendly_name: Add Searches By Index
|
|
@ -0,0 +1,5 @@
|
|||
description: This function selects most frequently occurring value for each addon_id,
|
||||
using the latest value in the input among ties. The type for active_addons is ARRAY<STRUCT<addon_id
|
||||
STRING, ...>>, i.e. the output of `SELECT ARRAY_CONCAT_AGG(active_addons) FROM telemetry.main_summary_v4`,
|
||||
and is left unspecified to allow changes to the fields of the STRUCT.
|
||||
friendly_name: Aggregate Active Addons
|
|
@ -0,0 +1,3 @@
|
|||
description: Returns an aggregated map with all the keys and the first corresponding
|
||||
value from the given maps
|
||||
friendly_name: Aggregate Map First
|
|
@ -0,0 +1,2 @@
|
|||
description: ''
|
||||
friendly_name: Aggregate Search Counts
|
|
@ -0,0 +1,2 @@
|
|||
description: Aggregates the total counts of the given search counters
|
||||
friendly_name: Aggregate Search Map
|
|
@ -0,0 +1,2 @@
|
|||
description: An array of 11 zeroes, followed by a supplied value
|
||||
friendly_name: Array 11 Zeroes Then
|
|
@ -0,0 +1,3 @@
|
|||
description: Drop the first element of an array, and append the given element. Result
|
||||
is an array with the same length as the input.
|
||||
friendly_name: Array Drop First And Append
|
|
@ -0,0 +1,2 @@
|
|||
description: An array of 12 zeroes
|
||||
friendly_name: Array Of 12 Zeroes
|
|
@ -0,0 +1,2 @@
|
|||
description: ''
|
||||
friendly_name: Array Slice
|
|
@ -0,0 +1,2 @@
|
|||
description: This function counts the 1s in lowest 7 bits of an INT64
|
||||
friendly_name: Bitcount Lowest 7
|
|
@ -0,0 +1,2 @@
|
|||
description: A bitmask for 365 bits
|
||||
friendly_name: Bitmask 365
|
|
@ -0,0 +1,2 @@
|
|||
description: ''
|
||||
friendly_name: Bitmask Lowest 28
|
|
@ -0,0 +1,2 @@
|
|||
description: ''
|
||||
friendly_name: Bitmask Lowest 7
|
|
@ -0,0 +1,6 @@
|
|||
description: Returns a bitmask that can be used to return a subset of an integer representing
|
||||
a bit array. The start_ordinal argument is an integer specifying the starting position
|
||||
of the slice, with start_ordinal = 1 indicating the first bit. The length argument
|
||||
is the number of bits to include in the mask. The arguments were chosen to match
|
||||
the semantics of the SUBSTR function; see https://cloud.google.com/bigquery/docs/reference/standard-sql/functions-and-operators#substr
|
||||
friendly_name: Bitmask Range
|
|
@ -0,0 +1,2 @@
|
|||
description: ''
|
||||
friendly_name: Bits28 Active In Range
|
|
@ -0,0 +1,2 @@
|
|||
description: ''
|
||||
friendly_name: Bits28 Days Since Seen
|
|
@ -0,0 +1,2 @@
|
|||
description: ''
|
||||
friendly_name: Bits28 From String
|
|
@ -0,0 +1,2 @@
|
|||
description: ''
|
||||
friendly_name: Bits28 Range
|
|
@ -0,0 +1,2 @@
|
|||
description: ''
|
||||
friendly_name: Bits28 Retention
|
|
@ -0,0 +1,2 @@
|
|||
description: ''
|
||||
friendly_name: Bits28 To Dates
|
|
@ -0,0 +1,2 @@
|
|||
description: ''
|
||||
friendly_name: Bits28 To String
|
|
@ -0,0 +1,3 @@
|
|||
description: Given a BYTE and an INT64, return whether the user was active that many
|
||||
weeks ago. NULL input returns NULL output.
|
||||
friendly_name: Bits To Active N Weeks Ago
|
|
@ -0,0 +1,3 @@
|
|||
description: Given a BYTE, get the number of days the user was seen. NULL input returns
|
||||
NULL output.
|
||||
friendly_name: Bits To Days Seen
|
|
@ -0,0 +1,7 @@
|
|||
description: 'Given a BYTES, return the number of days since the client was first
|
||||
seen. If no bits are set, returns NULL, indicating we don''t know. Otherwise the
|
||||
result is 0-indexed, meaning that for \x01, it will return 0. Results showed this
|
||||
being between 5-10x faster than the simpler alternative: CREATE OR REPLACE FUNCTION udf.bits_to_days_since_first_seen(b
|
||||
BYTES) AS (( SELECT MAX(n) FROM UNNEST(GENERATE_ARRAY(0, 8 * BYTE_LENGTH(b)))
|
||||
AS n WHERE BIT_COUNT(SUBSTR(b >> n, -1) & b''\x01'') > 0)); See also: bits_to_days_since_seen.sql'
|
||||
friendly_name: Bits To Days Since First Seen
|
|
@ -0,0 +1,7 @@
|
|||
description: 'Given a BYTES, return the number of days since the client was last seen. If
|
||||
no bits are set, returns NULL, indicating we don''t know. Otherwise the results
|
||||
are 0-indexed, meaning \x01 will return 0. Tests showed this being 5-10x faster
|
||||
than the simpler alternative: CREATE OR REPLACE FUNCTION udf.bits_to_days_since_seen(b
|
||||
BYTES) AS (( SELECT MIN(n) FROM UNNEST(GENERATE_ARRAY(0, 364)) AS n WHERE
|
||||
BIT_COUNT(SUBSTR(b >> n, -1) & b''\x01'') > 0)); See also: bits_to_days_since_first_seen.sql'
|
||||
friendly_name: Bits To Days Since Seen
|
|
@ -0,0 +1,2 @@
|
|||
description: Convert a boolean to 365 bit byte array
|
||||
friendly_name: Bool To 365 Bits
|
|
@ -0,0 +1,3 @@
|
|||
description: Given histogram h, return TRUE if it has a value in the "true" bucket,
|
||||
or FALSE if it has a value in the "false" bucket, or NULL otherwise. https://github.com/mozilla/telemetry-batch-view/blob/ea0733c/src/main/scala/com/mozilla/telemetry/utils/MainPing.scala#L309-L317
|
||||
friendly_name: Boolean Histogram To Boolean
|
|
@ -0,0 +1,6 @@
|
|||
description: We generally want to believe only the first reasonable profile creation
|
||||
date that we receive from a client. Given bits representing usage from the previous
|
||||
day and the current day, this function shifts the first argument by one day and
|
||||
returns either that value if non-zero and non-null, the current day value if non-zero
|
||||
and non-null, or else 0.
|
||||
friendly_name: Coalesce Adjacent Days 28 Bits
|
|
@ -0,0 +1,7 @@
|
|||
description: Coalesce previous data's PCD with the new data's PCD. We generally want
|
||||
to believe only the first reasonable profile creation date that we receive from
|
||||
a client. Given bytes representing usage from the previous day and the current day,
|
||||
this function shifts the first argument by one day and returns either that value
|
||||
if non-zero and non-null, the current day value if non-zero and non-null, or else
|
||||
0.
|
||||
friendly_name: Coalesce Adjacent Days 365 Bits
|
|
@ -0,0 +1,5 @@
|
|||
description: Combines two bit patterns. The first pattern represents activity over
|
||||
a 28-day period ending "yesterday". The second pattern represents activity as observed
|
||||
today (usually just 0 or 1). We shift the bits in the first pattern by one to set
|
||||
the new baseline as "today", then perform a bitwise OR of the two patterns.
|
||||
friendly_name: Combine Adjacent Days 28 Bits
|
|
@ -0,0 +1,2 @@
|
|||
description: ''
|
||||
friendly_name: Combine Adjacent Days 365 Bits
|
|
@ -0,0 +1,7 @@
|
|||
description: The "clients_last_seen" class of tables represent various types of client
|
||||
activity within a 28-day window as bit patterns. This function takes in two arrays
|
||||
of structs (aka maps) where each entry gives the bit pattern for days in which we
|
||||
saw a ping for a given user in a given key. We combine the bit patterns for the
|
||||
previous day and the current day, returning a single map. See `udf.combine_experiment_days`
|
||||
for a more specific example of this approach.
|
||||
friendly_name: Combine Days Seen Maps
|
|
@ -0,0 +1,6 @@
|
|||
description: The "clients_last_seen" class of tables represent various types of client
|
||||
activity within a 28-day window as bit patterns. This function takes in two arrays
|
||||
of structs where each entry gives the bit pattern for days in which we saw a ping
|
||||
for a given user in a given experiment. We combine the bit patterns for the previous
|
||||
day and the current day, returning a single array of experiment structs.
|
||||
friendly_name: Combine Experiment Days
|
|
@ -0,0 +1,5 @@
|
|||
description: 'For a given two-letter ISO 3166-1 alpha-2 country code, returns a string
|
||||
consisting of two Unicode regional indicator symbols, which is rendered in supporting
|
||||
fonts (such as in the BigQuery console or STMO) as flag emoji. This is just for
|
||||
fun. See: - https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2 - https://en.wikipedia.org/wiki/Regional_Indicator_Symbol'
|
||||
friendly_name: Country Code To Flag
|
|
@ -0,0 +1,4 @@
|
|||
description: Return the frequency, recency, and T from a BYTE array, as defined in
|
||||
https://lifetimes.readthedocs.io/en/latest/Quickstart.html#the-shape-of-your-data RFM
|
||||
refers to Recency, Frequency, and Monetary value.
|
||||
friendly_name: Days Seen Bytes To Rfm
|
|
@ -0,0 +1,6 @@
|
|||
description: Takes in a difference between submission date and profile creation date
|
||||
and returns a bit pattern representing the profile creation date IFF the profile
|
||||
date is the same as the submission date or no more than 6 days earlier. Analysis
|
||||
has shown that client-reported profile creation dates are much less reliable outside
|
||||
of this range and cannot be used as reliable indicators of new profile creation.
|
||||
friendly_name: Days Since Created Profile As 28 Bits
|
|
@ -0,0 +1,2 @@
|
|||
description: Rename struct fields in anonymous event tuples to meaningful names.
|
||||
friendly_name: Deanonymize Event
|
|
@ -0,0 +1,2 @@
|
|||
description: ''
|
||||
friendly_name: Decode Int64
|
|
@ -0,0 +1,2 @@
|
|||
description: Return an array containing only distinct values of the given array
|
||||
friendly_name: Dedupe Array
|
|
@ -0,0 +1,2 @@
|
|||
description: ''
|
||||
friendly_name: Extract Count Histogram Value
|
|
@ -0,0 +1,2 @@
|
|||
description: Extract the document type from a table name e.g. _TABLE_SUFFIX.
|
||||
friendly_name: Extract Document Type
|
|
@ -0,0 +1,2 @@
|
|||
description: Extract the document version from a table name e.g. _TABLE_SUFFIX.
|
||||
friendly_name: Extract Document Version
|
|
@ -0,0 +1,5 @@
|
|||
description: This is a performance optimization compared to the more general mozfun.hist.extract
|
||||
for cases where only the histogram sum is needed. It must support all the same
|
||||
format variants as mozfun.hist.extract but this simplification is necessary to keep
|
||||
the main_summary query complexity in check.
|
||||
friendly_name: Extract Histogram Sum
|
|
@ -0,0 +1,2 @@
|
|||
description: Return a path derived from an error message in `payload_bytes_error`
|
||||
friendly_name: Extract Schema Validation Path
|
|
@ -0,0 +1,6 @@
|
|||
description: 'Convert the Fenix client_info.app_build-format string to a DATETIME.
|
||||
May return NULL on failure. The Fenix app_build format is documented here: https://github.com/mozilla-mobile/fenix/blob/c72834479eb3e13ee91f82b529e59aa08392a92d/automation/gradle/versionCode.gradle#L13 In
|
||||
short it is yDDDHHmm * y is years since 2018 * DDD is day of year, 0-padded, 001-366 *
|
||||
HH is hour of day, 00-23 * mm is minute of hour, 00-59 After using this you may
|
||||
wish to DATETIME_TRUNC(result, DAY) for grouping by build date.'
|
||||
friendly_name: Fenix Build To Datetime
|
|
@ -0,0 +1,5 @@
|
|||
description: Convert geoip lookup fields to a struct, replacing '??' with NULL. Returns
|
||||
NULL if required field country would be NULL. Replaces '??' with NULL because
|
||||
'??' is a placeholder that may be used if there was an issue during geoip lookup
|
||||
in hindsight.
|
||||
friendly_name: Geo Struct
|
Некоторые файлы не были показаны из-за слишком большого количества измененных файлов Показать больше
Загрузка…
Ссылка в новой задаче