This commit is contained in:
Anna Scholtz 2020-08-31 13:34:56 -07:00
Родитель 04014b483e
Коммит 2b29d24f59
242 изменённых файлов: 500 добавлений и 0 удалений

78
script/legacy/migrate_udfs Executable file
Просмотреть файл

@ -0,0 +1,78 @@
#!/usr/bin/env python3
"""Migrate legacy UDFs to the new UDF structure."""
from argparse import ArgumentParser
import os
from pathlib import Path
import re
import shutil
import string
import yaml
# Directories that are scanned for legacy UDF files when --udf-dirs is omitted.
UDF_DIRS = ("udf/", "udf_js/")

# Matches either a /* ... */ block comment (captured in group 1) or a
# // line comment (captured in group 5).
DESCRIPTION_RE = re.compile(
    r"(/\*([^*]|[\r\n]|(\*+([^*/]|[\r\n])))*\*+/)|(//.*)"
)

# Command line interface; the module docstring doubles as the description.
parser = ArgumentParser(description=__doc__)
parser.add_argument("--udf", help="Migrate the specified UDF.")
parser.add_argument(
    "--udf-dirs",
    "--udf_dirs",
    help="Directories containing UDFs to migrate",
    default=UDF_DIRS,
    nargs="+",
)
def migrate_udf(udf_file):
    """Migrate a specific UDF to the new format.

    The legacy file ``<dir>/<name>.sql`` is moved to ``<dir>/<name>/udf.sql``
    and a ``metadata.yaml`` containing ``friendly_name`` and ``description``
    is written into the same directory.

    Args:
        udf_file: Path (``str`` or ``Path``) of the legacy UDF SQL file.
    """
    udf_file = Path(udf_file)
    print(f"Migrate {udf_file}")

    # Drop only the trailing extension; replacing every ".sql" occurrence
    # would mangle names such as "to.sqlite.sql".
    udf_name = udf_file.stem
    friendly_name = string.capwords(udf_name.replace("_", " "))
    description = ""

    # Read with an explicit encoding so the result does not depend on the
    # locale of the machine running the migration.
    udf_content = udf_file.read_text(encoding="utf-8")
    comment = re.findall(DESCRIPTION_RE, udf_content)

    if comment:
        # Each findall entry is a tuple of regex groups; index 0 holds the
        # block comment text and is empty when the first match is a
        # "//" line comment, in which case the description stays empty.
        description = comment[0][0].replace("/*", "")
        description = description.replace("*/", "").strip()
        description = description.replace("\n", " ").strip()

    # move files to directory
    migrated_udf_dir = udf_file.parent / udf_name
    migrated_udf_dir.mkdir(parents=True, exist_ok=True)
    shutil.move(str(udf_file), str(migrated_udf_dir / "udf.sql"))

    # create metadata file
    metadata_file = migrated_udf_dir / "metadata.yaml"
    metadata = {"friendly_name": friendly_name, "description": description}
    metadata_file.write_text(yaml.dump(metadata), encoding="utf-8")
def main():
    """Run the UDF migration.

    When ``--udf`` is given only that file is migrated. Otherwise every
    legacy ``*.sql`` file found below the ``--udf-dirs`` directories is
    migrated; directories that do not exist are skipped.
    """
    args = parser.parse_args()

    if args.udf:
        # migrate a single UDF
        migrate_udf(args.udf)
        return

    # iterate through udfs and migrate one by one
    for udf_dir in args.udf_dirs:
        if not os.path.isdir(udf_dir):
            continue

        for root, _dirs, files in os.walk(udf_dir):
            for udf_file in files:
                # Only the exact name "udf.sql" marks an already migrated
                # UDF; a suffix test would also skip legacy files whose name
                # merely ends in "udf", e.g. "parse_udf.sql".
                if udf_file.endswith(".sql") and udf_file != "udf.sql":
                    migrate_udf(os.path.join(root, udf_file))
# Run the migration only when this file is executed as a script.
if __name__ == "__main__":
    main()

Просмотреть файл

@ -0,0 +1,2 @@
description: ''
friendly_name: Active N Weeks Ago

Просмотреть файл

Просмотреть файл

@ -0,0 +1,4 @@
description: Given a map representing activity for STRING `key`s, this function
returns an array of which `key`s were active for the time period in question. start_offset
should be at most 0. n_bits should be at most the remaining bits.
friendly_name: Active Values From Days Seen Map

Просмотреть файл

@ -0,0 +1,10 @@
description: 'This function specifically windows searches into calendar-month windows.
This means groups are not necessarily directly comparable, since different months
have different numbers of days. On the first of each month, a new month is appended,
and the first month is dropped. If the date is not the first of the month, the
new entry is added to the last element in the array. For example, if we were adding
12 to [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]: On the first of the month, the result
would be [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 12] On any other day of the month,
the result would be [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 24] This happens for every
aggregate (searches, ad clicks, etc.)'
friendly_name: Add Monthly Engine Searches

Просмотреть файл

@ -0,0 +1,5 @@
description: Adds together two engine searches structs. Each engine searches struct
has a MAP[engine -> search_counts_struct]. We want to add together the prev
and curr's values for a certain engine. This allows us to be flexible with the
number of engines we're using.
friendly_name: Add Monthly Searches

Просмотреть файл

Просмотреть файл

@ -0,0 +1,3 @@
description: Return sums of each search type grouped by the index. Results are ordered
by index.
friendly_name: Add Searches By Index

Просмотреть файл

Просмотреть файл

@ -0,0 +1,5 @@
description: This function selects the most frequently occurring value for each addon_id,
using the latest value in the input among ties. The type for active_addons is ARRAY<STRUCT<addon_id
STRING, ...>>, i.e. the output of `SELECT ARRAY_CONCAT_AGG(active_addons) FROM telemetry.main_summary_v4`,
and is left unspecified to allow changes to the fields of the STRUCT.
friendly_name: Aggregate Active Addons

Просмотреть файл

Просмотреть файл

@ -0,0 +1,3 @@
description: Returns an aggregated map with all the keys and the first corresponding
value from the given maps
friendly_name: Aggregate Map First

Просмотреть файл

Просмотреть файл

@ -0,0 +1,2 @@
description: ''
friendly_name: Aggregate Search Counts

Просмотреть файл

Просмотреть файл

@ -0,0 +1,2 @@
description: Aggregates the total counts of the given search counters
friendly_name: Aggregate Search Map

Просмотреть файл

Просмотреть файл

@ -0,0 +1,2 @@
description: An array of 11 zeroes, followed by a supplied value
friendly_name: Array 11 Zeroes Then

Просмотреть файл

Просмотреть файл

@ -0,0 +1,3 @@
description: Drop the first element of an array, and append the given element. Result
is an array with the same length as the input.
friendly_name: Array Drop First And Append

Просмотреть файл

@ -0,0 +1,2 @@
description: An array of 12 zeroes
friendly_name: Array Of 12 Zeroes

Просмотреть файл

Просмотреть файл

@ -0,0 +1,2 @@
description: ''
friendly_name: Array Slice

Просмотреть файл

Просмотреть файл

@ -0,0 +1,2 @@
description: This function counts the 1s in lowest 7 bits of an INT64
friendly_name: Bitcount Lowest 7

Просмотреть файл

Просмотреть файл

@ -0,0 +1,2 @@
description: A bitmask for 365 bits
friendly_name: Bitmask 365

Просмотреть файл

Просмотреть файл

@ -0,0 +1,2 @@
description: ''
friendly_name: Bitmask Lowest 28

Просмотреть файл

Просмотреть файл

@ -0,0 +1,2 @@
description: ''
friendly_name: Bitmask Lowest 7

Просмотреть файл

Просмотреть файл

@ -0,0 +1,6 @@
description: Returns a bitmask that can be used to return a subset of an integer representing
a bit array. The start_ordinal argument is an integer specifying the starting position
of the slice, with start_ordinal = 1 indicating the first bit. The length argument
is the number of bits to include in the mask. The arguments were chosen to match
the semantics of the SUBSTR function; see https://cloud.google.com/bigquery/docs/reference/standard-sql/functions-and-operators#substr
friendly_name: Bitmask Range

Просмотреть файл

Просмотреть файл

@ -0,0 +1,2 @@
description: ''
friendly_name: Bits28 Active In Range

Просмотреть файл

Просмотреть файл

@ -0,0 +1,2 @@
description: ''
friendly_name: Bits28 Days Since Seen

Просмотреть файл

Просмотреть файл

@ -0,0 +1,2 @@
description: ''
friendly_name: Bits28 From String

Просмотреть файл

Просмотреть файл

@ -0,0 +1,2 @@
description: ''
friendly_name: Bits28 Range

Просмотреть файл

Просмотреть файл

@ -0,0 +1,2 @@
description: ''
friendly_name: Bits28 Retention

Просмотреть файл

Просмотреть файл

@ -0,0 +1,2 @@
description: ''
friendly_name: Bits28 To Dates

Просмотреть файл

Просмотреть файл

@ -0,0 +1,2 @@
description: ''
friendly_name: Bits28 To String

Просмотреть файл

Просмотреть файл

@ -0,0 +1,3 @@
description: Given a BYTE and an INT64, return whether the user was active that many
weeks ago. NULL input returns NULL output.
friendly_name: Bits To Active N Weeks Ago

Просмотреть файл

@ -0,0 +1,3 @@
description: Given a BYTE, get the number of days the user was seen. NULL input returns
NULL output.
friendly_name: Bits To Days Seen

Просмотреть файл

Просмотреть файл

@ -0,0 +1,7 @@
description: 'Given a BYTES, return the number of days since the client was first
seen. If no bits are set, returns NULL, indicating we don''t know. Otherwise the
result is 0-indexed, meaning that for \x01, it will return 0. Results showed this
being between 5-10x faster than the simpler alternative: CREATE OR REPLACE FUNCTION udf.bits_to_days_since_first_seen(b
BYTES) AS (( SELECT MAX(n) FROM UNNEST(GENERATE_ARRAY(0, 8 * BYTE_LENGTH(b)))
AS n WHERE BIT_COUNT(SUBSTR(b >> n, -1) & b''\x01'') > 0)); See also: bits_to_days_since_seen.sql'
friendly_name: Bits To Days Since First Seen

Просмотреть файл

@ -0,0 +1,7 @@
description: 'Given a BYTES, return the number of days since the client was last seen. If
no bits are set, returns NULL, indicating we don''t know. Otherwise the results
are 0-indexed, meaning \x01 will return 0. Tests showed this being 5-10x faster
than the simpler alternative: CREATE OR REPLACE FUNCTION udf.bits_to_days_since_seen(b
BYTES) AS (( SELECT MIN(n) FROM UNNEST(GENERATE_ARRAY(0, 364)) AS n WHERE
BIT_COUNT(SUBSTR(b >> n, -1) & b''\x01'') > 0)); See also: bits_to_days_since_first_seen.sql'
friendly_name: Bits To Days Since Seen

Просмотреть файл

Просмотреть файл

@ -0,0 +1,2 @@
description: Convert a boolean to 365 bit byte array
friendly_name: Bool To 365 Bits

Просмотреть файл

Просмотреть файл

@ -0,0 +1,3 @@
description: Given histogram h, return TRUE if it has a value in the "true" bucket,
or FALSE if it has a value in the "false" bucket, or NULL otherwise. https://github.com/mozilla/telemetry-batch-view/blob/ea0733c/src/main/scala/com/mozilla/telemetry/utils/MainPing.scala#L309-L317
friendly_name: Boolean Histogram To Boolean

Просмотреть файл

@ -0,0 +1,6 @@
description: We generally want to believe only the first reasonable profile creation
date that we receive from a client. Given bits representing usage from the previous
day and the current day, this function shifts the first argument by one day and
returns either that value if non-zero and non-null, the current day value if non-zero
and non-null, or else 0.
friendly_name: Coalesce Adjacent Days 28 Bits

Просмотреть файл

@ -0,0 +1,7 @@
description: Coalesce previous data's PCD with the new data's PCD. We generally want
to believe only the first reasonable profile creation date that we receive from
a client. Given bytes representing usage from the previous day and the current day,
this function shifts the first argument by one day and returns either that value
if non-zero and non-null, the current day value if non-zero and non-null, or else
0.
friendly_name: Coalesce Adjacent Days 365 Bits

Просмотреть файл

@ -0,0 +1,5 @@
description: Combines two bit patterns. The first pattern represents activity over
a 28-day period ending "yesterday". The second pattern represents activity as observed
today (usually just 0 or 1). We shift the bits in the first pattern by one to set
the new baseline as "today", then perform a bitwise OR of the two patterns.
friendly_name: Combine Adjacent Days 28 Bits

Просмотреть файл

@ -0,0 +1,2 @@
description: ''
friendly_name: Combine Adjacent Days 365 Bits

Просмотреть файл

@ -0,0 +1,7 @@
description: The "clients_last_seen" class of tables represent various types of client
activity within a 28-day window as bit patterns. This function takes in two arrays
of structs (aka maps) where each entry gives the bit pattern for days in which we
saw a ping for a given user in a given key. We combine the bit patterns for the
previous day and the current day, returning a single map. See `udf.combine_experiment_days`
for a more specific example of this approach.
friendly_name: Combine Days Seen Maps

Просмотреть файл

Просмотреть файл

@ -0,0 +1,6 @@
description: The "clients_last_seen" class of tables represent various types of client
activity within a 28-day window as bit patterns. This function takes in two arrays
of structs where each entry gives the bit pattern for days in which we saw a ping
for a given user in a given experiment. We combine the bit patterns for the previous
day and the current day, returning a single array of experiment structs.
friendly_name: Combine Experiment Days

Просмотреть файл

Просмотреть файл

@ -0,0 +1,5 @@
description: 'For a given two-letter ISO 3166-1 alpha-2 country code, returns a string
consisting of two Unicode regional indicator symbols, which is rendered in supporting
fonts (such as in the BigQuery console or STMO) as flag emoji. This is just for
fun. See: - https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2 - https://en.wikipedia.org/wiki/Regional_Indicator_Symbol'
friendly_name: Country Code To Flag

Просмотреть файл

Просмотреть файл

@ -0,0 +1,4 @@
description: Return the frequency, recency, and T from a BYTE array, as defined in
https://lifetimes.readthedocs.io/en/latest/Quickstart.html#the-shape-of-your-data RFM
refers to Recency, Frequency, and Monetary value.
friendly_name: Days Seen Bytes To Rfm

Просмотреть файл

Просмотреть файл

@ -0,0 +1,6 @@
description: Takes in a difference between submission date and profile creation date
and returns a bit pattern representing the profile creation date IFF the profile
date is the same as the submission date or no more than 6 days earlier. Analysis
has shown that client-reported profile creation dates are much less reliable outside
of this range and cannot be used as reliable indicators of new profile creation.
friendly_name: Days Since Created Profile As 28 Bits

Просмотреть файл

@ -0,0 +1,2 @@
description: Rename struct fields in anonymous event tuples to meaningful names.
friendly_name: Deanonymize Event

Просмотреть файл

Просмотреть файл

@ -0,0 +1,2 @@
description: ''
friendly_name: Decode Int64

Просмотреть файл

Просмотреть файл

@ -0,0 +1,2 @@
description: Return an array containing only distinct values of the given array
friendly_name: Dedupe Array

Просмотреть файл

Просмотреть файл

@ -0,0 +1,2 @@
description: ''
friendly_name: Extract Count Histogram Value

Просмотреть файл

@ -0,0 +1,2 @@
description: Extract the document type from a table name e.g. _TABLE_SUFFIX.
friendly_name: Extract Document Type

Просмотреть файл

Просмотреть файл

@ -0,0 +1,2 @@
description: Extract the document version from a table name e.g. _TABLE_SUFFIX.
friendly_name: Extract Document Version

Просмотреть файл

Просмотреть файл

@ -0,0 +1,5 @@
description: This is a performance optimization compared to the more general mozfun.hist.extract
for cases where only the histogram sum is needed. It must support all the same
format variants as mozfun.hist.extract but this simplification is necessary to keep
the main_summary query complexity in check.
friendly_name: Extract Histogram Sum

Просмотреть файл

Просмотреть файл

@ -0,0 +1,2 @@
description: Return a path derived from an error message in `payload_bytes_error`
friendly_name: Extract Schema Validation Path

Просмотреть файл

@ -0,0 +1,6 @@
description: 'Convert the Fenix client_info.app_build-format string to a DATETIME.
May return NULL on failure. The Fenix app_build format is documented here: https://github.com/mozilla-mobile/fenix/blob/c72834479eb3e13ee91f82b529e59aa08392a92d/automation/gradle/versionCode.gradle#L13 In
short it is yDDDHHmm * y is years since 2018 * DDD is day of year, 0-padded, 001-366 *
HH is hour of day, 00-23 * mm is minute of hour, 00-59 After using this you may
wish to DATETIME_TRUNC(result, DAY) for grouping by build date.'
friendly_name: Fenix Build To Datetime

Просмотреть файл

Просмотреть файл

@ -0,0 +1,5 @@
description: Convert geoip lookup fields to a struct, replacing '??' with NULL. Returns
NULL if required field country would be NULL. Replaces '??' with NULL because
'??' is a placeholder that may be used if there was an issue during geoip lookup
in hindsight.
friendly_name: Geo Struct

Некоторые файлы не были показаны из-за слишком большого количества измененных файлов Показать больше