bigquery-etl/bigquery_etl/parse_udf.py

259 строки
8.3 KiB
Python

"""
Machinery for parsing UDFs and tests defined in .sql files.
This should eventually be refactored to a more general library for
parsing UDF dependencies in queries as well.
"""
from dataclasses import dataclass, astuple
import re
import os
from typing import List
import sqlparse
UDF_DIRS = ("udf", "udf_js")
MOZFUN_DIR = ("mozfun",)
UDF_CHAR = "[a-zA-z0-9_]"
TEMP_UDF_RE = re.compile(f"(?:udf|assert)_{UDF_CHAR}+")
PERSISTENT_UDF_RE = re.compile(fr"((?:udf|assert){UDF_CHAR}*)\.({UDF_CHAR}+)")
MOZFUN_UDF_RE = re.compile(fr"({UDF_CHAR}+)\.({UDF_CHAR}+)")
PERSISTENT_UDF_PREFIX = re.compile(
r"CREATE\s+(OR\s+REPLACE\s+)?FUNCTION(\s+IF\s+NOT\s+EXISTS)?", re.IGNORECASE
)
UDF_NAME_RE = re.compile(r"^([a-zA-Z0-9_]+\.)?[a-zA-Z][a-zA-Z0-9_]{0,255}$")
# UDFs defined in mozfun
MOZFUN_UDFS = {
root.split("/")[-2] + "." + root.split("/")[-1]
for udf_dir in MOZFUN_DIR
for root, dirs, files in os.walk(udf_dir)
for filename in files
if not filename.startswith(".") and filename.endswith(".sql")
}
@dataclass
class RawUdf:
"""Representation of the content of a single UDF sql file."""
name: str
dataset: str
filepath: str
definitions: List[str]
tests: List[str]
dependencies: List[str]
@staticmethod
def from_file(filepath):
"""Read in a RawUdf from a SQL file on disk."""
dirpath, basename = os.path.split(filepath)
with open(filepath) as f:
text = f.read()
if basename == "udf.sql":
# mozfun support, all UDFs are stored in udf.sql files which are nested
# into directories denoting the UDF name and dataset
name = os.path.basename(dirpath)
dataset = os.path.basename(os.path.split(dirpath)[0])
else:
name = basename.replace(".sql", "")
dataset = os.path.basename(dirpath)
try:
return RawUdf.from_text(text, dataset, name, filepath)
except ValueError as e:
raise ValueError(str(e) + f" in {filepath}")
@staticmethod
def from_text(text, dataset, name, filepath=None, is_defined=True):
"""Create a RawUdf instance from text.
If is_defined is False, then the UDF does not
need to be defined in the text; it could be
just tests.
"""
sql = sqlparse.format(text, strip_comments=True)
statements = [s for s in sqlparse.split(sql) if s.strip()]
prod_name = name
persistent_name = f"{dataset}.{name}"
temp_name = f"{dataset}_{name}"
internal_name = None
definitions = []
tests = []
for s in statements:
normalized_statement = " ".join(s.lower().split())
if normalized_statement.startswith("create or replace function"):
definitions.append(s)
if persistent_name in normalized_statement:
internal_name = persistent_name
elif normalized_statement.startswith("create temp function"):
definitions.append(s)
if temp_name in normalized_statement:
internal_name = temp_name
else:
tests.append(s)
for name in (prod_name, internal_name):
if is_defined and not UDF_NAME_RE.match(name):
raise ValueError(
f"Invalid UDF name {name}: Must start with alpha char, "
f"limited to chars {UDF_CHAR}, be at most 256 chars long"
)
# find usages of both persistent and temporary UDFs
dependencies = re.findall(PERSISTENT_UDF_RE, "\n".join(definitions))
dependencies = [".".join(t) for t in dependencies]
dependencies.extend(re.findall(TEMP_UDF_RE, "\n".join(definitions)))
if filepath:
# for public UDFs dependencies can live in arbitrary dataset;
# we can check if some known dependency is part of the UDF
# definition instead
_, basename = os.path.split(filepath)
if basename == "udf.sql":
for udf in MOZFUN_UDFS:
if udf in "\n".join(definitions):
dependencies.append(udf)
if is_defined:
if internal_name is None:
raise ValueError(
f"Expected a UDF named {persistent_name} or {temp_name} "
f"to be defined"
)
dependencies.remove(internal_name)
return RawUdf(
internal_name,
dataset,
filepath,
definitions,
tests,
# We convert the list to a set to deduplicate entries,
# but then convert back to a list for stable order.
sorted(set(dependencies)),
)
@dataclass
class ParsedUdf(RawUdf):
"""Parsed representation of a UDF including dependent UDF code."""
tests_full_sql: List[str]
@staticmethod
def from_raw(raw_udf, tests_full_sql):
"""Promote a RawUdf to a ParsedUdf."""
return ParsedUdf(*astuple(raw_udf), tests_full_sql)
def read_udf_dirs(*udf_dirs):
"""Read contents of udf_dirs into dict of RawUdf instances."""
return {
raw_udf.name: raw_udf
for udf_dir in (udf_dirs or UDF_DIRS)
for root, dirs, files in os.walk(udf_dir)
for filename in files
if not filename.startswith(".") and filename.endswith(".sql")
for raw_udf in (RawUdf.from_file(os.path.join(root, filename)),)
}
def parse_udf_dirs(*udf_dirs):
"""Read contents of udf_dirs into ParsedUdf instances."""
# collect udfs to parse
raw_udfs = read_udf_dirs(*udf_dirs)
# prepend udf definitions to tests
for raw_udf in raw_udfs.values():
tests_full_sql = udf_tests_sql(raw_udf, raw_udfs)
yield ParsedUdf.from_raw(raw_udf, tests_full_sql)
def accumulate_dependencies(deps, raw_udfs, udf_name):
"""
Accumulate a list of dependent UDF names.
Given a dict of raw_udfs and a udf_name string, recurse into the
UDF's dependencies, adding the names to deps in depth-first order.
"""
if udf_name not in raw_udfs:
return deps
raw_udf = raw_udfs[udf_name]
for dep in raw_udf.dependencies:
deps = accumulate_dependencies(deps, raw_udfs, dep)
if udf_name in deps:
return deps
else:
return deps + [udf_name]
def udf_usages_in_file(filepath):
"""Return a list of UDF names used in the provided SQL file."""
with open(filepath) as f:
text = f.read()
return udf_usages_in_text(text)
def udf_usages_in_text(text):
"""Return a list of UDF names used in the provided SQL text."""
sql = sqlparse.format(text, strip_comments=True)
udf_usages = PERSISTENT_UDF_RE.findall(sql)
udf_usages = list(map(lambda t: ".".join(t), udf_usages))
udf_usages.extend(TEMP_UDF_RE.findall(sql))
return sorted(set(udf_usages))
def udf_usage_definitions(text, raw_udfs=None):
"""Return a list of definitions of UDFs used in provided SQL text."""
if raw_udfs is None:
raw_udfs = read_udf_dirs()
deps = []
for udf_usage in udf_usages_in_text(text):
deps = accumulate_dependencies(deps, raw_udfs, udf_usage)
return [
statement for udf_name in deps for statement in raw_udfs[udf_name].definitions
]
def persistent_udf_as_temp(raw_udf, raw_udfs=None):
"""Transform persistent UDF into temporary UDF."""
sql = prepend_udf_usage_definitions(raw_udf, raw_udfs)
sql = sub_persistent_udf_names_as_temp(sql)
sql = PERSISTENT_UDF_PREFIX.sub("CREATE TEMP FUNCTION", sql)
return sql
def udf_tests_sql(raw_udf, raw_udfs):
"""
Create tests for testing persistent UDFs.
Persistent UDFs need to be rewritten as temporary UDFs so that changes
can be tested.
"""
tests_full_sql = []
for test in raw_udf.tests:
test_sql = persistent_udf_as_temp(test, raw_udfs)
tests_full_sql.append(test_sql)
return tests_full_sql
def prepend_udf_usage_definitions(text, raw_udfs=None):
"""Prepend definitions of UDFs used to provided SQL text."""
statements = udf_usage_definitions(text, raw_udfs)
return "\n\n".join(statements + [text])
def sub_persistent_udf_names_as_temp(text):
"""Substitute persistent UDF references with temporary UDF references."""
return PERSISTENT_UDF_RE.sub(r"\1_\2", text)