Use zetasql to find sql dependencies (#1802)

This commit is contained in:
Daniel Thorn 2021-02-17 11:48:40 -08:00 коммит произвёл GitHub
Родитель 2306620c8d
Коммит f0ec77e27d
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
15 изменённых файлов: 268 добавлений и 13 удалений

Просмотреть файл

@ -11,7 +11,7 @@ jobs:
keys:
# when lock files change, use increasingly general
# patterns to restore cache
- &cache_key
- &python_cache_key
# yamllint disable-line rule:line-length
python-3.8-packages-v1-{{ .Branch }}-{{ checksum "requirements.in" }}-{{ checksum "requirements.txt" }}
# yamllint disable-line rule:line-length
@ -35,7 +35,7 @@ jobs:
- save_cache:
paths:
- venv/
key: *cache_key
key: *python_cache_key
verify-format-sql:
docker: *docker
steps:
@ -88,6 +88,22 @@ jobs:
fi
- *restore_venv_cache
- *build
- &restore_mvn_cache
restore_cache:
keys:
# when lock files change, use increasingly general
# patterns to restore cache
- &mvn_cache_key
maven-packages-v1-{{ .Branch }}-{{ checksum "pom.xml" }}
- maven-packages-v1-{{ .Branch }}-
- maven-packages-v1-
- &java_deps
run:
name: Install maven and java and download dependency jars
command: |
apt update
apt install -y maven default-jdk-headless
mvn dependency:copy-dependencies
- run:
name: PyTest Integration Test
# Google's client libraries will check for
@ -97,7 +113,11 @@ jobs:
command: |
export GOOGLE_APPLICATION_CREDENTIALS="/tmp/gcp.json"
echo "$GCLOUD_SERVICE_KEY" > "$GOOGLE_APPLICATION_CREDENTIALS"
PATH="venv/bin:$PATH" script/entrypoint -m integration
PATH="venv/bin:$PATH" script/entrypoint -m 'integration or java'
- save_cache:
paths:
- ~/.m2
key: *mvn_cache_key
validate-dags:
# based on
# https://github.com/mozilla/telemetry-airflow/blob/master/.circleci/config.yml

1
.gitignore поставляемый
Просмотреть файл

@ -5,6 +5,7 @@
.DS_Store
.mypy_cache/
venv/
target/
# ignore locally generated docs
sql/**/docs

Просмотреть файл

@ -6,11 +6,20 @@ RUN apt-get update -qqy && apt-get install -qqy gcc libc-dev
COPY requirements.txt ./
RUN pip install -r requirements.txt
# download java dependencies in separate stage because it requires maven
FROM python:${PYTHON_VERSION}-slim
# add bash for entrypoint
RUN apt-get update -qqy && apt-get install -qqy bash
# man directory is removed in upstream debian:buster-slim, but needed by jdk install
RUN mkdir -p /usr/share/man/man1 && apt-get update -qqy && apt-get install -qqy maven
WORKDIR /app
COPY pom.xml ./
RUN mvn dependency:copy-dependencies
FROM python:${PYTHON_VERSION}-slim
# add bash for entrypoint and jdk for jni access to zetasql
RUN mkdir -p /usr/share/man/man1 && apt-get update -qqy && apt-get install -qqy bash default-jdk-headless
COPY --from=google/cloud-sdk:alpine /google-cloud-sdk /google-cloud-sdk
ENV PATH /google-cloud-sdk/bin:$PATH
COPY --from=1 /app/target/dependency /app/target/dependency
COPY --from=0 /usr/local /usr/local
WORKDIR /app
COPY .bigqueryrc /root/

Просмотреть файл

@ -16,6 +16,7 @@ from ..cli.routine import mozfun, routine
from ..cli.view import view
from ..glam.cli import glam
from ..stripe import stripe_
from ..dependency import dependency
def cli(prog_name=None):
@ -23,6 +24,7 @@ def cli(prog_name=None):
commands = {
"query": query,
"dag": dag,
"dependency": dependency,
"dryrun": dryrun,
"format": format,
"routine": routine,

Просмотреть файл

@ -0,0 +1,86 @@
"""Build and use query dependency graphs."""
import sys
from pathlib import Path
from subprocess import CalledProcessError
from typing import Tuple
import click
import jnius_config
# The classpath must be registered before jnius can be imported, so this runs
# at module import time; jnius itself is only imported lazily further down.
_jar_dir = Path(__file__).parent.parent / "target" / "dependency"
for path in _jar_dir.glob("*.jar"):
    jnius_config.add_classpath(path.resolve().as_posix())
def extract_table_references(sql: str):
    """Return a list of tables referenced in the given SQL.

    Each entry is the dot-joined name of one table reported by the zetasql
    analyzer.

    Raises:
        ImportError: if the zetasql java classes cannot be loaded via jni.
        ValueError: if the SQL can be analyzed neither as a statement nor as
            a script.
    """
    # jnius is imported lazily so that importing this module does not require
    # java to be installed
    import jnius  # noqa: E402

    try:
        analyzer = jnius.autoclass("com.google.zetasql.Analyzer")
        options_class = jnius.autoclass("com.google.zetasql.AnalyzerOptions")
    except jnius.JavaException:
        # callers cannot catch jnius.JavaException, so surface an ImportError
        raise ImportError(
            "failed to import java class via jni, please download java dependencies "
            "with: mvn dependency:copy-dependencies"
        )

    # turn on support for CreateViewStatement and other statement kinds
    opts = options_class()
    opts.getLanguageOptions().setSupportsAllStatementKinds()

    try:
        names = analyzer.extractTableNamesFromStatement(sql, opts)
    except jnius.JavaException:
        # Script extraction is only a fallback: for scripts zetasql incorrectly
        # includes CTE references from subquery expressions, so statement
        # extraction is preferred whenever it succeeds.
        try:
            names = analyzer.extractTableNamesFromScript(sql, opts)
        except jnius.JavaException as java_error:
            # callers cannot catch jnius.JavaException, so surface a ValueError
            raise ValueError(*java_error.args)

    references = []
    for name_parts in names.toArray():
        references.append(".".join(name_parts.toArray()))
    return references
@click.group(help=__doc__)
def dependency():
    """Define the click group that holds the dependency subcommands."""
@dependency.command(
    help="Show table references in sql files.",
)
@click.argument(
    "paths",
    nargs=-1,
    type=click.Path(file_okay=True),
)
def show(paths: Tuple[str, ...]):
    """Show table references in sql files.

    Each path may be a file or a directory; directories are searched
    recursively for ``*.sql`` files. Files whose name ends with
    ``.template.sql`` are skipped. When no paths are given, the ``sql``
    directory is used.

    References are printed to stdout as ``<path>: <table>``. Files without
    references and files that fail to parse are reported on stderr.

    Raises:
        click.ClickException: if jnius or the java dependencies cannot be
            loaded, or, after every path has been processed, if at least one
            file could not be parsed.
    """
    distinct_paths = {
        path
        for parent in map(Path, paths or ["sql"])
        for path in (parent.glob("**/*.sql") if parent.is_dir() else [parent])
        if not path.name.endswith(".template.sql")  # skip templates
    }
    fail = False
    # sorted so that output order is deterministic
    for path in sorted(distinct_paths):
        try:
            table_references = extract_table_references(path.read_text())
        except CalledProcessError as e:
            raise click.ClickException(f"failed to import jnius: {e}") from e
        except ImportError as e:
            raise click.ClickException(*e.args) from e
        except ValueError as e:
            fail = True
            print(f"Failed to parse {path}: {e}", file=sys.stderr)
            # Skip reporting for this file: table_references is either unbound
            # (first file) or still holds the previous file's references.
            continue
        if table_references:
            for table in table_references:
                print(f"{path}: {table}")
        else:
            print(f"{path} contains no table references", file=sys.stderr)
    if fail:
        # keep going through all files first, then signal failure once
        raise click.ClickException("Some paths could not be analyzed")

Просмотреть файл

@ -2,10 +2,12 @@
from google.cloud import bigquery
from google.cloud import storage
from pathlib import Path
import os
import pytest
import random
import string
import subprocess
TEST_BUCKET = "bigquery-etl-integration-test-bucket"
@ -28,10 +30,18 @@ def pytest_collection_modifyitems(config, items):
return
skip_integration = pytest.mark.skip(reason="integration marker not selected")
requires_java = pytest.mark.skipif(
subprocess.call(["which", "javac"], stdout=subprocess.DEVNULL) != 0
or len(list(Path(__file__).parent.glob("target/dependency/*.jar"))) == 0,
reason="requires javac and target/dependency/*.jar from "
"`mvn dependency:copy-dependencies`",
)
for item in items:
if "integration" in item.keywords:
item.add_marker(skip_integration)
if "java" in item.keywords:
item.add_marker(requires_java)
@pytest.fixture

23
pom.xml Normal file
Просмотреть файл

@ -0,0 +1,23 @@
<!--
  Declares the zetasql Java artifacts (zetasql-client and zetasql-jni-channel).
  Run `mvn dependency:copy-dependencies` to download the jars.
  NOTE(review): groupId/artifactId name "mozilla-pipeline-schemas", which looks
  copied from another project; confirm whether they should be renamed.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.mozilla.telemetry</groupId>
<artifactId>mozilla-pipeline-schemas</artifactId>
<version>0.1-SNAPSHOT</version>
<properties>
<!-- single version shared by both zetasql artifacts below -->
<zetasql.version>2020.10.1</zetasql.version>
</properties>
<dependencies>
<dependency>
<groupId>com.google.zetasql</groupId>
<artifactId>zetasql-client</artifactId>
<version>${zetasql.version}</version>
</dependency>
<dependency>
<groupId>com.google.zetasql</groupId>
<artifactId>zetasql-jni-channel</artifactId>
<version>${zetasql.version}</version>
</dependency>
</dependencies>
</project>

Просмотреть файл

@ -10,7 +10,8 @@ filterwarnings =
# Silence: "Your application has authenticated using end user credentials from Google Cloud SDK"
ignore::UserWarning:google.auth
markers =
integration
integration: mark tests that check integration with external services. Skipped when not specifically enabled.
java: mark tests that require java dependencies. Skipped when not specifically enabled and java is not available.
norecursedirs =
venv
script/legacy

Просмотреть файл

@ -23,3 +23,4 @@ pandas==1.2.2
jsonschema==3.2.0
yamllint==1.26.0
pip-tools==5.5.0
pyjnius==1.3.0

Просмотреть файл

@ -107,6 +107,43 @@ click==7.1.2 \
# black
# mozilla-schema-generator
# pip-tools
cython==0.29.21 \
--hash=sha256:0ac10bf476476a9f7ef61ec6e44c280ef434473124ad31d3132b720f7b0e8d2a \
--hash=sha256:0e25c209c75df8785480dcef85db3d36c165dbc0f4c503168e8763eb735704f2 \
--hash=sha256:171b9f70ceafcec5852089d0f9c1e75b0d554f46c882cd4e2e4acaba9bd7d148 \
--hash=sha256:23f3a00b843a19de8bb4468b087db5b413a903213f67188729782488d67040e0 \
--hash=sha256:2922e3031ba9ebbe7cb9200b585cc33b71d66023d78450dcb883f824f4969371 \
--hash=sha256:31c71a615f38401b0dc1f2a5a9a6c421ffd8908c4cd5bbedc4014c1b876488e8 \
--hash=sha256:473df5d5e400444a36ed81c6596f56a5b52a3481312d0a48d68b777790f730ae \
--hash=sha256:497841897942f734b0abc2dead2d4009795ee992267a70a23485fd0e937edc0b \
--hash=sha256:539e59949aab4955c143a468810123bf22d3e8556421e1ce2531ed4893914ca0 \
--hash=sha256:540b3bee0711aac2e99bda4fa0a46dbcd8c74941666bfc1ef9236b1a64eeffd9 \
--hash=sha256:57ead89128dee9609119c93d3926c7a2add451453063147900408a50144598c6 \
--hash=sha256:5c4276fdcbccdf1e3c1756c7aeb8395e9a36874fa4d30860e7694f43d325ae13 \
--hash=sha256:5da187bebe38030325e1c0b5b8a804d489410be2d384c0ef3ba39493c67eb51e \
--hash=sha256:5e545a48f919e40079b0efe7b0e081c74b96f9ef25b9c1ff4cdbd95764426b58 \
--hash=sha256:603b9f1b8e93e8b494d3e89320c410679e21018e48b6cbc77280f5db71f17dc0 \
--hash=sha256:695a6bcaf9e12b1e471dfce96bbecf22a1487adc2ac6106b15960a2b51b97f5d \
--hash=sha256:715294cd2246b39a8edca464a8366eb635f17213e4a6b9e74e52d8b877a8cb63 \
--hash=sha256:7ebaa8800c376bcdae596fb1372cb4232a5ef957619d35839520d2786f2debb9 \
--hash=sha256:856c7fb31d247ce713d60116375e1f8153d0291ab5e92cca7d8833a524ba9991 \
--hash=sha256:8c6e25e9cc4961bb2abb1777c6fa9d0fa2d9b014beb3276cebe69996ff162b78 \
--hash=sha256:9207fdedc7e789a3dcaca628176b80c82fbed9ae0997210738cbb12536a56699 \
--hash=sha256:93f5fed1c9445fb7afe20450cdaf94b0e0356d47cc75008105be89c6a2e417b1 \
--hash=sha256:9ce5e5209f8406ffc2b058b1293cce7a954911bb7991e623564d489197c9ba30 \
--hash=sha256:a0674f246ad5e1571ef29d4c5ec1d6ecabe9e6c424ad0d6fee46b914d5d24d69 \
--hash=sha256:b2f9172e4d6358f33ecce6a4339b5960f9f83eab67ea244baa812737793826b7 \
--hash=sha256:b8a8a31b9e8860634adbca30fea1d0c7f08e208b3d7611f3e580e5f20992e5d7 \
--hash=sha256:b8d8497091c1dc8705d1575c71e908a93b1f127a174b2d472020f3d84263ac28 \
--hash=sha256:c111ac9abdf715762e4fb87395e59d61c0fbb6ce79eb2e24167700b6cfa8ba79 \
--hash=sha256:c4b78356074fcaac04ecb4de289f11d506e438859877670992ece11f9c90f37b \
--hash=sha256:c541b2b49c6638f2b5beb9316726db84a8d1c132bf31b942dae1f9c7f6ad3b92 \
--hash=sha256:c8435959321cf8aec867bbad54b83b7fb8343204b530d85d9ea7a1f5329d5ac2 \
--hash=sha256:ccb77faeaad99e99c6c444d04862c6cf604204fe0a07d4c8f9cbf2c9012d7d5a \
--hash=sha256:e272ed97d20b026f4f25a012b25d7d7672a60e4f72b9ca385239d693cd91b2d5 \
--hash=sha256:e57acb89bd55943c8d8bf813763d20b9099cc7165c0f16b707631a7654be9cad \
--hash=sha256:e93acd1f603a0c1786e0841f066ae7cef014cf4750e3cd06fd03cfdf46361419
# via pyjnius
execnet==1.7.1 \
--hash=sha256:cacb9df31c9680ec5f95553976c4da484d407e85e41c83cb812aa014f0eddc50 \
--hash=sha256:d4efd397930c46415f62f8a31388d6be4f27a91d7550eb79bc64a756e0056547
@ -474,6 +511,28 @@ pyflakes==2.2.0 \
--hash=sha256:0d94e0e05a19e57a99444b6ddcf9a6eb2e5c68d3ca1e98e90707af8152c90a92 \
--hash=sha256:35b2d75ee967ea93b55750aa9edbbf72813e06a66ba54438df2cfac9e3c27fc8
# via flake8
pyjnius==1.3.0 \
--hash=sha256:05d6ad4e64241e4412491bf6502daf8eae40296aa39cd5093c14931e466ebfe3 \
--hash=sha256:0a7f2de85a0036f57f42b1ec7876dc04e1ad4886e5d0bac32d155265652cce6b \
--hash=sha256:162a0c43dfc303a83a8d80e3d6a980f5f10426e9fc0d4bec0e39b4bd1e162f6b \
--hash=sha256:163c3825e70e9ba6f9dd42e8342ad0fee7c99e18e10c0436d8e1dca340c2c5c4 \
--hash=sha256:1e2f231f83159eafd3b3c3e7393148e6f3376f1f786bbceae78bbdda1bdcc9a9 \
--hash=sha256:21b805277a27b1d436ca1c76bc5f269b232801f69a4e4f22bc72ed98b47355c2 \
--hash=sha256:245fe0218f5ec4b7f229955a8785678b98a697776c9e2cc1ac735d61ad08169c \
--hash=sha256:2bfe44f8d674b2826c11c4920fcfbbec4b65c331c309b3e01c1d9527f934fb01 \
--hash=sha256:44f3b1aedcbb4227499c70ba39d77462c437abb2b0cec14f52be568a1df1e0ee \
--hash=sha256:522a2955cf5543f60eafa6f7e63d01240260f7071fdfe4c2d0f8a0ca1c8b2119 \
--hash=sha256:6600ad9d4afa74af4157adcc4748f62e5c2dd4efc2562ece9f1d39274a2259b0 \
--hash=sha256:84eb01a0d4d592171769f56805895cc1825a9a3b3deac0d90909077e8a2f32a3 \
--hash=sha256:95d1741605dcac7abf79f22eb199c3fdcde3581c3629b2f3ed13f98b2d682d29 \
--hash=sha256:97a8fd5917069e892af94fba114cf8760583140aeaa77225d9b81d07a25a976c \
--hash=sha256:c2ffa4aeb2e56fb9f11be6782cc16f14c3743e27c65b1fcf5f514dc9782e5231 \
--hash=sha256:c5bad82180251fa628e74c2a3f4a119a5e60250cc26723b3e190f0c32e08ece3 \
--hash=sha256:d20845e75a2d18224e661d0e2bc2ce9141f17472e685cd6579847b0a7b5da6ad \
--hash=sha256:d65d510bc88f808e29e355f85ce4a0e35d974af01dd8a217eb98c97588b1cc80 \
--hash=sha256:f4fe0629e5a00f3f6104a28275ff4ea56456e11296875e9e73f3447870e20d96 \
--hash=sha256:f8b5c28398dc082419d195fd108647e5665c361f2897b94a09833f324e8987db
# via -r requirements.in
pyparsing==2.4.7 \
--hash=sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1 \
--hash=sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b
@ -602,6 +661,7 @@ six==1.15.0 \
# oauth2client
# packaging
# protobuf
# pyjnius
# python-dateutil
smart_open==4.2.0 \
--hash=sha256:d9f5a0f173ccb9bbae528db5a3804f57145815774f77ef755b9b0f3b4b2a9dcb

Просмотреть файл

@ -1,5 +1,5 @@
CREATE OR REPLACE VIEW
`moz-fx-data-shared-prod.monitoring_derived.shredder_progress_v1`
`moz-fx-data-shared-prod.monitoring_derived.shredder_progress`
AS
WITH shredder AS (
SELECT

Просмотреть файл

@ -11,6 +11,6 @@ CREATE TEMP FUNCTION
);
-- Tests
assert.true(udf_legacy_contains([1, 2, 3], 1)),
asset_false(udf_legacy_contains([1, 2, 3], 5))
SELECT
assert.true(udf_legacy_contains([1, 2, 3], 1)),
assert.false(udf_legacy_contains([1, 2, 3], 5))

Просмотреть файл

@ -10,6 +10,6 @@ CREATE TEMP FUNCTION
WHEN lower(part) = 'day' THEN DATE_TRUNC(d, DAY)
WHEN lower(part) = 'week' THEN DATE_TRUNC(d, WEEK)
WHEN lower(part) = 'month' THEN DATE_TRUNC(d, MONTH)
ELSE ERROR('This function is a legacy compatibility method and should not be used in new queries. Use the BigQuery built-in DATE_TRUNC instead'))
ELSE ERROR('This function is a legacy compatibility method and should not be used in new queries. Use the BigQuery built-in DATE_TRUNC instead')
END
);

Просмотреть файл

@ -7,8 +7,14 @@ This repository uses `pytest`:
# create a venv
python3.8 -m venv venv/
# install requirements
venv/bin/pip install -r requirements.txt
# install pip-tools for managing dependencies
venv/bin/pip install pip-tools -c requirements.in
# install python dependencies with pip-sync (provided by pip-tools)
venv/bin/pip-sync
# install java dependencies with maven
mvn dependency:copy-dependencies
# run pytest with all linters and 4 workers in parallel
venv/bin/pytest --black --docstyle --flake8 --mypy-ignore-missing-imports -n 4

Просмотреть файл

@ -0,0 +1,36 @@
from click.testing import CliRunner
import os
import pytest
from bigquery_etl.dependency import show as dependency_show
@pytest.mark.java
class TestDependency:
    """Exercise the ``dependency show`` command through click's test runner."""

    @pytest.fixture
    def runner(self):
        """Provide a fresh click CliRunner for each test."""
        return CliRunner()

    def test_format_invalid_path(self, runner):
        """A path that does not exist surfaces FileNotFoundError and exit code 1."""
        outcome = runner.invoke(dependency_show, ["not-existing-path.sql"])
        assert outcome.exit_code == 1
        assert isinstance(outcome.exception, FileNotFoundError)

    def test_format(self, runner):
        """References are listed for a single file and for a directory."""
        with runner.isolated_filesystem():
            # single file given directly
            with open("foo.sql", "w") as sql_file:
                sql_file.write("SELECT 1 FROM test")
            outcome = runner.invoke(dependency_show, ["foo.sql"])
            assert outcome.output == "foo.sql: test\n"
            assert outcome.exit_code == 0

            # directory containing two files; output is ordered by path
            os.mkdir("test")
            queries = (
                ("test/foo.sql", "SELECT 1 FROM test_foo"),
                ("test/bar.sql", "SELECT 1 FROM test_bar"),
            )
            for file_name, query in queries:
                with open(file_name, "w") as sql_file:
                    sql_file.write(query)
            outcome = runner.invoke(dependency_show, ["test"])
            expected = "test/bar.sql: test_bar\ntest/foo.sql: test_foo\n"
            assert outcome.output == expected
            assert outcome.exit_code == 0