# bigquery-etl/.circleci/config.yml

---
version: 2.1
orbs:
gcp-gcr: circleci/gcp-gcr@0.13.0
docker: circleci/docker@1.5
python: circleci/python@2.1.1
parameters:
python-version:
type: string
default: '3.10'
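# The python-version parameter is interpolated via
# << pipeline.parameters.python-version >> into the docker image tag and
# the venv cache keys below, so changing the default invalidates the
# package caches for every job at once.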
executors:
ubuntu-machine-executor:
machine:
image: ubuntu-2004:202111-02
jobs:
build:
docker: &docker
- image: python:<< pipeline.parameters.python-version >>
steps:
- checkout
- &restore_venv_cache
restore_cache:
keys:
# when lock files change, use increasingly general
# patterns to restore cache
- &python_cache_key
# yamllint disable-line rule:line-length
python-<< pipeline.parameters.python-version >>-packages-v1-{{ .Branch }}-{{ checksum "requirements.in" }}-{{ checksum "requirements.txt" }}
# yamllint disable-line rule:line-length
- python-<< pipeline.parameters.python-version >>-packages-v1-{{ .Branch }}-{{ checksum "requirements.in" }}-
# yamllint disable-line rule:line-length
- python-<< pipeline.parameters.python-version >>-packages-v1-{{ .Branch }}-
- python-<< pipeline.parameters.python-version >>-packages-v1-main-
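# Each fallback key above is a prefix of the one before it, so a cache
# restored from a more general key still holds most packages and the
# build step below only installs the difference.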
- &build
run:
name: Build
command: |
python3 -m venv venv/
venv/bin/pip install pip-tools --constraint requirements.in
venv/bin/pip-sync --pip-args=--no-deps
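# Note on the build step above: pip-sync makes the venv match
# requirements.txt exactly, uninstalling anything not pinned there;
# --pip-args=--no-deps stops pip from resolving transitive dependencies,
# which the compiled lock file already contains in full.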
- run:
name: Yamllint Test
command: PATH="venv/bin:$PATH" yamllint -c .yamllint.yaml .
- run:
name: PyTest with linters
# integration tests are run in the separate `integration` job;
# SQL and routine tests are split out into the separate `test-sql` job
# because they take the longest to run, and running them in parallel
# speeds up CI
command: |
PATH="venv/bin:$PATH" script/entrypoint --black --flake8 \
--isort --mypy-ignore-missing-imports --pydocstyle \
-m "not (routine or sql or integration)" \
-n 8
- save_cache:
paths:
- venv/
key: *python_cache_key
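# The &docker, &restore_venv_cache and &build anchors defined in this
# job are reused via *-aliases by most jobs below, so the
# checkout/cache/venv boilerplate is written only once.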
verify-format-sql:
docker: *docker
steps:
- checkout
- *restore_venv_cache
- *build
- &attach_generated_sql
attach_workspace:
at: /tmp/workspace
- &copy_generated_sql
run:
name: Move generated-sql into place
command: |
rm -rf sql/
cp -r /tmp/workspace/generated-sql/sql sql
- run:
name: Verify that SQL is correctly formatted
command: |
PATH="venv/bin:$PATH" script/bqetl format --check \
$(git ls-tree -d HEAD --name-only)
verify-requirements:
docker: *docker
steps:
- checkout
- run:
name: Verify that requirements.txt contains the right dependencies for
this Python version
# use `--constraint` with `requirements.in` not `requirements.txt`
# because for pip>=20.3 "Constraints are only allowed to take the form
# of a package name and a version specifier"
command: |
pip install pip-tools --constraint requirements.in
pip-compile --allow-unsafe --generate-hashes --quiet
git diff --exit-code -G '^ *[^# ]' -- requirements.txt
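# -G '^ *[^# ]' makes git diff report requirements.txt only when a
# non-comment line changed, so pip-compile rewriting its header comments
# alone does not fail the check; --exit-code turns any reported change
# into a build failure.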
test-sql:
docker: *docker
steps:
- checkout
- *restore_venv_cache
- *build
- *attach_generated_sql
- &copy_staged_sql
run:
name: Move SQL deployed to stage into place
command: |
rm -rf sql/
cp -r /tmp/workspace/staged-generated-sql/sql sql
rm -rf tests/
cp -r /tmp/workspace/staged-generated-sql/tests tests
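# staged-generated-sql is produced by the deploy-changes-to-stage job,
# which deploys changed artifacts to datasets suffixed with the commit
# SHA, so the SQL tests below run against the staged deployment rather
# than prod.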
- run:
name: Run SQL tests
command: |
if [ -n "$CIRCLE_PR_NUMBER" ]; then
echo "Cannot pass creds to forked PRs," \
"so skipping routine and SQL tests"
else
PATH="venv/bin:$PATH" script/entrypoint -m "routine or sql" -n 8
fi
dry-run-sql:
docker: *docker
steps:
- checkout
- *restore_venv_cache
- *build
- *attach_generated_sql
- *copy_staged_sql
- run:
name: Dry run queries
# yamllint disable rule:line-length
# Dry runs on PRs are executed on sql/bigquery-etl-integration-test.
# Artifacts (queries, views, UDFs) that have changed are moved into the
# bigquery-etl-integration-test folder and deployed to the corresponding
# project. This ensures that dry runs can be executed before changes
# have been deployed to prod (bigquery-etl-integration-test is treated
# as a staging environment).
command: |
if [ "$CIRCLE_BRANCH" = main ]; then
echo "Check dry run for all queries because branch is" \
"$CIRCLE_BRANCH"
PATHS=sql
elif git log --format=%B --no-merges -n 1 |
grep -qF '[run-tests]'; then
echo "Check dry run for all queries because [run-tests] in" \
"commit message"
PATHS=sql
else
PATHS="sql/bigquery-etl-integration-test"
fi
echo $PATHS
PATH="venv/bin:$PATH" script/bqetl dryrun --validate-schemas $PATHS
# yamllint enable rule:line-length
validate-backfills:
docker: *docker
steps:
- checkout
- *restore_venv_cache
- *build
- *attach_generated_sql
- *copy_staged_sql
- run:
name: Verify that backfill.yaml files are valid
command: |
PATH="venv/bin:$PATH" script/bqetl backfill validate
validate-metadata:
docker: *docker
steps:
- checkout
- *restore_venv_cache
- *build
- *attach_generated_sql
- *copy_staged_sql
- run:
name: Verify that metadata files are valid
command: |
# TODO: Add check here to make sure all queries have metadata.yaml
PATH="venv/bin:$PATH" script/bqetl query validate \
--respect-dryrun-skip
integration:
docker: *docker
steps:
- checkout
- &skip_forked_pr
run:
name: Early return if this build is from a forked PR
command: |
if [ -n "$CIRCLE_PR_NUMBER" ]; then
echo "Cannot pass creds to forked PRs," \
"so marking this step successful"
circleci-agent step halt
fi
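# `circleci-agent step halt` ends the job early and marks it successful,
# so forked PRs (which cannot receive credentials) pass without running
# the credentialed steps.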
- *restore_venv_cache
- *build
- run:
name: PyTest Integration Test
# yamllint disable rule:line-length
command: |
PATH="venv/bin:$PATH" script/entrypoint -m 'integration' -n 8
generate-dags:
docker: *docker
steps:
- checkout
- *restore_venv_cache
- *build
- *attach_generated_sql
- *copy_generated_sql
- run:
name: Generate DAGs
command: |
PATH="venv/bin:$PATH" script/bqetl dag generate
cp -R dags /tmp/workspace/generated-sql
- run:
name: Verify that DAGs were correctly generated and are up-to-date
command: |
git diff --exit-code dags/
diff <(git ls-files dags/*.py) <(ls dags/*.py)
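# `git diff` above catches committed DAGs that are stale relative to the
# generator; the ls-files/ls comparison catches DAG files present on
# disk but never committed, or committed but no longer generated.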
# this step overwrites part of the workspace content produced by
# generate-sql; the behaviour is additive: the generated DAGs are simply
# added to the generated-sql output
- persist_to_workspace:
root: /tmp/workspace
paths:
- generated-sql
validate-dags:
executor:
name: python/default
tag: 3.10.12
steps:
- checkout
- run:
name: Early return when job not modified
command: |
if [ "$CIRCLE_BRANCH" = main ]; then
echo "Run job because branch is $CIRCLE_BRANCH"
elif git log --format=%B --no-merges -n 1 |
grep -qF '[run-tests]'; then
echo "Run job because [run-tests] in commit message"
elif ! git diff --quiet origin/main... \
-- "$(git rev-parse --show-toplevel)"/{.circleci,dags}; then
echo "Run job because .circleci/ and/or dags/ were modified" \
"since branching off main"
else
echo "Skipping job because .circleci/ and dags/ were not modified"
circleci-agent step halt
fi
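# `git diff --quiet origin/main...` uses the three-dot form, which diffs
# against the merge-base with origin/main, so only changes made on this
# branch count as modifications, not unrelated churn on main.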
- run:
name: Pull telemetry-airflow
command: |
git clone https://github.com/mozilla/telemetry-airflow.git ~/telemetry-airflow
- run:
name: Replace telemetry-airflow DAGs with BigQuery ETL DAGs
command: |
rm -f ~/telemetry-airflow/dags/* || true
cp -a dags/. ~/telemetry-airflow/dags/
- *attach_generated_sql
- *copy_generated_sql
- run:
name: Install telemetry-airflow dependencies
command: |
cd ~/telemetry-airflow
virtualenv .venv
source .venv/bin/activate
pip install -r requirements.txt
- run:
name: 🧪 Test valid DAGs
command: |
cd ~/telemetry-airflow
source .venv/bin/activate
python -m pytest tests/dags/test_dag_validity.py --junitxml=~/telemetry-airflow/test-results/junit.xml
- store_test_results:
path: ~/telemetry-airflow/test-results/junit.xml
validate-docs:
docker: *docker
steps:
- checkout
- *restore_venv_cache
- *build
- *attach_generated_sql
- *copy_generated_sql
- run:
name: Validate doc examples
command: |
PATH="venv/bin:$PATH" script/bqetl routine validate --docs-only
validate-views:
docker: *docker
steps:
- checkout
- *restore_venv_cache
- *build
- *attach_generated_sql
- *copy_staged_sql
- run:
name: Validate views
command: PATH="venv/bin:$PATH" script/bqetl view validate
docs:
docker: *docker
steps:
- checkout
- *skip_forked_pr
- *restore_venv_cache
- *build
- *attach_generated_sql
- add_ssh_keys:
fingerprints: "22:b9:3c:1b:82:ab:3f:e4:b5:79:70:d1:7b:b9:28:d2"
- run:
name: Build and deploy docs
command: |
rm -r sql/ && cp -r /tmp/workspace/generated-sql/sql sql/
PATH="venv/bin:$PATH" script/bqetl docs generate \
--output_dir=generated_docs/
cd generated_docs/
PATH="../venv/bin:$PATH" mkdocs gh-deploy \
-m "[ci skip] Deployed {sha} with MkDocs version: {version}"
generate-sql:
docker: *docker
steps:
- checkout
- *restore_venv_cache
- *build
- run:
name: Generate SQL content
command: |
mkdir -p /tmp/workspace/generated-sql
cp -r sql/ /tmp/workspace/generated-sql/sql
# Don't depend on dry run for PRs
PATH="venv/bin:$PATH" script/bqetl generate all \
--output-dir /tmp/workspace/generated-sql/sql/ \
--target-project moz-fx-data-shared-prod
PATH="venv/bin:$PATH" script/bqetl query render \
--sql-dir /tmp/workspace/generated-sql/sql/ \
--output-dir /tmp/workspace/generated-sql/sql/ \
/tmp/workspace/generated-sql/sql/
PATH="venv/bin:$PATH" script/bqetl dependency record \
--skip-existing \
"/tmp/workspace/generated-sql/sql/"
- persist_to_workspace:
root: /tmp/workspace
paths:
- generated-sql
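# persist_to_workspace/attach_workspace is how generated SQL flows
# between jobs: this job writes /tmp/workspace/generated-sql once and
# downstream jobs attach the workspace instead of regenerating it.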
deploy-changes-to-stage:
docker: *docker
steps:
- checkout
- *skip_forked_pr
- *restore_venv_cache
- *build
- *attach_generated_sql
- *copy_generated_sql
- add_ssh_keys:
fingerprints: "22:b9:3c:1b:82:ab:3f:e4:b5:79:70:d1:7b:b9:28:d2"
- run:
name: Pull in generated-sql branch from remote
command: |
ssh-keyscan -t rsa github.com >> ~/.ssh/known_hosts
git clone --single-branch --branch generated-sql \
git@github.com:mozilla/bigquery-etl \
generated-sql
- run:
name: Deploy changes to stage
command: |
export GOOGLE_APPLICATION_CREDENTIALS="/tmp/gcp.json"
echo "$GCLOUD_SERVICE_KEY" > "$GOOGLE_APPLICATION_CREDENTIALS"
PATHS="$(git diff --no-index --name-only --diff-filter=d generated-sql/sql sql)" || true
echo $PATHS
PATH="venv/bin:$PATH" script/bqetl stage deploy \
--dataset-suffix=$CIRCLE_SHA1 \
--remove-updated-artifacts \
$PATHS
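# `git diff --no-index` compares two paths on disk rather than the index
# and exits 1 when they differ, which is the expected case here; the
# `|| true` keeps that exit code from failing the step.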
- run:
name: Copy generated SQL to temporary stage directory
command: |
mkdir -p /tmp/workspace/staged-generated-sql
cp -r sql/ /tmp/workspace/staged-generated-sql/sql
cp -r tests/ /tmp/workspace/staged-generated-sql/tests
- persist_to_workspace:
root: /tmp/workspace
paths:
- staged-generated-sql
push-generated-sql:
docker: *docker
steps:
- *attach_generated_sql
- add_ssh_keys:
fingerprints: "22:b9:3c:1b:82:ab:3f:e4:b5:79:70:d1:7b:b9:28:d2"
- run:
name: Push to generated-sql branch
command: |
ssh-keyscan -t rsa github.com >> ~/.ssh/known_hosts
git config --global user.name "CircleCI generate-sql job"
git config --global user.email "dataops+generated-sql@mozilla.com"
git clone --single-branch --branch generated-sql \
git@github.com:mozilla/bigquery-etl \
generated-sql
cd generated-sql/
rm -rf sql/
cp -r /tmp/workspace/generated-sql/sql sql
rm -rf dags/
cp -r /tmp/workspace/generated-sql/dags dags
git add .
git commit -m "Auto-push due to change on main branch [ci skip]" \
&& git push \
|| echo "Skipping push since it looks like there were no changes"
deploy:
executor: ubuntu-machine-executor
steps:
- checkout
- *attach_generated_sql
- *copy_generated_sql
- docker/check:
docker-password: DOCKER_PASS
docker-username: DOCKER_USER
- docker/build: &public-image
image: ${CIRCLE_PROJECT_USERNAME+$CIRCLE_PROJECT_USERNAME/}${CIRCLE_PROJECT_REPONAME:-bigquery-etl}
tag: ${CIRCLE_TAG:-latest}
- docker/push: *public-image
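# The image and tag above use shell parameter expansion (evaluated when
# the orb runs its docker commands): ${VAR+alt} substitutes `alt` only
# when VAR is set, prepending the "user/" prefix, and ${VAR:-default}
# falls back so untagged builds publish as :latest.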
private-generate-sql:
docker: *docker
steps:
- checkout
- *skip_forked_pr
- *restore_venv_cache
- *build
- add_ssh_keys:
# deploy key to private-bigquery-etl
fingerprints: "9d:1e:af:52:78:2c:e8:ec:33:4c:db:cd:5a:ff:70:0a"
- run:
name: Install rsync
command: |
apt update
apt install -y rsync
- run:
name: Pull down private SQL content
command: |
ssh-keyscan -t rsa github.com >> ~/.ssh/known_hosts
git clone --single-branch --branch main \
git@github.com:mozilla/private-bigquery-etl.git \
~/private-bigquery-etl
rsync --archive ~/private-bigquery-etl/sql/ sql/
- run:
name: Generate SQL content
command: |
mkdir -p /tmp/workspace/private-generated-sql
cp -r sql/ /tmp/workspace/private-generated-sql/sql
# Don't depend on dry run for PRs
PATH="venv/bin:$PATH" script/bqetl generate all \
--output-dir /tmp/workspace/private-generated-sql/sql/ \
--target-project moz-fx-data-shared-prod
PATH="venv/bin:$PATH" script/bqetl dependency record \
--skip-existing \
"/tmp/workspace/private-generated-sql/sql/"
- persist_to_workspace:
root: /tmp/workspace
paths:
- private-generated-sql
push-private-generated-sql:
docker: *docker
steps:
- *attach_generated_sql
- add_ssh_keys:
fingerprints: "9d:1e:af:52:78:2c:e8:ec:33:4c:db:cd:5a:ff:70:0a"
- run:
name: Push to private-generated-sql branch
# yamllint disable rule:line-length
command: |
ssh-keyscan -t rsa github.com >> ~/.ssh/known_hosts
git config --global user.name "CircleCI private-generate-sql job"
git config --global user.email "dataops+private-generated-sql@mozilla.com"
git clone --single-branch --branch private-generated-sql \
git@github.com:mozilla/private-bigquery-etl \
private-generated-sql
cd private-generated-sql/
rm -rf sql/
cp -r /tmp/workspace/private-generated-sql/sql sql
git add .
git commit -m "Auto-push due to change on main branch [ci skip]" \
&& git push \
|| echo "Skipping push since it looks like there were no changes"
# yamllint enable rule:line-length
deploy-to-private-gcr:
executor: ubuntu-machine-executor
steps:
- checkout
- *attach_generated_sql
- run:
name: Move generated-sql into place
command: |
rm -rf sql/
cp -r /tmp/workspace/private-generated-sql/sql sql
- gcp-gcr/gcr-auth
- gcp-gcr/build-image: &private-image
image: bigquery-etl
tag: ${CIRCLE_TAG:-latest}
- gcp-gcr/push-image: *private-image
main-generate-sql-and-dags:
docker: *docker
steps:
- checkout
- run:
name: Switch to main branch
command: |
git remote add mozilla git@github.com:mozilla/bigquery-etl
git fetch mozilla main
git checkout mozilla/main
- attach_workspace:
at: /tmp/workspace
- *restore_venv_cache
- *build
- run:
name: Generate SQL content
command: |
export PATH="venv/bin:$PATH"
./script/bqetl generate all \
--target-project moz-fx-data-shared-prod
./script/bqetl dependency record \
--skip-existing \
"sql/"
./script/bqetl dag generate
mkdir -p /tmp/workspace/main-generated-sql
cp -r sql/ /tmp/workspace/main-generated-sql/sql
cp -r dags/ /tmp/workspace/main-generated-sql/dags
- persist_to_workspace:
root: /tmp/workspace
paths:
- main-generated-sql
generate-diff:
docker: *docker
steps:
- attach_workspace:
at: /tmp/workspace
- run:
name: Generate diff
command: |
diff -bur --no-dereference \
/tmp/workspace/main-generated-sql/sql/ /tmp/workspace/generated-sql/sql/ \
> /tmp/workspace/generated-sql/sql.diff || true
diff -bur --no-dereference \
/tmp/workspace/main-generated-sql/dags/ /tmp/workspace/generated-sql/dags/ \
>> /tmp/workspace/generated-sql/sql.diff || true
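# diff flags: -b ignores changes in the amount of whitespace, -u emits
# unified output, -r recurses, and --no-dereference compares symlinks
# themselves; `|| true` is needed because diff exits 1 whenever the
# trees differ.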
- persist_to_workspace:
root: /tmp/workspace
paths:
- generated-sql
post-diff:
docker:
- image: circleci/node:8.10.0
steps:
- checkout
- attach_workspace:
at: /tmp/workspace
- run: npm i circle-github-bot
- run: .circleci/post-diff.js
- store_artifacts:
path: /tmp/integration
destination: /app/integration
reset-stage-env:
docker: *docker
steps:
- checkout
- *skip_forked_pr
- *build
- run:
name: "Delete stage datasets"
command: |
export GOOGLE_APPLICATION_CREDENTIALS="/tmp/gcp.json"
echo "$GCLOUD_SERVICE_KEY" > "$GOOGLE_APPLICATION_CREDENTIALS"
PATH="venv/bin:$PATH" script/bqetl stage clean --dataset-suffix=$CIRCLE_SHA1 --delete-expired
manual-trigger-required-for-fork:
docker: *docker
steps:
- &skip_upstream
run:
name: Early return if this build is running on upstream
command: |
if [ -n "$CIRCLE_PR_NUMBER" ]; then
echo "Build on fork"
else
echo "Build on upstream"
circleci-agent step halt
fi
- checkout
- run:
name: Manually trigger integration tests for fork
# yamllint disable rule:line-length
command: |
apt update
apt install jq -y
CIRCLE_PR_BRANCH=$(curl -s https://api.github.com/repos/${CIRCLE_PROJECT_USERNAME}/${CIRCLE_PROJECT_REPONAME}/pulls/${CIRCLE_PR_NUMBER} | jq -r '.head.label')
echo "Integration tests for this fork need to be triggered manually"
echo "Users with write access to the repository can trigger" \
"integration tests by following these steps: "
echo " Open the following page:"
echo " https://github.com/mozilla/bigquery-etl/actions/workflows/push-to-upstream.yml"
echo " Choose the 'Run workflow' dropdown and provide '$CIRCLE_PR_BRANCH' as parameter."
exit 1
# yamllint enable rule:line-length
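# The deliberate `exit 1` above keeps this job red for forked PRs until
# a maintainer pushes the branch upstream via the linked GitHub Action,
# at which point CI runs with credentials.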
workflows:
version: 2
build:
jobs: &build_jobs
- manual-trigger-required-for-fork
- build:
context: data-eng-circleci-tests
- verify-format-sql:
requires:
- generate-sql
- deploy-changes-to-stage:
requires:
- generate-sql
- verify-requirements
- test-sql:
context: data-eng-circleci-tests
requires:
- deploy-changes-to-stage
- dry-run-sql:
requires:
- deploy-changes-to-stage
- validate-metadata:
requires:
- deploy-changes-to-stage
- integration
- validate-backfills:
requires:
- deploy-changes-to-stage
- validate-dags:
requires:
- generate-dags
- validate-docs:
requires:
- generate-sql
- validate-views:
requires:
- deploy-changes-to-stage
- generate-sql
- main-generate-sql-and-dags:
filters:
branches:
ignore: main
- generate-diff:
requires:
- generate-dags
- main-generate-sql-and-dags
filters:
branches:
ignore: main
- post-diff:
requires:
- generate-diff
filters:
branches:
ignore: main
- generate-dags:
requires:
- generate-sql
- docs:
requires:
- generate-sql
filters:
branches:
only: main
- push-generated-sql:
requires:
- validate-dags
filters:
branches:
only:
- main
- reset-stage-env:
requires:
- push-generated-sql
- test-sql
- validate-views
- validate-docs
- validate-metadata
- dry-run-sql
- deploy:
context: data-eng-bigquery-etl-dockerhub
requires:
- generate-sql
# Public image must be pushed after the private one because of
# webhooks used in Ops logic. For details, see:
# https://bugzilla.mozilla.org/show_bug.cgi?id=1715628#c0
- deploy-to-private-gcr
filters:
branches:
only:
- main
# The following "private" jobs are basically clones of the public jobs
# for generate-sql, deploy, and push-generated-sql, except that they pull
# in some additional content from an internal Mozilla repository for
# cases where ETL code cannot be public. Although the CI logic is
# consolidated in this public repository, note that we are both pulling
# from the internal repository and pushing generated results back to
# a branch on that internal repository, which may be initially
# surprising.
- private-generate-sql
- push-private-generated-sql:
requires:
- private-generate-sql
filters:
branches:
only:
- main
- deploy-to-private-gcr:
context: data-eng-airflow-gcr
requires:
- private-generate-sql
# can't run in parallel because CIRCLE_BUILD_NUM would be the same
- build
- generate-sql
filters:
branches:
only:
- main
nightly:
# Run after schema-generator to ensure we are up-to-date
triggers:
- schedule:
cron: "0 5 * * *"
filters:
branches:
only:
- main
jobs: *build_jobs
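# The nightly workflow reuses the same job list via the *build_jobs
# alias; since the schedule only fires on main, the branch-filtered jobs
# above behave as they would for a push to main.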