# bigquery-etl/.circleci/config.yml

---
version: 2.1
orbs:
gcp-gcr: circleci/gcp-gcr@0.13.0
docker: circleci/docker@1.5
python: circleci/python@2.1.1
parameters:
python-version:
type: string
default: '3.10'
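# The python-version parameter is interpolated via
# << pipeline.parameters.python-version >> into the docker image tag and
# the venv cache keys below, so changing the default invalidates the
# package caches for every job at once.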
executors:
ubuntu-machine-executor:
machine:
image: ubuntu-2004:202111-02
jobs:
build:
docker: &docker
- image: python:<< pipeline.parameters.python-version >>
steps:
- checkout
- &restore_venv_cache
restore_cache:
keys:
# when lock files change, use increasingly general
# patterns to restore cache
- &python_cache_key
# yamllint disable-line rule:line-length
python-<< pipeline.parameters.python-version >>-packages-v1-{{ .Branch }}-{{ checksum "requirements.in" }}-{{ checksum "requirements.txt" }}
# yamllint disable-line rule:line-length
- python-<< pipeline.parameters.python-version >>-packages-v1-{{ .Branch }}-{{ checksum "requirements.in" }}-
# yamllint disable-line rule:line-length
- python-<< pipeline.parameters.python-version >>-packages-v1-{{ .Branch }}-
- python-<< pipeline.parameters.python-version >>-packages-v1-main-
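# Each fallback key above is a prefix of the one before it, so a cache
# restored from a more general key still holds most packages and the
# build step below only installs the difference.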
- &build
run:
name: Build
command: |
python3 -m venv venv/
venv/bin/pip install pip-tools --constraint requirements.in
venv/bin/pip-sync --pip-args=--no-deps
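# Note on the build step above: pip-sync makes the venv match
# requirements.txt exactly, uninstalling anything not pinned there;
# --pip-args=--no-deps stops pip from resolving transitive dependencies,
# which the compiled lock file already contains in full.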
- run:
name: Yamllint Test
command: PATH="venv/bin:$PATH" yamllint -c .yamllint.yaml .
- run:
name: PyTest with linters
# integration tests are run in the separate `integration` job;
# SQL and routine tests are split out into the separate `test-sql` job
# because they take the longest to run, and running them in parallel
# speeds up CI
command: |
PATH="venv/bin:$PATH" script/entrypoint --black --flake8 \
--isort --mypy-ignore-missing-imports --pydocstyle \
-m "not (routine or sql or integration)" \
-n 8
- save_cache:
paths:
- venv/
key: *python_cache_key
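# The &docker, &restore_venv_cache and &build anchors defined in this
# job are reused via *-aliases by most jobs below, so the
# checkout/cache/venv boilerplate is written only once.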
verify-format-sql:
docker: *docker
steps:
- checkout
- *restore_venv_cache
- *build
- &attach_generated_sql
attach_workspace:
at: /tmp/workspace
- &copy_generated_sql
run:
name: Move generated-sql into place
command: |
rm -rf sql/
cp -r /tmp/workspace/generated-sql/sql sql
- run:
name: Verify that SQL is correctly formatted
command: |
PATH="venv/bin:$PATH" script/bqetl format --check \
$(git ls-tree -d HEAD --name-only)
verify-requirements:
docker: *docker
steps:
- checkout
- run:
name: Verify that requirements.txt contains the right dependencies for
this Python version
# use `--constraint` with `requirements.in` not `requirements.txt`
# because for pip>=20.3 "Constraints are only allowed to take the form
# of a package name and a version specifier"
command: |
pip install pip-tools --constraint requirements.in
pip-compile --allow-unsafe --generate-hashes --quiet
git diff --exit-code -G '^ *[^# ]' -- requirements.txt
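# -G '^ *[^# ]' makes git diff report requirements.txt only when a
# non-comment line changed, so pip-compile rewriting its header comments
# alone does not fail the check; --exit-code turns any reported change
# into a build failure.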
test-sql:
docker: *docker
steps:
- checkout
- *restore_venv_cache
- *build
- *attach_generated_sql
- &copy_staged_sql
run:
name: Move SQL deployed to stage into place
command: |
rm -rf sql/
cp -r /tmp/workspace/staged-generated-sql/sql sql
rm -rf tests/
cp -r /tmp/workspace/staged-generated-sql/tests tests
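# staged-generated-sql is produced by the deploy-changes-to-stage job,
# which deploys changed artifacts to datasets suffixed with the commit
# SHA, so the SQL tests below run against the staged deployment rather
# than prod.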
- run:
name: Run SQL tests
command: |
if [ -n "$CIRCLE_PR_NUMBER" ]; then
echo "Cannot pass creds to forked PRs," \
"so skipping routine and SQL tests"
else
PATH="venv/bin:$PATH" script/entrypoint -m "routine or sql" -n 8
fi
dry-run-sql:
docker: *docker
steps:
- checkout
- *restore_venv_cache
- *build
- *attach_generated_sql
- *copy_staged_sql
- run:
name: Dry run queries
# yamllint disable rule:line-length
# Dry runs on PRs are executed on sql/bigquery-etl-integration-test.
# Artifacts (queries, views, UDFs) that have changed are moved into the
# bigquery-etl-integration-test folder and deployed to the corresponding
# project. This ensures that dry runs can be executed before changes
# have been deployed to prod (bigquery-etl-integration-test is treated
# as a staging environment).
command: |
if [ "$CIRCLE_BRANCH" = main ]; then
echo "Check dry run for all queries because branch is" \
"$CIRCLE_BRANCH"
PATHS=sql
elif git log --format=%B --no-merges -n 1 |
grep -qF '[run-tests]'; then
echo "Check dry run for all queries because [run-tests] in" \
"commit message"
PATHS=sql
else
PATHS="sql/bigquery-etl-integration-test"
fi
echo $PATHS
PATH="venv/bin:$PATH" script/bqetl dryrun --validate-schemas $PATHS
# yamllint enable rule:line-length
validate-backfills:
docker: *docker
steps:
- checkout
- *restore_venv_cache
- *build
- *attach_generated_sql
- *copy_staged_sql
- run:
name: Verify that backfill.yaml files are valid
command: |
PATH="venv/bin:$PATH" script/bqetl backfill validate
validate-metadata:
docker: *docker
steps:
- checkout
- *restore_venv_cache
- *build
- *attach_generated_sql
- *copy_staged_sql
- run:
name: Verify that metadata files are valid
command: |
# TODO: Add check here to make sure all queries have metadata.yaml
PATH="venv/bin:$PATH" script/bqetl query validate \
--respect-dryrun-skip
integration:
docker: *docker
steps:
- checkout
- &skip_forked_pr
run:
name: Early return if this build is from a forked PR
command: |
if [ -n "$CIRCLE_PR_NUMBER" ]; then
echo "Cannot pass creds to forked PRs," \
"so marking this step successful"
circleci-agent step halt
fi
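# `circleci-agent step halt` ends the job early and marks it successful,
# so forked PRs (which cannot receive credentials) pass without running
# the credentialed steps.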
- *restore_venv_cache
- *build
- run:
name: PyTest Integration Test
# yamllint disable rule:line-length
command: |
PATH="venv/bin:$PATH" script/entrypoint -m 'integration' -n 8
generate-dags:
docker: *docker
steps:
- checkout
- *restore_venv_cache
- *build
- *attach_generated_sql
- *copy_generated_sql
- run:
name: Generate DAGs
command: |
PATH="venv/bin:$PATH" script/bqetl dag generate
cp -R dags /tmp/workspace/generated-sql
- run:
name: Verify that DAGs were correctly generated and are up-to-date
command: |
git diff --exit-code dags/
diff <(git ls-files dags/*.py) <(ls dags/*.py)
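# `git diff` above catches committed DAGs that are stale relative to the
# generator; the ls-files/ls comparison catches DAG files present on
# disk but never committed, or committed but no longer generated.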
# this step overwrites part of the workspace content produced by
# generate-sql; the behaviour is additive: the generated DAGs are simply
# added to the generated-sql output
- persist_to_workspace:
root: /tmp/workspace
paths:
- generated-sql
validate-dags:
executor:
name: python/default
tag: 3.10.12
steps:
- checkout
- run:
name: Early return when job not modified
command: |
if [ "$CIRCLE_BRANCH" = main ]; then
echo "Run job because branch is $CIRCLE_BRANCH"
elif git log --format=%B --no-merges -n 1 |
grep -qF '[run-tests]'; then
echo "Run job because [run-tests] in commit message"
elif ! git diff --quiet origin/main... \
-- "$(git rev-parse --show-toplevel)"/{.circleci,dags}; then
echo "Run job because .circleci/ and/or dags/ were modified" \
"since branching off main"
else
echo "Skipping job because .circleci/ and dags/ were not modified"
circleci-agent step halt
fi
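# `git diff --quiet origin/main...` uses the three-dot form, which diffs
# against the merge-base with origin/main, so only changes made on this
# branch count as modifications, not unrelated churn on main.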
- run:
name: Pull telemetry-airflow
command: |
git clone https://github.com/mozilla/telemetry-airflow.git ~/telemetry-airflow
- run:
name: Replace telemetry-airflow DAGs with BigQuery ETL DAGs
command: |
rm -f ~/telemetry-airflow/dags/* || true
cp -a dags/. ~/telemetry-airflow/dags/
- *attach_generated_sql
- *copy_generated_sql
- run:
name: Install telemetry-airflow dependencies
command: |
cd ~/telemetry-airflow
virtualenv .venv
source .venv/bin/activate
pip install -r requirements.txt
- run:
name: 🧪 Test valid DAGs
command: |
cd ~/telemetry-airflow
source .venv/bin/activate
python -m pytest tests/dags/test_dag_validity.py --junitxml=~/telemetry-airflow/test-results/junit.xml
- store_test_results:
path: ~/telemetry-airflow/test-results/junit.xml
validate-docs:
docker: *docker
steps:
- checkout
- *restore_venv_cache
- *build
- *attach_generated_sql
- *copy_generated_sql
- run:
name: Validate doc examples
command: |
PATH="venv/bin:$PATH" script/bqetl routine validate --docs-only
validate-views:
docker: *docker
steps:
- checkout
- *restore_venv_cache
- *build
- *attach_generated_sql
- *copy_staged_sql
- run:
name: Validate views
command: PATH="venv/bin:$PATH" script/bqetl view validate
docs:
docker: *docker
steps:
- checkout
- *skip_forked_pr
- *restore_venv_cache
- *build
- *attach_generated_sql
- add_ssh_keys:
fingerprints: "22:b9:3c:1b:82:ab:3f:e4:b5:79:70:d1:7b:b9:28:d2"
- run:
name: Build and deploy docs
command: |
rm -r sql/ && cp -r /tmp/workspace/generated-sql/sql sql/
PATH="venv/bin:$PATH" script/bqetl docs generate \
--output_dir=generated_docs/
cd generated_docs/
PATH="../venv/bin:$PATH" mkdocs gh-deploy \
-m "[ci skip] Deployed {sha} with MkDocs version: {version}"
generate-sql:
docker: *docker
steps:
- checkout
- *restore_venv_cache
- *build
- run:
name: Generate SQL content
command: |
mkdir -p /tmp/workspace/generated-sql
cp -r sql/ /tmp/workspace/generated-sql/sql
# Don't depend on dry run for PRs
PATH="venv/bin:$PATH" script/bqetl generate all \
--output-dir /tmp/workspace/generated-sql/sql/ \
--target-project moz-fx-data-shared-prod
PATH="venv/bin:$PATH" script/bqetl query render \
--sql-dir /tmp/workspace/generated-sql/sql/ \
--output-dir /tmp/workspace/generated-sql/sql/ \
/tmp/workspace/generated-sql/sql/
PATH="venv/bin:$PATH" script/bqetl dependency record \
--skip-existing \
"/tmp/workspace/generated-sql/sql/"
- persist_to_workspace:
root: /tmp/workspace
paths:
- generated-sql
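# persist_to_workspace/attach_workspace is how generated SQL flows
# between jobs: this job writes /tmp/workspace/generated-sql once and
# downstream jobs attach the workspace instead of regenerating it.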
deploy-changes-to-stage:
docker: *docker
steps:
- checkout
- *skip_forked_pr
- *restore_venv_cache
- *build
- *attach_generated_sql
- *copy_generated_sql
- add_ssh_keys:
fingerprints: "22:b9:3c:1b:82:ab:3f:e4:b5:79:70:d1:7b:b9:28:d2"
- run:
name: Pull in generated-sql branch from remote
command: |
ssh-keyscan -t rsa github.com >> ~/.ssh/known_hosts
git clone --single-branch --branch generated-sql \
git@github.com:mozilla/bigquery-etl \
generated-sql
- run:
name: Deploy changes to stage
command: |
export GOOGLE_APPLICATION_CREDENTIALS="/tmp/gcp.json"
echo "$GCLOUD_SERVICE_KEY" > "$GOOGLE_APPLICATION_CREDENTIALS"
PATHS="$(git diff --no-index --name-only --diff-filter=d generated-sql/sql sql)" || true
echo $PATHS
PATH="venv/bin:$PATH" script/bqetl stage deploy \
--dataset-suffix=$CIRCLE_SHA1 \
--remove-updated-artifacts \
$PATHS
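# `git diff --no-index` compares two paths on disk rather than the index
# and exits 1 when they differ, which is the expected case here; the
# `|| true` keeps that exit code from failing the step.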
- run:
name: Copy generated SQL to temporary stage directory
command: |
mkdir -p /tmp/workspace/staged-generated-sql
cp -r sql/ /tmp/workspace/staged-generated-sql/sql
cp -r tests/ /tmp/workspace/staged-generated-sql/tests
- persist_to_workspace:
root: /tmp/workspace
paths:
- staged-generated-sql
push-generated-sql:
docker: *docker
steps:
- *attach_generated_sql
- add_ssh_keys:
fingerprints: "22:b9:3c:1b:82:ab:3f:e4:b5:79:70:d1:7b:b9:28:d2"
- run:
name: Push to generated-sql branch
command: |
ssh-keyscan -t rsa github.com >> ~/.ssh/known_hosts
git config --global user.name "CircleCI generate-sql job"
git config --global user.email "dataops+generated-sql@mozilla.com"
git clone --single-branch --branch generated-sql \
git@github.com:mozilla/bigquery-etl \
generated-sql
cd generated-sql/
rm -rf sql/
cp -r /tmp/workspace/generated-sql/sql sql
rm -rf dags/
cp -r /tmp/workspace/generated-sql/dags dags
git add .
git commit -m "Auto-push due to change on main branch [ci skip]" \
&& git push \
|| echo "Skipping push since it looks like there were no changes"
deploy:
executor: ubuntu-machine-executor
steps:
- checkout
- *attach_generated_sql
- *copy_generated_sql
- docker/check:
docker-password: DOCKER_PASS
docker-username: DOCKER_USER
- docker/build: &public-image
image: ${CIRCLE_PROJECT_USERNAME+$CIRCLE_PROJECT_USERNAME/}${CIRCLE_PROJECT_REPONAME:-bigquery-etl}
tag: ${CIRCLE_TAG:-latest}
- docker/push: *public-image
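# The image and tag above use shell parameter expansion (evaluated when
# the orb runs its docker commands): ${VAR+alt} substitutes `alt` only
# when VAR is set, prepending the "user/" prefix, and ${VAR:-default}
# falls back so untagged builds publish as :latest.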
private-generate-sql:
docker: *docker
steps:
- checkout
- *skip_forked_pr
- *restore_venv_cache
- *build
- add_ssh_keys:
# deploy key to private-bigquery-etl
fingerprints: "9d:1e:af:52:78:2c:e8:ec:33:4c:db:cd:5a:ff:70:0a"
- run:
name: Install rsync
command: |
apt update
apt install -y rsync
- run:
name: Pull down private SQL content
command: |
ssh-keyscan -t rsa github.com >> ~/.ssh/known_hosts
git clone --single-branch --branch main \
git@github.com:mozilla/private-bigquery-etl.git \
~/private-bigquery-etl
rsync --archive ~/private-bigquery-etl/sql/ sql/
- run:
name: Generate SQL content
command: |
mkdir -p /tmp/workspace/private-generated-sql
cp -r sql/ /tmp/workspace/private-generated-sql/sql
# Don't depend on dry run for PRs
PATH="venv/bin:$PATH" script/bqetl generate all \
--output-dir /tmp/workspace/private-generated-sql/sql/ \
--target-project moz-fx-data-shared-prod
PATH="venv/bin:$PATH" script/bqetl dependency record \
--skip-existing \
"/tmp/workspace/private-generated-sql/sql/"
- persist_to_workspace:
root: /tmp/workspace
paths:
- private-generated-sql
push-private-generated-sql:
docker: *docker
steps:
- *attach_generated_sql
- add_ssh_keys:
fingerprints: "9d:1e:af:52:78:2c:e8:ec:33:4c:db:cd:5a:ff:70:0a"
- run:
name: Push to private-generated-sql branch
# yamllint disable rule:line-length
command: |
ssh-keyscan -t rsa github.com >> ~/.ssh/known_hosts
git config --global user.name "CircleCI private-generate-sql job"
git config --global user.email "dataops+private-generated-sql@mozilla.com"
git clone --single-branch --branch private-generated-sql \
git@github.com:mozilla/private-bigquery-etl \
private-generated-sql
cd private-generated-sql/
rm -rf sql/
cp -r /tmp/workspace/private-generated-sql/sql sql
git add .
git commit -m "Auto-push due to change on main branch [ci skip]" \
&& git push \
|| echo "Skipping push since it looks like there were no changes"
# yamllint enable rule:line-length
deploy-to-private-gcr:
executor: ubuntu-machine-executor
steps:
- checkout
- *attach_generated_sql
- run:
name: Move generated-sql into place
command: |
rm -rf sql/
cp -r /tmp/workspace/private-generated-sql/sql sql
- gcp-gcr/gcr-auth
- gcp-gcr/build-image: &private-image
image: bigquery-etl
tag: ${CIRCLE_TAG:-latest}
- gcp-gcr/push-image: *private-image
main-generate-sql-and-dags:
docker: *docker
steps:
- checkout
- run:
name: Switch to main branch
command: |
git remote add mozilla git@github.com:mozilla/bigquery-etl
git fetch mozilla main
git checkout mozilla/main
- attach_workspace:
at: /tmp/workspace
- *restore_venv_cache
- *build
- run:
name: Generate SQL content
command: |
export PATH="venv/bin:$PATH"
./script/bqetl generate all \
--target-project moz-fx-data-shared-prod
./script/bqetl dependency record \
--skip-existing \
"sql/"
./script/bqetl dag generate
mkdir -p /tmp/workspace/main-generated-sql
cp -r sql/ /tmp/workspace/main-generated-sql/sql
cp -r dags/ /tmp/workspace/main-generated-sql/dags
- persist_to_workspace:
root: /tmp/workspace
paths:
- main-generated-sql
generate-diff:
docker: *docker
steps:
- attach_workspace:
at: /tmp/workspace
- run:
name: Generate diff
command: |
diff -bur --no-dereference \
/tmp/workspace/main-generated-sql/sql/ /tmp/workspace/generated-sql/sql/ \
> /tmp/workspace/generated-sql/sql.diff || true
diff -bur --no-dereference \
/tmp/workspace/main-generated-sql/dags/ /tmp/workspace/generated-sql/dags/ \
>> /tmp/workspace/generated-sql/sql.diff || true
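# diff flags: -b ignores changes in the amount of whitespace, -u emits
# unified output, -r recurses, and --no-dereference compares symlinks
# themselves; `|| true` is needed because diff exits 1 whenever the
# trees differ.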
- persist_to_workspace:
root: /tmp/workspace
paths:
- generated-sql
post-diff:
docker:
- image: circleci/node:8.10.0
steps:
- checkout
- attach_workspace:
at: /tmp/workspace
- run: npm i circle-github-bot
- run: .circleci/post-diff.js
- store_artifacts:
path: /tmp/integration
destination: /app/integration
reset-stage-env:
docker: *docker
steps:
- checkout
- *skip_forked_pr
- *build
- run:
name: "Delete stage datasets"
command: |
export GOOGLE_APPLICATION_CREDENTIALS="/tmp/gcp.json"
echo "$GCLOUD_SERVICE_KEY" > "$GOOGLE_APPLICATION_CREDENTIALS"
PATH="venv/bin:$PATH" script/bqetl stage clean --dataset-suffix=$CIRCLE_SHA1 --delete-expired
manual-trigger-required-for-fork:
docker: *docker
steps:
- &skip_upstream
run:
name: Early return if this build is running on upstream
command: |
if [ -n "$CIRCLE_PR_NUMBER" ]; then
echo "Build on fork"
else
echo "Build on upstream"
circleci-agent step halt
fi
- checkout
- run:
name: Manually trigger integration tests for fork
# yamllint disable rule:line-length
command: |
apt update
apt install jq -y
CIRCLE_PR_BRANCH=$(curl -s https://api.github.com/repos/${CIRCLE_PROJECT_USERNAME}/${CIRCLE_PROJECT_REPONAME}/pulls/${CIRCLE_PR_NUMBER} | jq -r '.head.label')
echo "Integration tests for this fork need to be triggered manually"
echo "Users with write access to the repository can trigger" \
"integration tests by following these steps: "
echo " Open the following page:"
echo " https://github.com/mozilla/bigquery-etl/actions/workflows/push-to-upstream.yml"
echo " Choose the 'Run workflow' dropdown and provide '$CIRCLE_PR_BRANCH' as parameter."
exit 1
# yamllint enable rule:line-length
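# The deliberate `exit 1` above keeps this job red for forked PRs until
# a maintainer pushes the branch upstream via the linked GitHub Action,
# at which point CI runs with credentials.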
workflows:
version: 2
build:
jobs: &build_jobs
- manual-trigger-required-for-fork
- build:
context: data-eng-circleci-tests
- verify-format-sql:
requires:
- generate-sql
- deploy-changes-to-stage:
requires:
- generate-sql
- verify-requirements
- test-sql:
context: data-eng-circleci-tests
requires:
- deploy-changes-to-stage
- dry-run-sql:
requires:
- deploy-changes-to-stage
- validate-metadata:
requires:
- deploy-changes-to-stage
- integration
- validate-backfills:
requires:
- deploy-changes-to-stage
- validate-dags:
requires:
- generate-dags
- validate-docs:
requires:
- generate-sql
- validate-views:
requires:
- deploy-changes-to-stage
- generate-sql
- main-generate-sql-and-dags:
filters:
branches:
ignore: main
- generate-diff:
requires:
- generate-dags
- main-generate-sql-and-dags
filters:
branches:
ignore: main
- post-diff:
requires:
- generate-diff
filters:
branches:
ignore: main
- generate-dags:
requires:
- generate-sql
- docs:
requires:
- generate-sql
filters:
branches:
only: main
- push-generated-sql:
requires:
- validate-dags
filters:
branches:
only:
- main
- reset-stage-env:
requires:
- push-generated-sql
- test-sql
- validate-views
- validate-docs
- validate-metadata
- dry-run-sql
- deploy:
context: data-eng-bigquery-etl-dockerhub
requires:
- generate-sql
# Public image must be pushed after the private one because of
# webhooks used in Ops logic. For details, see:
# https://bugzilla.mozilla.org/show_bug.cgi?id=1715628#c0
- deploy-to-private-gcr
filters:
branches:
only:
- main
# The following "private" jobs are basically clones of the public jobs
# for generate-sql, deploy, and push-generated-sql, except that they pull
# in some additional content from an internal Mozilla repository for
# cases where ETL code cannot be public. Although the CI logic is
# consolidated in this public repository, note that we are both pulling
# from the internal repository and pushing generated results back to
# a branch on that internal repository, which may be initially
# surprising.
- private-generate-sql
- push-private-generated-sql:
requires:
- private-generate-sql
filters:
branches:
only:
- main
- deploy-to-private-gcr:
context: data-eng-airflow-gcr
requires:
- private-generate-sql
# can't run in parallel because CIRCLE_BUILD_NUM would be the same
- build
- generate-sql
filters:
branches:
only:
- main
nightly:
# Run after schema-generator to ensure we are up-to-date
triggers:
- schedule:
cron: "0 5 * * *"
filters:
branches:
only:
- main
jobs: *build_jobs
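# The nightly workflow reuses the same job list via the *build_jobs
# alias; since the schedule only fires on main, the branch-filtered jobs
# above behave as they would for a push to main.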