From 70a355a0dd6155381232a08e2b200a4e16617fc2 Mon Sep 17 00:00:00 2001 From: Anna Scholtz Date: Wed, 6 Mar 2024 15:06:29 -0800 Subject: [PATCH] =?UTF-8?q?Require=20authentication=20for=20dry=20run=20fu?= =?UTF-8?q?nction=20and=20run=20gcloud=20auth=20when=20=E2=80=A6=20(#5171)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Require authentication for dry run function and run gcloud auth when not logged in * authenticate step in CI, remove interactive gcloud auth * Skip dryrun for ltv_state_values_v2 * Refactor skip_fork in CI, clarify login requirements --- .circleci/workflows.yml | 72 +++++++++++++++++++--------------- README.md | 2 +- bigquery_etl/backfill/utils.py | 2 +- bigquery_etl/cli/backfill.py | 2 +- bigquery_etl/cli/check.py | 2 +- bigquery_etl/cli/dryrun.py | 4 +- bigquery_etl/cli/query.py | 10 ++--- bigquery_etl/dryrun.py | 31 ++++++++++++++- bqetl_project.yaml | 2 + 9 files changed, 85 insertions(+), 42 deletions(-) diff --git a/.circleci/workflows.yml b/.circleci/workflows.yml index da95b06d9d..1003de0157 100644 --- a/.circleci/workflows.yml +++ b/.circleci/workflows.yml @@ -119,9 +119,25 @@ jobs: - << pipeline.parameters.validate-bqetl >> - << pipeline.parameters.deploy >> steps: + - &skip_forked_pr + run: + name: Early return if this build is from a forked PR + command: | + if [ -n "$CIRCLE_PR_NUMBER" ]; then + echo "Cannot pass creds to forked PRs," \ + "so marking this step successful" + circleci-agent step halt + fi - checkout - *restore_venv_cache - *build + - &authenticate + run: + name: Authenticate to GCP + command: | + export GOOGLE_APPLICATION_CREDENTIALS="/tmp/gcp.json" + echo 'export GOOGLE_APPLICATION_CREDENTIALS="/tmp/gcp.json"' >> "$BASH_ENV" + echo "$GCLOUD_SERVICE_KEY" > "$GOOGLE_APPLICATION_CREDENTIALS" - run: name: PyTest with linters # integration tests are run in a separate `integration` step; @@ -143,6 +159,7 @@ jobs: - when: condition: *validate-sql-or-routines steps: + - *skip_forked_pr - checkout - *restore_venv_cache - *build @@ -160,12 +177,7 @@ jobs: - run: name: Run SQL tests command: | - if [ -n "$CIRCLE_PR_NUMBER" ]; then - echo "Cannot pass creds to forked PRs," \ - "so skipping routine and SQL tests" - else PATH="venv/bin:$PATH" script/entrypoint -m sql -n 8 - fi - unless: condition: *validate-sql-or-routines steps: @@ -176,11 +188,13 @@ jobs: - when: condition: *validate-sql-or-routines steps: + - *skip_forked_pr - checkout - *restore_venv_cache - *build - *attach_generated_sql - *copy_staged_sql + - *authenticate - run: name: Dry run queries # yamllint disable rule:line-length @@ -238,11 +252,13 @@ jobs: - when: condition: *validate-sql steps: + - *skip_forked_pr - checkout - *restore_venv_cache - *build - *attach_generated_sql - *copy_staged_sql + - *authenticate - run: name: Verify that metadata files are valid command: | @@ -259,16 +275,8 @@ jobs: - when: condition: *validate-bqetl steps: + - *skip_forked_pr - checkout - - &skip_forked_pr - run: - name: Early return if this build is from a forked PR - command: | - if [ -n "$CIRCLE_PR_NUMBER" ]; then - echo "Cannot pass creds to forked PRs," \ - "so marking this step successful" - circleci-agent step halt - fi - *restore_venv_cache - *build - run: @@ -288,6 +296,7 @@ jobs: - when: condition: *validate-sql steps: + - *skip_forked_pr - checkout - *restore_venv_cache - *build @@ -298,6 +307,7 @@ jobs: command: | rm -rf sql/ cp -r /tmp/workspace/generated-sql/sql sql + - *authenticate - run: name: Generate DAGs command: | @@ -378,6 +388,7 @@ jobs: 
- << pipeline.parameters.validate-routines >> - << pipeline.parameters.deploy >> steps: + - *skip_forked_pr - checkout - *restore_venv_cache - *build @@ -386,12 +397,7 @@ jobs: - run: name: Run routine tests command: | - if [ -n "$CIRCLE_PR_NUMBER" ]; then - echo "Cannot pass creds to forked PRs," \ - "so skipping routine tests" - else - PATH="venv/bin:$PATH" script/entrypoint -m routine -n 8 - fi + PATH="venv/bin:$PATH" script/entrypoint -m routine -n 8 - run: name: Validate doc examples command: | @@ -406,14 +412,17 @@ jobs: - when: condition: *validate-sql-or-routines steps: + - *skip_forked_pr - checkout - *restore_venv_cache - *build - *attach_generated_sql - *copy_staged_sql + - *authenticate - run: name: Validate views - command: PATH="venv/bin:$PATH" script/bqetl view validate + command: | + PATH="venv/bin:$PATH" script/bqetl view validate - unless: condition: *validate-sql-or-routines steps: @@ -424,8 +433,8 @@ jobs: - when: condition: *validate-sql-or-routines steps: - - checkout - *skip_forked_pr + - checkout - *restore_venv_cache - *build - *attach_generated_sql @@ -452,9 +461,11 @@ jobs: - when: condition: *validate-sql-or-routines steps: + - *skip_forked_pr - checkout - *restore_venv_cache - *build + - *authenticate - run: name: Generate SQL content command: | @@ -546,8 +557,8 @@ jobs: - when: condition: *validate-sql-or-routines steps: - - checkout - *skip_forked_pr + - checkout - *restore_venv_cache - *build - *attach_generated_sql @@ -562,13 +573,11 @@ jobs: git clone --single-branch --branch generated-sql \ git@github.com:mozilla/bigquery-etl \ generated-sql + - *authenticate - run: name: Deploy changes to stage command: | if [ "<< pipeline.parameters.skip-stage-deploys >>" = "false" ]; then - export GOOGLE_APPLICATION_CREDENTIALS="/tmp/gcp.json" - echo "$GCLOUD_SERVICE_KEY" > "$GOOGLE_APPLICATION_CREDENTIALS" - PATHS="$(git diff --no-index --name-only --diff-filter=d generated-sql/sql sql)" || true echo $PATHS PATH="venv/bin:$PATH" script/bqetl stage deploy \ @@ -701,10 +710,11 @@ jobs: - when: condition: *deploy steps: - - checkout - *skip_forked_pr + - checkout - *restore_venv_cache - *build + - *authenticate - add_ssh_keys: # deploy key to private-bigquery-etl fingerprints: @@ -823,6 +833,7 @@ jobs: - when: condition: *validate-sql-or-routines steps: + - *skip_forked_pr - checkout - run: name: Switch to main branch @@ -834,6 +845,7 @@ jobs: at: /tmp/workspace - *restore_venv_cache - *build + - *authenticate - run: name: Generate SQL content command: | @@ -973,15 +985,13 @@ jobs: - when: condition: *validate-sql-or-routines steps: - - checkout - *skip_forked_pr + - checkout - *build + - *authenticate - run: name: "Delete stage datasets" command: | - export GOOGLE_APPLICATION_CREDENTIALS="/tmp/gcp.json" - echo "$GCLOUD_SERVICE_KEY" > "$GOOGLE_APPLICATION_CREDENTIALS" - PATH="venv/bin:$PATH" script/bqetl stage clean --dataset-suffix=$CIRCLE_SHA1 --delete-expired - unless: condition: *validate-sql-or-routines diff --git a/README.md b/README.md index 9d4e4dda6d..75be59deb6 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@ For more information, see [https://mozilla.github.io/bigquery-etl/](https://mozi ### GCP CLI tools -- **For Mozilla Employees or Contributors (not in Data Engineering)** - Set up GCP command line tools, [as described on docs.telemetry.mozilla.org](https://docs.telemetry.mozilla.org/cookbooks/bigquery/access.html#using-the-bq-command-line-tool). Note that some functionality (e.g. writing UDFs or backfilling queries) may not be allowed. 
+- **For Mozilla Employees (not in Data Engineering)** - Set up GCP command line tools, [as described on docs.telemetry.mozilla.org](https://docs.telemetry.mozilla.org/cookbooks/bigquery/access.html#using-the-bq-command-line-tool). Note that some functionality (e.g. writing UDFs or backfilling queries) may not be allowed. Run `gcloud auth login --update-adc` to authenticate against GCP. - **For Data Engineering** - In addition to setting up the command line tools, you will want to log in to `shared-prod` if making changes to production systems. Run `gcloud auth login --update-adc --project=moz-fx-data-shared-prod` (if you have not run it previously). ### Installing bqetl diff --git a/bigquery_etl/backfill/utils.py b/bigquery_etl/backfill/utils.py index 6e61eaf3b2..9d7c1fda9e 100644 --- a/bigquery_etl/backfill/utils.py +++ b/bigquery_etl/backfill/utils.py @@ -189,7 +189,7 @@ def get_backfill_entries_to_initiate( bigquery.Client(project="") except DefaultCredentialsError: click.echo( - "Authentication to GCP required. Run `gcloud auth login` " + "Authentication to GCP required. Run `gcloud auth login --update-adc` " "and check that the project is set correctly." ) sys.exit(1) diff --git a/bigquery_etl/cli/backfill.py b/bigquery_etl/cli/backfill.py index 8b458c66a8..03df17d7bb 100644 --- a/bigquery_etl/cli/backfill.py +++ b/bigquery_etl/cli/backfill.py @@ -424,7 +424,7 @@ def complete(ctx, qualified_table_name, sql_dir, project_id): """Complete backfill entry in backfill.yaml file(s).""" if not is_authenticated(): click.echo( - "Authentication to GCP required. Run `gcloud auth login` " + "Authentication to GCP required. Run `gcloud auth login --update-adc` " "and check that the project is set correctly." ) sys.exit(1) diff --git a/bigquery_etl/cli/check.py b/bigquery_etl/cli/check.py index 8a1f91468d..f26009e698 100644 --- a/bigquery_etl/cli/check.py +++ b/bigquery_etl/cli/check.py @@ -199,7 +199,7 @@ def run(ctx, dataset, project_id, sql_dir, marker, dry_run): """Run a check.""" if not is_authenticated(): click.echo( - "Authentication to GCP required. Run `gcloud auth login` " + "Authentication to GCP required. Run `gcloud auth login --update-adc` " "and check that the project is set correctly." ) sys.exit(1) diff --git a/bigquery_etl/cli/dryrun.py b/bigquery_etl/cli/dryrun.py index 3eae6ceaf0..b98f88d42d 100644 --- a/bigquery_etl/cli/dryrun.py +++ b/bigquery_etl/cli/dryrun.py @@ -97,7 +97,9 @@ def dryrun( sys.exit(0) if not use_cloud_function and not is_authenticated(): - click.echo("Not authenticated to GCP. Run `gcloud auth login` to login.") + click.echo( + "Not authenticated to GCP. Run `gcloud auth login --update-adc` to login." + ) sys.exit(1) sql_file_valid = partial( diff --git a/bigquery_etl/cli/query.py b/bigquery_etl/cli/query.py index 5b2345f8c2..da14ffa855 100644 --- a/bigquery_etl/cli/query.py +++ b/bigquery_etl/cli/query.py @@ -731,7 +731,7 @@ def backfill( """Run a backfill.""" if not is_authenticated(): click.echo( - "Authentication to GCP required. Run `gcloud auth login` " + "Authentication to GCP required. Run `gcloud auth login --update-adc` " "and check that the project is set correctly." ) sys.exit(1) @@ -906,7 +906,7 @@ def run( """Run a query.""" if not is_authenticated(): click.echo( - "Authentication to GCP required. Run `gcloud auth login` " + "Authentication to GCP required. Run `gcloud auth login --update-adc` " "and check that the project is set correctly." 
) sys.exit(1) @@ -1667,7 +1667,7 @@ def update( """CLI command for generating the query schema.""" if not is_authenticated(): click.echo( - "Authentication to GCP required. Run `gcloud auth login` " + "Authentication to GCP required. Run `gcloud auth login --update-adc` " "and check that the project is set correctly." ) sys.exit(1) @@ -1752,7 +1752,7 @@ def _update_query_schema_with_downstream( if not is_authenticated(): click.echo( "Cannot update downstream dependencies." - "Authentication to GCP required. Run `gcloud auth login` " + "Authentication to GCP required. Run `gcloud auth login --update-adc` " "and check that the project is set correctly." ) sys.exit(1) @@ -2052,7 +2052,7 @@ def deploy( """CLI command for deploying destination table schemas.""" if not is_authenticated(): click.echo( - "Authentication to GCP required. Run `gcloud auth login` " + "Authentication to GCP required. Run `gcloud auth login --update-adc` " "and check that the project is set correctly." ) sys.exit(1) diff --git a/bigquery_etl/dryrun.py b/bigquery_etl/dryrun.py index bcb92171a5..90073d5188 100644 --- a/bigquery_etl/dryrun.py +++ b/bigquery_etl/dryrun.py @@ -14,6 +14,7 @@ proxy the queries through the dry run service endpoint. import glob import json import re +import sys from enum import Enum from os.path import basename, dirname, exists from pathlib import Path @@ -21,7 +22,10 @@ from typing import Optional, Set from urllib.request import Request, urlopen import click +import google.auth +from google.auth.transport.requests import Request as GoogleAuthRequest from google.cloud import bigquery +from google.oauth2.id_token import fetch_id_token from .config import ConfigLoader from .metadata.parse_metadata import Metadata @@ -69,6 +73,15 @@ class DryRun: except FileNotFoundError: self.metadata = None + from bigquery_etl.cli.utils import is_authenticated + + if not is_authenticated(): + print( + "Authentication to GCP required. Run `gcloud auth login --update-adc` " + "and check that the project is set correctly." + ) + sys.exit(1) + @staticmethod def skipped_files(sql_dir=ConfigLoader.get("default", "sql_dir")) -> Set[str]: """Return files skipped by dry run.""" @@ -160,10 +173,26 @@ class DryRun: dataset = basename(dirname(dirname(self.sqlfile))) try: if self.use_cloud_function: + auth_req = GoogleAuthRequest() + creds, _ = google.auth.default( + scopes=["https://www.googleapis.com/auth/cloud-platform"] + ) + creds.refresh(auth_req) + if hasattr(creds, "id_token"): + # Get token from default credentials for the current environment created via Cloud SDK run + id_token = creds.id_token + else: + # If the environment variable GOOGLE_APPLICATION_CREDENTIALS is set to service account JSON file, + # then ID token is acquired using this service account credentials. 
+ id_token = fetch_id_token(auth_req, self.dry_run_url) + r = urlopen( Request( self.dry_run_url, - headers={"Content-Type": "application/json"}, + headers={ + "Content-Type": "application/json", + "Authorization": f"Bearer {id_token}", + }, data=json.dumps( { "dataset": dataset, diff --git a/bqetl_project.yaml b/bqetl_project.yaml index 55296ef94e..5ab2e06ae3 100644 --- a/bqetl_project.yaml +++ b/bqetl_project.yaml @@ -197,6 +197,8 @@ dry_run: - sql/moz-fx-data-shared-prod/org_mozilla_tiktokreporter/**/*.sql - sql/moz-fx-data-shared-prod/org_mozilla_ios_tiktok_reporter_tiktok_reportershare/**/*.sql - sql/moz-fx-data-shared-prod/org_mozilla_ios_tiktok_reporter/**/*.sql + - sql/moz-fx-data-shared-prod/fenix_derived/ltv_state_values_v1/query.sql + - sql/moz-fx-data-shared-prod/fenix_derived/ltv_state_values_v2/query.sql # Materialized views - sql/moz-fx-data-shared-prod/telemetry_derived/experiment_search_events_live_v1/init.sql - sql/moz-fx-data-shared-prod/telemetry_derived/experiment_events_live_v1/init.sql
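
For readers skimming the diff, the core of this change is the ID-token handshake added to `bigquery_etl/dryrun.py`: when a dry run goes through the Cloud Function, the request now carries an `Authorization: Bearer <id_token>` header. The token is taken from refreshed application-default credentials when they expose one (the `gcloud auth login --update-adc` path) and is otherwise minted from service-account credentials via `fetch_id_token`. The sketch below condenses that logic into a standalone function; `DRY_RUN_URL` and the payload keys are illustrative stand-ins for the values the module actually reads from configuration.

```python
# Condensed sketch of the ID-token flow this patch adds to DryRun.
import json
from urllib.request import Request, urlopen

import google.auth
from google.auth.transport.requests import Request as GoogleAuthRequest
from google.oauth2.id_token import fetch_id_token

# Placeholder; the real endpoint comes from project configuration.
DRY_RUN_URL = "https://example.cloudfunctions.net/bigquery-etl-dryrun"


def call_dry_run_function(dataset: str, query: str) -> dict:
    """POST a query to the authenticated dry-run endpoint."""
    auth_req = GoogleAuthRequest()
    creds, _ = google.auth.default(
        scopes=["https://www.googleapis.com/auth/cloud-platform"]
    )
    creds.refresh(auth_req)
    if hasattr(creds, "id_token"):
        # User credentials created via `gcloud auth login --update-adc`
        # carry an ID token after a refresh.
        id_token = creds.id_token
    else:
        # Service-account credentials (GOOGLE_APPLICATION_CREDENTIALS, as
        # written by the CI `authenticate` step) mint an ID token with the
        # function URL as the audience.
        id_token = fetch_id_token(auth_req, DRY_RUN_URL)
    request = Request(
        DRY_RUN_URL,
        headers={
            "Content-Type": "application/json",
            "Authorization": f"Bearer {id_token}",
        },
        data=json.dumps({"dataset": dataset, "query": query}).encode("utf-8"),
    )
    with urlopen(request) as response:
        return json.load(response)
```

Using the function URL as the `fetch_id_token` audience matches what a Cloud Function deployed with `--no-allow-unauthenticated` expects when it verifies the bearer token.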
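The guard moved into `DryRun.__init__` means any code path that constructs a `DryRun` now fails fast with the same `gcloud auth login --update-adc` hint the CLI commands print. The implementation of `is_authenticated` (imported from `bigquery_etl.cli.utils`) is not shown in this diff; the sketch below is a hypothetical reconstruction that mirrors the equivalent check visible in `bigquery_etl/backfill/utils.py`, where a throwaway `bigquery.Client` is constructed and `DefaultCredentialsError` is caught.

```python
# Hypothetical reconstruction of the shared authentication guard; the real
# is_authenticated lives in bigquery_etl/cli/utils.py and is not part of
# this patch.
import sys

import click
from google.auth.exceptions import DefaultCredentialsError
from google.cloud import bigquery


def is_authenticated() -> bool:
    """Return True if application-default credentials are available."""
    try:
        # Client construction resolves ADC and raises if none are found.
        bigquery.Client(project="")
        return True
    except DefaultCredentialsError:
        return False


def require_auth() -> None:
    """Exit with the standard hint when credentials are missing."""
    if not is_authenticated():
        click.echo(
            "Authentication to GCP required. Run `gcloud auth login --update-adc` "
            "and check that the project is set correctly."
        )
        sys.exit(1)
```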
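A note on why the CI change is this small: the new `authenticate` step only writes `$GCLOUD_SERVICE_KEY` to the file named by `GOOGLE_APPLICATION_CREDENTIALS`, because that environment variable is the first place Application Default Credentials look; locally, `gcloud auth login --update-adc` writes the ADC file instead, so the same Python code serves both environments. A small hypothetical snippet for checking which path a given environment resolved to:

```python
# Debugging helper (not part of the patch): report which kind of
# application-default credentials the current environment resolves to.
import google.auth
from google.oauth2 import service_account
from google.oauth2.credentials import Credentials as UserCredentials

creds, project = google.auth.default()
if isinstance(creds, service_account.Credentials):
    print(f"Service-account ADC for project {project} (the CI path)")
elif isinstance(creds, UserCredentials):
    print(f"User ADC for project {project} (gcloud auth login --update-adc)")
else:
    print(f"Other credential type: {type(creds).__name__}")
```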