bigquery-etl/script/glam/generate_glean_sql

#!/bin/bash
set -ex
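
# Generate the GLAM ETL queries for a Glean application. For each glean_ping
# table in the product's stable dataset, a clients_daily_scalar_aggregates
# and a clients_daily_histogram_aggregates query are written under
# $SQL_DIR/$DST_PROJECT/glam_etl/, and bigquery_etl.glam.generate then
# renders the downstream queries and views.
#
# Example invocation (org_mozilla_fenix is an illustrative product name):
#   PRODUCT=org_mozilla_fenix STAGE=all ./script/glam/generate_glean_sql

# Generate the clients_daily_scalar_aggregates query for one stable table,
# removing the output directory again when the table exposes no scalar probes.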
function write_scalars {
    local product=$1
    local dataset=$2
    local table=$3
    local dst_project=$4
    local sql_dir=$5
    local directory="${sql_dir}/${dst_project}/glam_etl/${product}__clients_daily_scalar_aggregates_${table}"
    mkdir -p "$directory"
    if ! python3 -m bigquery_etl.glam.clients_daily_scalar_aggregates \
            --source-table "$dataset.$table" \
            --product "$product" \
            > "$directory/query.sql"; then
        echo "skipping $directory/query.sql: no probes found"
        rm -r "$directory"
    else
        echo "generated $directory/query.sql"
    fi
}
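
# Generate the clients_daily_histogram_aggregates query for one stable table;
# identical to write_scalars, but for histogram probes.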
function write_histograms {
    local product=$1
    local dataset=$2
    local table=$3
    local dst_project=$4
    local sql_dir=$5
    local directory="${sql_dir}/${dst_project}/glam_etl/${product}__clients_daily_histogram_aggregates_${table}"
    mkdir -p "$directory"
    if ! python3 -m bigquery_etl.glam.clients_daily_histogram_aggregates \
            --source-table "$dataset.$table" \
            --product "$product" \
            > "$directory/query.sql"; then
        echo "skipping $directory/query.sql: no probes found"
        rm -r "$directory"
    else
        echo "generated $directory/query.sql"
    fi
}
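
# Discover the glean_ping tables in the product's stable dataset and write
# the daily scalar and histogram aggregate queries for each of them.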
function write_clients_daily_aggregates {
    local product=$1
    local src_project=$2
    local dst_project=$3
    local sql_dir=$4
    local dataset="${product}_stable"
    local qualified="$src_project:$dataset"
    # validate the inputs: fail fast if the stable dataset or its baseline
    # table cannot be found
    if ! bq ls "$qualified" &> /dev/null; then
        echo "could not list $qualified"
        exit 1
    fi
    if ! bq show "$qualified.baseline_v1" &> /dev/null; then
        echo "could not find $qualified.baseline_v1"
        exit 1
    fi
    # e.g. baseline_v1
    local tables
    # GLAM only supports tables with a glean_ping_* schema. The use_counters
    # tables are also excluded: they are not supported, and the query
    # generated for them is too large to run.
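    # Illustrative shape of one entry in the `bq ls --format=json` output,
    # showing only the fields the jq filter below consults (values are
    # examples, not real output):
    #   {"type": "TABLE",
    #    "labels": {"schema_id": "glean_ping_1"},
    #    "tableReference": {"tableId": "baseline_v1"}}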
    tables=$(bq ls --format=json "$qualified" | \
        jq -r '.[] |
            select(.labels.schema_id | test("^glean_ping_[0-9]+")) |
            select(.type == "TABLE") |
            select(.tableReference.tableId | test("^use_counters.*") | not) |
            "\(.tableReference.tableId)"')
    # generate all of the queries in parallel
    for table in $tables; do
        write_scalars "$product" "$dataset" "$table" "$dst_project" "$sql_dir" &
        write_histograms "$product" "$dataset" "$table" "$dst_project" "$sql_dir" &
    done
    # wait for all of the background processes before continuing
    wait
}
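
# Entry point: run from the repository root and resolve the configuration
# from the environment.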
cd "$(dirname "$0")/../.."
error="STAGE must be one of (daily, incremental, all)"
# The project for generating the clients daily tables
src_project=${SRC_PROJECT:-moz-fx-data-shared-prod}
# We may also define the PROJECT as the destination project for backwards
# compatibility.
dst_project=${DST_PROJECT:-${PROJECT:-glam-fenix-dev}}
sql_dir=${SQL_DIR:-sql}
# strip a trailing slash, if present
sql_dir=${sql_dir%/}
product=${PRODUCT?PRODUCT must be defined}
stage=${STAGE?$error}
# ensure the sql directory exists
mkdir -p "$sql_dir/$dst_project/glam_etl"
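
# STAGE controls what is generated: "daily" writes the clients daily
# aggregate queries plus only the daily views, "incremental" renders the
# remaining downstream queries from the existing SQL, and "all" does both.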
if [[ $stage == "daily" ]]; then
    write_clients_daily_aggregates "$product" "$src_project" "$dst_project" "$sql_dir"
    python3 -m bigquery_etl.glam.generate \
        --sql-root "$sql_dir" \
        --project "$dst_project" \
        --prefix "${product}" \
        --daily-view-only
elif [[ $stage == "incremental" ]]; then
    python3 -m bigquery_etl.glam.generate \
        --sql-root "$sql_dir" \
        --project "$dst_project" \
        --prefix "${product}"
elif [[ $stage == "all" ]]; then
    write_clients_daily_aggregates "$product" "$src_project" "$dst_project" "$sql_dir"
    python3 -m bigquery_etl.glam.generate \
        --sql-root "$sql_dir" \
        --project "$dst_project" \
        --prefix "${product}"
else
    echo "$error"
    exit 1
fi