Support range partitioning and remove GLAM init.sql (#5346)

* Add support for range partitioning

* Remove init.sql for GLAM tables
This commit is contained in:
Anna Scholtz 2024-04-08 15:03:43 -07:00 коммит произвёл GitHub
Родитель 75db39c884
Коммит c2b77ba3bd
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: B5690EEEBB952194
10 изменённых файлов: 154 добавлений и 75 удалений

Просмотреть файл

@ -2167,6 +2167,15 @@ def _attach_metadata(query_file_path: Path, table: bigquery.Table) -> None:
),
expiration_ms=metadata.bigquery.time_partitioning.expiration_ms,
)
elif metadata.bigquery and metadata.bigquery.range_partitioning:
table.range_partitioning = bigquery.RangePartitioning(
field=metadata.bigquery.range_partitioning.field,
range_=bigquery.PartitionRange(
start=metadata.bigquery.range_partitioning.range.start,
end=metadata.bigquery.range_partitioning.range.end,
interval=metadata.bigquery.range_partitioning.range.interval,
),
)
if metadata.bigquery and metadata.bigquery.clustering:
table.clustering_fields = metadata.bigquery.clustering.fields

Просмотреть файл

@ -75,6 +75,23 @@ class PartitionMetadata:
return int(self.expiration_days * 86400000)
@attr.s(auto_attribs=True)
class PartitionRange:
    """Bounds and step size describing an integer partition range."""

    # All three values are mandatory; they are passed on unchanged as the
    # start/end/interval of the table's range partitioning.
    start: int = attr.ib()
    end: int = attr.ib()
    interval: int = attr.ib()
@attr.s(auto_attribs=True)
class RangePartitionMetadata:
    """Range partitioning settings for a table."""

    # Required: the start/end/interval of the partition range.
    range: PartitionRange = attr.ib()
    # Optional: name of the column the table is partitioned on.
    field: Optional[str] = attr.ib(default=None)
@attr.s(auto_attribs=True)
class ClusteringMetadata:
"""Metadata for defining BigQuery table clustering."""
@ -91,6 +108,7 @@ class BigQueryMetadata:
"""
time_partitioning: Optional[PartitionMetadata] = attr.ib(None)
range_partitioning: Optional[RangePartitionMetadata] = attr.ib(None)
clustering: Optional[ClusteringMetadata] = attr.ib(None)
@ -406,12 +424,13 @@ class Metadata:
def set_bigquery_clustering(self, clustering_fields):
    """Update the BigQuery clustering metadata.

    Replaces ``self.bigquery`` with a new ``BigQueryMetadata`` whose
    clustering uses ``clustering_fields``. Time and range partitioning
    already present on ``self.bigquery`` are carried over unchanged.
    """
    # Bind both names up front so they are defined even when no BigQuery
    # metadata has been set yet (otherwise an UnboundLocalError is raised).
    time_partitioning = None
    range_partitioning = None
    if self.bigquery:
        time_partitioning = self.bigquery.time_partitioning
        range_partitioning = self.bigquery.range_partitioning

    self.bigquery = BigQueryMetadata(
        time_partitioning=time_partitioning,
        range_partitioning=range_partitioning,
        clustering=ClusteringMetadata(fields=clustering_fields),
    )

Просмотреть файл

@ -1,26 +0,0 @@
-- init for firefox_desktop_glam_nightly__clients_histogram_aggregates_v1;
-- Creates the table only when it does not already exist.
CREATE TABLE IF NOT EXISTS
`glam-fenix-dev.glam_etl.firefox_desktop_glam_nightly__clients_histogram_aggregates_v1`(
sample_id INT64,
client_id STRING,
ping_type STRING,
os STRING,
app_version INT64,
app_build_id STRING,
channel STRING,
-- Repeated struct; each entry's `value` is itself a repeated key/value struct.
histogram_aggregates ARRAY<
STRUCT<
metric STRING,
metric_type STRING,
key STRING,
agg_type STRING,
value ARRAY<STRUCT<key STRING, value INT64>>
>
>
)
-- Integer-range partitioning on sample_id with boundaries 0..100 in steps of 1.
PARTITION BY
RANGE_BUCKET(sample_id, GENERATE_ARRAY(0, 100, 1))
CLUSTER BY
app_version,
channel,
client_id

Просмотреть файл

@ -4,3 +4,15 @@ description: |-
[DESCRIPTION_MISSING]
owners:
- efilho@mozilla.com
bigquery:
range_partitioning:
field: sample_id
range:
start: 0
end: 100
interval: 1
clustering:
fields:
- app_version
- channel
- client_id

Просмотреть файл

@ -0,0 +1,48 @@
fields:
- mode: NULLABLE
name: sample_id
type: INTEGER
- mode: NULLABLE
name: client_id
type: STRING
- mode: NULLABLE
name: ping_type
type: STRING
- mode: NULLABLE
name: os
type: STRING
- mode: NULLABLE
name: app_version
type: INTEGER
- mode: NULLABLE
name: app_build_id
type: STRING
- mode: NULLABLE
name: channel
type: STRING
- fields:
- mode: NULLABLE
name: metric
type: STRING
- mode: NULLABLE
name: metric_type
type: STRING
- mode: NULLABLE
name: key
type: STRING
- mode: NULLABLE
name: agg_type
type: STRING
- fields:
- mode: NULLABLE
name: key
type: STRING
- mode: NULLABLE
name: value
type: INTEGER
mode: REPEATED
name: value
type: RECORD
mode: REPEATED
name: histogram_aggregates
type: RECORD

Просмотреть файл

@ -1,19 +0,0 @@
-- init for firefox_desktop_glam_nightly__clients_scalar_aggregates_v1;
-- Creates the table only when it does not already exist.
CREATE TABLE IF NOT EXISTS
`glam-fenix-dev.glam_etl.firefox_desktop_glam_nightly__clients_scalar_aggregates_v1`(
client_id STRING,
ping_type STRING,
os STRING,
app_version INT64,
app_build_id STRING,
channel STRING,
-- Repeated struct with a single FLOAT64 value per entry.
scalar_aggregates ARRAY<
STRUCT<metric STRING, metric_type STRING, key STRING, agg_type STRING, value FLOAT64>
>
)
-- Integer-range partitioning on app_version with boundaries 0..100 in steps of 1.
-- NOTE(review): app_version values outside 0..100 would not land in one of
-- these range partitions; confirm the upper bound is intended.
PARTITION BY
RANGE_BUCKET(app_version, GENERATE_ARRAY(0, 100, 1))
CLUSTER BY
app_version,
channel,
client_id

Просмотреть файл

@ -4,3 +4,15 @@ description: |-
[DESCRIPTION_MISSING]
owners:
- efilho@mozilla.com
bigquery:
range_partitioning:
field: app_version
range:
start: 0
end: 100
interval: 1
clustering:
fields:
- app_version
- channel
- client_id

Просмотреть файл

@ -0,0 +1,38 @@
fields:
- name: client_id
type: STRING
mode: NULLABLE
- name: ping_type
type: STRING
mode: NULLABLE
- name: os
type: STRING
mode: NULLABLE
- name: app_version
type: INTEGER
mode: NULLABLE
- name: app_build_id
type: STRING
mode: NULLABLE
- name: channel
type: STRING
mode: NULLABLE
- name: scalar_aggregates
type: RECORD
mode: REPEATED
fields:
- name: metric
type: STRING
mode: NULLABLE
- name: metric_type
type: STRING
mode: NULLABLE
- name: key
type: STRING
mode: NULLABLE
- name: agg_type
type: STRING
mode: NULLABLE
- name: value
type: FLOAT
mode: NULLABLE

Просмотреть файл

@ -1,26 +0,0 @@
-- init for org_mozilla_fenix_glam_nightly__clients_histogram_aggregates_v1;
-- Creates the table only when it does not already exist.
CREATE TABLE IF NOT EXISTS
`glam-fenix-dev.glam_etl.org_mozilla_fenix_glam_nightly__clients_histogram_aggregates_v1`(
sample_id INT64,
client_id STRING,
ping_type STRING,
os STRING,
app_version INT64,
app_build_id STRING,
channel STRING,
-- Repeated struct; each entry's `value` is itself a repeated key/value struct.
histogram_aggregates ARRAY<
STRUCT<
metric STRING,
metric_type STRING,
key STRING,
agg_type STRING,
value ARRAY<STRUCT<key STRING, value INT64>>
>
>
)
-- Integer-range partitioning on sample_id with boundaries 0..100 in steps of 1.
PARTITION BY
RANGE_BUCKET(sample_id, GENERATE_ARRAY(0, 100, 1))
CLUSTER BY
app_version,
channel,
client_id

Просмотреть файл

@ -4,3 +4,15 @@ description: |-
[DESCRIPTION_MISSING]
owners:
- efilho@mozilla.com
bigquery:
range_partitioning:
field: sample_id
range:
start: 0
end: 100
interval: 1
clustering:
fields:
- app_version
- channel
- client_id