Support range partitioning and remove GLAM init.sql (#5346)

* Add support for range partitioning
* Remove init.sql for GLAM tables
2024-04-08 15:03:43 -07:00 · 2024-04-08 15:03:43 -07:00 · c2b77ba3bd
--- a/bigquery_etl/cli/query.py
+++ b/bigquery_etl/cli/query.py
@ -2167,6 +2167,15 @@ def _attach_metadata(query_file_path: Path, table: bigquery.Table) -> None:
            ),
            expiration_ms=metadata.bigquery.time_partitioning.expiration_ms,
        )
+    elif metadata.bigquery and metadata.bigquery.range_partitioning:
+        # Only reached when the time-partitioning branch above did not apply.
+        spec = metadata.bigquery.range_partitioning
+        table.range_partitioning = bigquery.RangePartitioning(
+            field=spec.field,
+            range_=bigquery.PartitionRange(
+                start=spec.range.start, end=spec.range.end, interval=spec.range.interval
+            ),
+        )

    if metadata.bigquery and metadata.bigquery.clustering:
        table.clustering_fields = metadata.bigquery.clustering.fields
--- a/bigquery_etl/metadata/parse_metadata.py
+++ b/bigquery_etl/metadata/parse_metadata.py
@ -75,6 +75,23 @@ class PartitionMetadata:
        return int(self.expiration_days * 86400000)


+@attr.s(auto_attribs=True)
+class PartitionRange:
+    """Integer bounds and step (start, end, interval) of a range partition."""
+
+    start: int  # lower bound of the range (inclusive in BigQuery)
+    end: int  # upper bound of the range (exclusive in BigQuery)
+    interval: int  # width of each partition
+
+
+@attr.s(auto_attribs=True)
+class RangePartitionMetadata:
+    """Metadata for defining range partitioned tables."""
+
+    range: PartitionRange  # start/end/interval bounds of the partitions
+    field: Optional[str] = attr.ib(None)  # partitioning column; defaults to None
+
+
@attr.s(auto_attribs=True)
 class ClusteringMetadata:
    """Metadata for defining BigQuery table clustering."""
@ -91,6 +108,7 @@ class BigQueryMetadata:
    """

    time_partitioning: Optional[PartitionMetadata] = attr.ib(None)
+    range_partitioning: Optional[RangePartitionMetadata] = attr.ib(None)
    clustering: Optional[ClusteringMetadata] = attr.ib(None)


@ -406,12 +424,13 @@ class Metadata:

    def set_bigquery_clustering(self, clustering_fields):
        """Update the BigQuery partitioning metadata."""
-        partitioning = None
-        if self.bigquery and self.bigquery.time_partitioning:
-            partitioning = self.bigquery.time_partitioning
+        # self.bigquery may be None; getattr then yields None for both settings.
+        time_partitioning = getattr(self.bigquery, "time_partitioning", None)
+        range_partitioning = getattr(self.bigquery, "range_partitioning", None)

        self.bigquery = BigQueryMetadata(
-            time_partitioning=partitioning,
+            time_partitioning=time_partitioning,
+            range_partitioning=range_partitioning,
            clustering=ClusteringMetadata(fields=clustering_fields),
        )

--- a/sql/glam-fenix-dev/glam_etl/firefox_desktop_glam_nightly__clients_histogram_aggregates_v1/init.sql
+++ b/sql/glam-fenix-dev/glam_etl/firefox_desktop_glam_nightly__clients_histogram_aggregates_v1/init.sql
@ -1,26 +0,0 @@
-- init for firefox_desktop_glam_nightly__clients_histogram_aggregates_v1;
-CREATE TABLE IF NOT EXISTS
-  `glam-fenix-dev.glam_etl.firefox_desktop_glam_nightly__clients_histogram_aggregates_v1`(
-    sample_id INT64,
-    client_id STRING,
-    ping_type STRING,
-    os STRING,
-    app_version INT64,
-    app_build_id STRING,
-    channel STRING,
-    histogram_aggregates ARRAY<
-      STRUCT<
-        metric STRING,
-        metric_type STRING,
-        key STRING,
-        agg_type STRING,
-        value ARRAY<STRUCT<key STRING, value INT64>>
-      >
-    >
-  )
-PARTITION BY
-  RANGE_BUCKET(sample_id, GENERATE_ARRAY(0, 100, 1))
-CLUSTER BY
-  app_version,
-  channel,
-  client_id
--- a/sql/glam-fenix-dev/glam_etl/firefox_desktop_glam_nightly__clients_histogram_aggregates_v1/metadata.yaml
+++ b/sql/glam-fenix-dev/glam_etl/firefox_desktop_glam_nightly__clients_histogram_aggregates_v1/metadata.yaml
@ -4,3 +4,15 @@ description: |-
  [DESCRIPTION_MISSING]
 owners:
 - efilho@mozilla.com
+bigquery:
+  range_partitioning:
+    field: sample_id
+    range:
+      start: 0
+      end: 100
+      interval: 1
+  clustering:
+    fields:
+    - app_version
+    - channel
+    - client_id
--- a/sql/glam-fenix-dev/glam_etl/firefox_desktop_glam_nightly__clients_histogram_aggregates_v1/schema.yaml
+++ b/sql/glam-fenix-dev/glam_etl/firefox_desktop_glam_nightly__clients_histogram_aggregates_v1/schema.yaml
@ -0,0 +1,48 @@
+fields:
+- mode: NULLABLE
+  name: sample_id
+  type: INTEGER
+- mode: NULLABLE
+  name: client_id
+  type: STRING
+- mode: NULLABLE
+  name: ping_type
+  type: STRING
+- mode: NULLABLE
+  name: os
+  type: STRING
+- mode: NULLABLE
+  name: app_version
+  type: INTEGER
+- mode: NULLABLE
+  name: app_build_id
+  type: STRING
+- mode: NULLABLE
+  name: channel
+  type: STRING
+- fields:
+  - mode: NULLABLE
+    name: metric
+    type: STRING
+  - mode: NULLABLE
+    name: metric_type
+    type: STRING
+  - mode: NULLABLE
+    name: key
+    type: STRING
+  - mode: NULLABLE
+    name: agg_type
+    type: STRING
+  - fields:
+    - mode: NULLABLE
+      name: key
+      type: STRING
+    - mode: NULLABLE
+      name: value
+      type: INTEGER
+    mode: REPEATED
+    name: value
+    type: RECORD
+  mode: REPEATED
+  name: histogram_aggregates
+  type: RECORD
--- a/sql/glam-fenix-dev/glam_etl/firefox_desktop_glam_nightly__clients_scalar_aggregates_v1/init.sql
+++ b/sql/glam-fenix-dev/glam_etl/firefox_desktop_glam_nightly__clients_scalar_aggregates_v1/init.sql
@ -1,19 +0,0 @@
-- init for firefox_desktop_glam_nightly__clients_scalar_aggregates_v1;
-CREATE TABLE IF NOT EXISTS
-  `glam-fenix-dev.glam_etl.firefox_desktop_glam_nightly__clients_scalar_aggregates_v1`(
-    client_id STRING,
-    ping_type STRING,
-    os STRING,
-    app_version INT64,
-    app_build_id STRING,
-    channel STRING,
-    scalar_aggregates ARRAY<
-      STRUCT<metric STRING, metric_type STRING, key STRING, agg_type STRING, value FLOAT64>
-    >
-  )
-PARTITION BY
-  RANGE_BUCKET(app_version, GENERATE_ARRAY(0, 100, 1))
-CLUSTER BY
-  app_version,
-  channel,
-  client_id
--- a/sql/glam-fenix-dev/glam_etl/firefox_desktop_glam_nightly__clients_scalar_aggregates_v1/metadata.yaml
+++ b/sql/glam-fenix-dev/glam_etl/firefox_desktop_glam_nightly__clients_scalar_aggregates_v1/metadata.yaml
@ -4,3 +4,15 @@ description: |-
  [DESCRIPTION_MISSING]
 owners:
 - efilho@mozilla.com
+bigquery:
+  range_partitioning:
+    field: app_version
+    range:
+      start: 0
+      end: 100
+      interval: 1
+  clustering:
+    fields:
+    - app_version
+    - channel
+    - client_id
--- a/sql/glam-fenix-dev/glam_etl/firefox_desktop_glam_nightly__clients_scalar_aggregates_v1/schema.yaml
+++ b/sql/glam-fenix-dev/glam_etl/firefox_desktop_glam_nightly__clients_scalar_aggregates_v1/schema.yaml
@ -0,0 +1,38 @@
+fields:
+- name: client_id
+  type: STRING
+  mode: NULLABLE
+- name: ping_type
+  type: STRING
+  mode: NULLABLE
+- name: os
+  type: STRING
+  mode: NULLABLE
+- name: app_version
+  type: INTEGER
+  mode: NULLABLE
+- name: app_build_id
+  type: STRING
+  mode: NULLABLE
+- name: channel
+  type: STRING
+  mode: NULLABLE
+- name: scalar_aggregates
+  type: RECORD
+  mode: REPEATED
+  fields:
+  - name: metric
+    type: STRING
+    mode: NULLABLE
+  - name: metric_type
+    type: STRING
+    mode: NULLABLE
+  - name: key
+    type: STRING
+    mode: NULLABLE
+  - name: agg_type
+    type: STRING
+    mode: NULLABLE
+  - name: value
+    type: FLOAT
+    mode: NULLABLE
--- a/sql/glam-fenix-dev/glam_etl/org_mozilla_fenix_glam_nightly__clients_histogram_aggregates_v1/init.sql
+++ b/sql/glam-fenix-dev/glam_etl/org_mozilla_fenix_glam_nightly__clients_histogram_aggregates_v1/init.sql
@ -1,26 +0,0 @@
-- init for org_mozilla_fenix_glam_nightly__clients_histogram_aggregates_v1;
-CREATE TABLE IF NOT EXISTS
-  `glam-fenix-dev.glam_etl.org_mozilla_fenix_glam_nightly__clients_histogram_aggregates_v1`(
-    sample_id INT64,
-    client_id STRING,
-    ping_type STRING,
-    os STRING,
-    app_version INT64,
-    app_build_id STRING,
-    channel STRING,
-    histogram_aggregates ARRAY<
-      STRUCT<
-        metric STRING,
-        metric_type STRING,
-        key STRING,
-        agg_type STRING,
-        value ARRAY<STRUCT<key STRING, value INT64>>
-      >
-    >
-  )
-PARTITION BY
-  RANGE_BUCKET(sample_id, GENERATE_ARRAY(0, 100, 1))
-CLUSTER BY
-  app_version,
-  channel,
-  client_id
--- a/sql/glam-fenix-dev/glam_etl/org_mozilla_fenix_glam_nightly__clients_histogram_aggregates_v1/metadata.yaml
+++ b/sql/glam-fenix-dev/glam_etl/org_mozilla_fenix_glam_nightly__clients_histogram_aggregates_v1/metadata.yaml
@ -4,3 +4,15 @@ description: |-
  [DESCRIPTION_MISSING]
 owners:
 - efilho@mozilla.com
+bigquery:
+  range_partitioning:
+    field: sample_id
+    range:
+      start: 0
+      end: 100
+      interval: 1
+  clustering:
+    fields:
+    - app_version
+    - channel
+    - client_id