bugbug/infra/spawn_pipeline.py

#!/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright 2019 Mozilla
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""This script triggers the data pipeline for the bugbug project."""

import argparse
import os
from logging import INFO, basicConfig, getLogger

import jsone
import requests.packages.urllib3
import taskcluster
import yaml

# Configure logging at INFO level as soon as the module is loaded, so the
# messages emitted through `logger` below are shown.
basicConfig(level=INFO)
logger = getLogger(__name__)

# Turn off the warnings urllib3 would otherwise emit for requests made by
# this script.
requests.packages.urllib3.disable_warnings()

# Root URL used by get_taskcluster_options() when the environment does not
# provide one.
TASKCLUSTER_DEFAULT_URL = "https://community-tc.services.mozilla.com"


def get_taskcluster_options():
    """Build the Taskcluster client options for the current environment.

    The options start from what the taskcluster client derives from the
    environment (which may be a local one). The root URL is then resolved
    as follows:

    * if ``TASKCLUSTER_PROXY_URL`` is set, it always replaces the root URL;
    * otherwise, if no root URL is present, ``TASKCLUSTER_DEFAULT_URL`` is
      used so that the returned options always carry a ``rootUrl`` entry.
    """
    options = taskcluster.optionsFromEnvironment()

    # The proxy, when available, takes precedence over anything else.
    if "TASKCLUSTER_PROXY_URL" in os.environ:
        options["rootUrl"] = os.environ["TASKCLUSTER_PROXY_URL"]

    # Guarantee a root URL without overriding one that is already there.
    options.setdefault("rootUrl", TASKCLUSTER_DEFAULT_URL)

    return options


def main():
    """Render the pipeline definition and create its tasks on Taskcluster.

    The file given as the ``data_pipeline_json`` command line argument is
    loaded as YAML and rendered with JSON-e, exposing the value of the
    ``TAG`` environment variable (``"latest"`` when unset) as ``version``.

    Every rendered task carries an internal ``ID`` which is only used to
    express dependencies between tasks of the file. Each internal ID is
    replaced by a freshly generated Taskcluster task ID and the
    ``dependencies`` lists are rewritten accordingly. ``TAG`` is also
    injected in the ``env`` of every task payload.

    When the ``TASK_ID`` environment variable is set to a non-empty value,
    it is used as the task group ID and added as a dependency of every
    created task; otherwise a new task group ID is generated.

    Raises:
        ValueError: if two tasks share the same internal ``ID``.
        KeyError: if a task depends on an internal ID that no task defines.
        taskcluster.exceptions.TaskclusterAuthFailure: re-raised, after
            being logged, when task creation fails to authenticate.
    """
    parser = argparse.ArgumentParser(description="Spawn tasks for bugbug data pipeline")
    parser.add_argument("data_pipeline_json")
    args = parser.parse_args()

    decision_task_id = os.environ.get("TASK_ID")
    options = get_taskcluster_options()

    # Reuse the TASK_ID from the environment as task group when there is one.
    task_group_id = decision_task_id or taskcluster.utils.slugId()

    # Be explicit about the encoding rather than relying on the locale.
    with open(args.data_pipeline_json, encoding="utf-8") as pipeline_file:
        raw_tasks = yaml.safe_load(pipeline_file)

    version = os.getenv("TAG", "latest")
    rendered = jsone.render(raw_tasks, {"version": version})

    # First pass: we need to generate new unique task IDs for Taskcluster to
    # be happy, but also need to identify dependencies across tasks. So we
    # map every internal ID to a generated ID before touching dependencies;
    # this way, tasks can be defined in any order.
    id_mapping = {}
    for task in rendered["tasks"]:
        task_internal_id = task["ID"]

        if task_internal_id in id_mapping:
            raise ValueError(f"Conflicting IDs {task_internal_id}")

        id_mapping[task_internal_id] = taskcluster.utils.slugId()

    # Second pass: turn every rendered task into a definition ready to be sent.
    tasks = []
    for task in rendered["tasks"]:
        # The internal ID is not part of the definition sent to Taskcluster.
        task_internal_id = task.pop("ID")

        task["taskGroupId"] = task_group_id

        # Keep the env declared by the task, if any, and add TAG to it. The
        # payload is modified directly as the JSON is already rendered.
        task_payload = task["payload"]
        env = task_payload.get("env") or {}
        env["TAG"] = version
        task_payload["env"] = env

        # Translate internal IDs to the generated ones. A missing or empty
        # `dependencies` entry means the task has no dependency.
        try:
            new_dependencies = [
                id_mapping[dependency] for dependency in task.get("dependencies") or []
            ]
        except KeyError as error:
            raise KeyError(
                f"Task {task_internal_id} depends on unknown task ID {error.args[0]}"
            ) from error

        if decision_task_id:
            new_dependencies.append(decision_task_id)

        task["dependencies"] = new_dependencies

        tasks.append((id_mapping[task_internal_id], task))

    # Now send them
    queue = taskcluster.Queue(options)
    try:
        for task_id, task_definition in tasks:
            queue.createTask(task_id, task_definition)
    except taskcluster.exceptions.TaskclusterAuthFailure:
        logger.exception("Failed to authenticate with Taskcluster")
        raise

    # Only reached when every task was created. The link is built from the
    # default URL and not from options["rootUrl"], which may be the proxy URL.
    logger.info("%s/tasks/groups/%s", TASKCLUSTER_DEFAULT_URL, task_group_id)


# Spawn the pipeline only when the file is executed as a script, not when it
# is imported.
if __name__ == "__main__":
    main()
Create the initial data pipeline task definition (#313) * Create the data pipeline task definition There is a hook (which runs every day at midnight) that will spawn the data-pipeline using the latest docker images that were build on latest release. The hook itself is updated on each release and is versionned in this repository. The hook will runs once every week and on every successful release. Add task for building the Docker image for HTTP service. 2019-04-26 15:56:49 +03:00			`#!/bin/env python`
			`# -- coding: utf-8 --`
			`#`
			`# Copyright 2019 Mozilla`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`

Add a linter for docstrings and fix violations (#3362) Co-authored-by: Marco Castelluccio <mcastelluccio@mozilla.com> 2023-03-17 13:21:06 +03:00			`"""This script triggers the data pipeline for the bugbug project."""`
Create the initial data pipeline task definition (#313) * Create the data pipeline task definition There is a hook (which runs every day at midnight) that will spawn the data-pipeline using the latest docker images that were build on latest release. The hook itself is updated on each release and is versionned in this repository. The hook will runs once every week and on every successful release. Add task for building the Docker image for HTTP service. 2019-04-26 15:56:49 +03:00
			`import argparse`
			`import os`
Refactor print statements to use logger (#3330) 2023-03-14 19:54:15 +03:00			`from logging import INFO, basicConfig, getLogger`
Create the initial data pipeline task definition (#313) * Create the data pipeline task definition There is a hook (which runs every day at midnight) that will spawn the data-pipeline using the latest docker images that were build on latest release. The hook itself is updated on each release and is versionned in this repository. The hook will runs once every week and on every successful release. Add task for building the Docker image for HTTP service. 2019-04-26 15:56:49 +03:00
			`import jsone`
			`import requests.packages.urllib3`
			`import taskcluster`
			`import yaml`

Refactor print statements to use logger (#3330) 2023-03-14 19:54:15 +03:00			`basicConfig(level=INFO)`
			`logger = getLogger(__name__)`

Create the initial data pipeline task definition (#313) * Create the data pipeline task definition There is a hook (which runs every day at midnight) that will spawn the data-pipeline using the latest docker images that were build on latest release. The hook itself is updated on each release and is versionned in this repository. The hook will runs once every week and on every successful release. Add task for building the Docker image for HTTP service. 2019-04-26 15:56:49 +03:00			`requests.packages.urllib3.disable_warnings()`

Update .taskcluster.yml for community cluster (#1076) 2019-11-09 00:13:10 +03:00			`TASKCLUSTER_DEFAULT_URL = "https://community-tc.services.mozilla.com"`
Create the initial data pipeline task definition (#313) * Create the data pipeline task definition There is a hook (which runs every day at midnight) that will spawn the data-pipeline using the latest docker images that were build on latest release. The hook itself is updated on each release and is versionned in this repository. The hook will runs once every week and on every successful release. Add task for building the Docker image for HTTP service. 2019-04-26 15:56:49 +03:00

			`def get_taskcluster_options():`
Add a linter for docstrings and fix violations (#3362) Co-authored-by: Marco Castelluccio <mcastelluccio@mozilla.com> 2023-03-17 13:21:06 +03:00			`"""Get the Taskcluster setup options according to current environment.`

			`The current environment could be local.`
Create the initial data pipeline task definition (#313) * Create the data pipeline task definition There is a hook (which runs every day at midnight) that will spawn the data-pipeline using the latest docker images that were build on latest release. The hook itself is updated on each release and is versionned in this repository. The hook will runs once every week and on every successful release. Add task for building the Docker image for HTTP service. 2019-04-26 15:56:49 +03:00			`"""`
			`options = taskcluster.optionsFromEnvironment()`
			`proxy_url = os.environ.get("TASKCLUSTER_PROXY_URL")`

			`if proxy_url is not None:`
			`# Always use proxy url when available`
			`options["rootUrl"] = proxy_url`

			`if "rootUrl" not in options:`
			`# Always have a value in root url`
			`options["rootUrl"] = TASKCLUSTER_DEFAULT_URL`

			`return options`


			`def main():`
			`parser = argparse.ArgumentParser(description="Spawn tasks for bugbug data pipeline")`
			`parser.add_argument("data_pipeline_json")`

			`args = parser.parse_args()`
			`decision_task_id = os.environ.get("TASK_ID")`
			`options = get_taskcluster_options()`
			`add_self = False`
			`if decision_task_id:`
			`add_self = True`
			`task_group_id = decision_task_id`
			`else:`
			`task_group_id = taskcluster.utils.slugId()`
			`keys = {"taskGroupId": task_group_id}`

			`id_mapping = {}`

			`# First pass, do the template rendering and dependencies resolution`
			`tasks = []`

			`with open(args.data_pipeline_json) as pipeline_file:`
Load yaml safely This is mostly to avoid warnings, as the yaml files we load are fully controlled by us. 2019-06-08 00:34:07 +03:00			`raw_tasks = yaml.safe_load(pipeline_file.read())`
Create the initial data pipeline task definition (#313) * Create the data pipeline task definition There is a hook (which runs every day at midnight) that will spawn the data-pipeline using the latest docker images that were build on latest release. The hook itself is updated on each release and is versionned in this repository. The hook will runs once every week and on every successful release. Add task for building the Docker image for HTTP service. 2019-04-26 15:56:49 +03:00
Set the TAG environment variable in all tasks spawned by our decision tasks Pre-requisite for #1169. 2019-12-11 23:24:38 +03:00			`version = os.getenv("TAG", "latest")`
			`context = {"version": version}`
Start tracking training metrics as Taskcluster artifacts (#604) Fixes #342 2019-06-23 00:18:08 +03:00			`rendered = jsone.render(raw_tasks, context)`
Create the initial data pipeline task definition (#313) * Create the data pipeline task definition There is a hook (which runs every day at midnight) that will spawn the data-pipeline using the latest docker images that were build on latest release. The hook itself is updated on each release and is versionned in this repository. The hook will runs once every week and on every successful release. Add task for building the Docker image for HTTP service. 2019-04-26 15:56:49 +03:00
Start tracking training metrics as Taskcluster artifacts (#604) Fixes #342 2019-06-23 00:18:08 +03:00			`for task in rendered["tasks"]:`
Create the initial data pipeline task definition (#313) * Create the data pipeline task definition There is a hook (which runs every day at midnight) that will spawn the data-pipeline using the latest docker images that were build on latest release. The hook itself is updated on each release and is versionned in this repository. The hook will runs once every week and on every successful release. Add task for building the Docker image for HTTP service. 2019-04-26 15:56:49 +03:00			`# We need to generate new unique task ids for taskcluster to be happy`
			`# but need to identify dependencies across tasks. So we create a`
			`# mapping between an internal ID and the generate ID`

			`task_id = taskcluster.utils.slugId()`
Make spawn_pipeline not depend on the order of tasks in the yaml file (#1631) Fixes #1282 2020-06-12 12:01:41 +03:00			`task_internal_id = task["ID"]`
Create the initial data pipeline task definition (#313) * Create the data pipeline task definition There is a hook (which runs every day at midnight) that will spawn the data-pipeline using the latest docker images that were build on latest release. The hook itself is updated on each release and is versionned in this repository. The hook will runs once every week and on every successful release. Add task for building the Docker image for HTTP service. 2019-04-26 15:56:49 +03:00
			`if task_internal_id in id_mapping:`
Move string formatting to f-string in spawn_data_pipeline (#559) 2019-06-07 12:04:33 +03:00			`raise ValueError(f"Conflicting IDs {task_internal_id}")`
Create the initial data pipeline task definition (#313) * Create the data pipeline task definition There is a hook (which runs every day at midnight) that will spawn the data-pipeline using the latest docker images that were build on latest release. The hook itself is updated on each release and is versionned in this repository. The hook will runs once every week and on every successful release. Add task for building the Docker image for HTTP service. 2019-04-26 15:56:49 +03:00
Make spawn_pipeline not depend on the order of tasks in the yaml file (#1631) Fixes #1282 2020-06-12 12:01:41 +03:00			`# Store each task ID in the id_mapping dictionary before processing dependencies.`
			`# This way, tasks can be defined in any order.`
Create the initial data pipeline task definition (#313) * Create the data pipeline task definition There is a hook (which runs every day at midnight) that will spawn the data-pipeline using the latest docker images that were build on latest release. The hook itself is updated on each release and is versionned in this repository. The hook will runs once every week and on every successful release. Add task for building the Docker image for HTTP service. 2019-04-26 15:56:49 +03:00			`id_mapping[task_internal_id] = task_id`

Make spawn_pipeline not depend on the order of tasks in the yaml file (#1631) Fixes #1282 2020-06-12 12:01:41 +03:00			`for task in rendered["tasks"]:`
			`task_internal_id = task.pop("ID")`
			`task_id = id_mapping[task_internal_id]`

Create the initial data pipeline task definition (#313) * Create the data pipeline task definition There is a hook (which runs every day at midnight) that will spawn the data-pipeline using the latest docker images that were build on latest release. The hook itself is updated on each release and is versionned in this repository. The hook will runs once every week and on every successful release. Add task for building the Docker image for HTTP service. 2019-04-26 15:56:49 +03:00			`for key, value in keys.items():`
Start tracking training metrics as Taskcluster artifacts (#604) Fixes #342 2019-06-23 00:18:08 +03:00			`task[key] = value`
Create the initial data pipeline task definition (#313) * Create the data pipeline task definition There is a hook (which runs every day at midnight) that will spawn the data-pipeline using the latest docker images that were build on latest release. The hook itself is updated on each release and is versionned in this repository. The hook will runs once every week and on every successful release. Add task for building the Docker image for HTTP service. 2019-04-26 15:56:49 +03:00
Set the TAG environment variable in all tasks spawned by our decision tasks Pre-requisite for #1169. 2019-12-11 23:24:38 +03:00			`task_payload = task["payload"]`

			`if "env" in task_payload and task_payload["env"]:`
Don't use JSON-e for the spawned tasks' env, as we are modifying already rendered JSON 2019-12-12 12:25:40 +03:00			`task_payload["env"]["TAG"] = version`
Set the TAG environment variable in all tasks spawned by our decision tasks Pre-requisite for #1169. 2019-12-11 23:24:38 +03:00			`else:`
			`task_payload["env"] = {`
			`"TAG": version,`
			`}`

Create the initial data pipeline task definition (#313) * Create the data pipeline task definition There is a hook (which runs every day at midnight) that will spawn the data-pipeline using the latest docker images that were build on latest release. The hook itself is updated on each release and is versionned in this repository. The hook will runs once every week and on every successful release. Add task for building the Docker image for HTTP service. 2019-04-26 15:56:49 +03:00			`# Process the dependencies`
			`new_dependencies = []`
Start tracking training metrics as Taskcluster artifacts (#604) Fixes #342 2019-06-23 00:18:08 +03:00			`for dependency in task.get("dependencies", []):`
Create the initial data pipeline task definition (#313) * Create the data pipeline task definition There is a hook (which runs every day at midnight) that will spawn the data-pipeline using the latest docker images that were build on latest release. The hook itself is updated on each release and is versionned in this repository. The hook will runs once every week and on every successful release. Add task for building the Docker image for HTTP service. 2019-04-26 15:56:49 +03:00			`new_dependencies.append(id_mapping[dependency])`

			`if add_self:`
			`new_dependencies.append(decision_task_id)`

Start tracking training metrics as Taskcluster artifacts (#604) Fixes #342 2019-06-23 00:18:08 +03:00			`task["dependencies"] = new_dependencies`
Create the initial data pipeline task definition (#313) * Create the data pipeline task definition There is a hook (which runs every day at midnight) that will spawn the data-pipeline using the latest docker images that were build on latest release. The hook itself is updated on each release and is versionned in this repository. The hook will runs once every week and on every successful release. Add task for building the Docker image for HTTP service. 2019-04-26 15:56:49 +03:00
Start tracking training metrics as Taskcluster artifacts (#604) Fixes #342 2019-06-23 00:18:08 +03:00			`tasks.append((task_id, task))`
Create the initial data pipeline task definition (#313) * Create the data pipeline task definition There is a hook (which runs every day at midnight) that will spawn the data-pipeline using the latest docker images that were build on latest release. The hook itself is updated on each release and is versionned in this repository. The hook will runs once every week and on every successful release. Add task for building the Docker image for HTTP service. 2019-04-26 15:56:49 +03:00
			`# Now sends them`
			`queue = taskcluster.Queue(options)`
			`try:`
			`for task_id, task_payload in tasks:`
			`queue.createTask(task_id, task_payload)`

Refactor print statements to use logger (#3330) 2023-03-14 19:54:15 +03:00			`logger.info(`
			`"https://community-tc.services.mozilla.com/tasks/groups/%s", task_group_id`
			`)`
			`except taskcluster.exceptions.TaskclusterAuthFailure:`
			`logger.exception("Failed to authenticate with Taskcluster")`
Create the initial data pipeline task definition (#313) * Create the data pipeline task definition There is a hook (which runs every day at midnight) that will spawn the data-pipeline using the latest docker images that were build on latest release. The hook itself is updated on each release and is versionned in this repository. The hook will runs once every week and on every successful release. Add task for building the Docker image for HTTP service. 2019-04-26 15:56:49 +03:00			`raise`


			`if __name__ == "__main__":`
			`main()`