bugbug/scripts/retrieve_training_metrics.py

# -*- coding: utf-8 -*-
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.

import argparse
import logging
import os
import sys
from os.path import abspath, join

import requests
import taskcluster

from bugbug.utils import get_taskcluster_options

# Index path of the per-date root of a model; formatted with the model name.
ROOT_URI = "train_{}.per_date"
# Index path of a dated sub-namespace; formatted with the model name and the
# dot-joined date parts.
DATE_URI = "train_{}.per_date.{}"
# URL of the metrics.json artifact of an indexed task; formatted with the
# index path of the task.
BASE_URL = "https://community-tc.services.mozilla.com/api/index/v1/task/{}/artifacts/public/metrics.json"
# Full index namespace; formatted with a ROOT_URI or DATE_URI result.
NAMESPACE_URI = "project.bugbug.{}"

LOGGER = logging.getLogger(__name__)

# Configure the root logger at import time so INFO messages are emitted.
logging.basicConfig(level=logging.INFO)


def get_task_metrics_from_uri(index_uri):
    """Download the ``metrics.json`` artifact of an indexed task.

    Args:
        index_uri: Index path of the task; it is substituted into
            ``BASE_URL`` to build the artifact URL.

    Returns:
        The ``requests.Response`` whose body is the metrics payload.

    Raises:
        SystemExit: With status 1 when the server answers 404.
        requests.HTTPError: For any other unsuccessful HTTP status.
        requests.Timeout: When the server does not answer in time.
    """
    index_url = BASE_URL.format(index_uri)
    LOGGER.info("Retrieving metrics from %s", index_url)

    # (connect, read) timeouts in seconds: without them a stalled connection
    # would block the script indefinitely.
    r = requests.get(index_url, timeout=(10, 60))

    if r.status_code == 404:
        # A missing artifact most likely means a wrong model or date was given,
        # so stop with a readable message instead of a traceback.
        LOGGER.error("File not found for URL %s, check your arguments", index_url)
        sys.exit(1)

    r.raise_for_status()

    return r


def get_namespaces(index, index_uri):
    """Return the child namespaces the index lists under ``index_uri``.

    ``index`` must expose a ``listNamespaces`` method returning a mapping; the
    value stored under its ``"namespaces"`` key is returned unchanged.
    """
    return index.listNamespaces(index_uri)["namespaces"]


def is_later_or_equal(partial_date, from_date):
    """Tell whether ``partial_date`` is not earlier than ``from_date``.

    Both arguments are sequences of integer-like parts, most significant
    first. Only the parts present in both sequences are compared, so a
    shorter ``partial_date`` that matches the start of ``from_date`` counts
    as later or equal.

    Returns:
        ``True`` or ``False`` according to the first pair of parts that
        differ, or ``True`` when all compared parts are equal.
    """
    for candidate_part, reference_part in zip(partial_date, from_date):
        candidate = int(candidate_part)
        reference = int(reference_part)

        # The first differing part settles the comparison.
        if candidate != reference:
            return candidate > reference

    return True


def get_task_metrics_from_date(model, date, output_directory):
    """Save the metrics of every indexed training task from a date onwards.

    The per-date index namespaces of ``model`` are walked depth-first. For
    each task found, its metrics are downloaded and written to
    ``output_directory`` as ``metric_<task namespace with "_" for ".">.json``.

    Args:
        model: Model name, substituted into the index namespace.
        date: Lower bound given as dot-separated integer parts, matching the
            namespace levels below ``per_date``. ``None`` or an empty string
            means no lower bound, so every namespace is visited.
        output_directory: Directory receiving the files; it must already
            exist.
    """
    options = get_taskcluster_options()

    index = taskcluster.Index(options)

    index.ping()

    # The command line makes the date optional, so it may be None here;
    # an empty bound makes every namespace compare as later or equal.
    from_date = date.split(".") if date else []

    # Stack of namespaces still to visit, each one a list of date parts.
    # The empty list stands for the root level.
    namespaces = [[]]

    # Walk all namespaces greater than or equal to the given date
    while namespaces:
        current_ns = namespaces.pop()

        # The root level has no date part to append
        if not current_ns:
            ns_uri = ROOT_URI.format(model)
        else:
            ns_uri = DATE_URI.format(model, ".".join(current_ns))

        ns_full_uri = NAMESPACE_URI.format(ns_uri)

        tasks = index.listTasks(ns_full_uri)
        for task in tasks["tasks"]:
            task_uri = task["namespace"]
            r = get_task_metrics_from_uri(task_uri)

            # Write the file on disk
            file_name = f"metric_{task_uri.replace('.', '_')}.json"
            file_path = abspath(join(output_directory, file_name))
            # Explicit encoding so the result does not depend on the locale.
            with open(file_path, "w", encoding="utf-8") as metric_file:
                metric_file.write(r.text)
            LOGGER.info("Metrics saved to %r", file_path)

        for namespace in get_namespaces(index, ns_full_uri):
            new_ns = [*current_ns, namespace["name"]]

            if not is_later_or_equal(new_ns, from_date):
                LOGGER.debug("NEW namespace %s is before %s", new_ns, from_date)
                continue

            # Might not be efficient but size of `namespaces` shouldn't be too
            # big as we are doing a depth-first traversal
            if new_ns not in namespaces:
                namespaces.append(new_ns)


def main():
    """Parse the command line and download the requested training metrics.

    Accepts an optional ``-d/--output-directory`` (defaulting to the current
    working directory), a mandatory ``model`` and an optional ``date``, then
    hands them over to ``get_task_metrics_from_date``.
    """
    parser = argparse.ArgumentParser(description="Retrieve a model training metrics")

    # Positional arguments
    parser.add_argument("model", help="Which model to retrieve training metrics from.")
    parser.add_argument(
        "date",
        nargs="?",
        help="Which date should we retrieve training metrics from. Default to latest",
    )

    # Options
    parser.add_argument(
        "-d",
        "--output-directory",
        default=os.getcwd(),
        help="In which directory the script should save the metrics file. The directory must exists",
    )

    cli_args = parser.parse_args()

    get_task_metrics_from_date(
        cli_args.model,
        cli_args.date,
        cli_args.output_directory,
    )


# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()
Start tracking training metrics as Taskcluster artifacts (#604) Fixes #342 2019-06-23 00:18:08 +03:00			`# -- coding: utf-8 --`
Check metrics evolution (#836) Fixes #360 and fixes #641. 2019-08-05 11:22:55 +03:00			`# This Source Code Form is subject to the terms of the Mozilla Public`
			`# License, v. 2.0. If a copy of the MPL was not distributed with this file,`
			`# You can obtain one at http://mozilla.org/MPL/2.0/.`
Start tracking training metrics as Taskcluster artifacts (#604) Fixes #342 2019-06-23 00:18:08 +03:00
			`import argparse`
Move log messages to stderr (#635) As the retrieve script can output the metrics on the standard output, log messages would pollute the output and complicate scripts that would want to parse it. Use logging instead of passing stderr to the print statements as it's mostly the same amount of code. 2019-06-27 11:58:07 +03:00			`import logging`
Check metrics evolution (#836) Fixes #360 and fixes #641. 2019-08-05 11:22:55 +03:00			`import os`
Start tracking training metrics as Taskcluster artifacts (#604) Fixes #342 2019-06-23 00:18:08 +03:00			`import sys`
Check metrics evolution (#836) Fixes #360 and fixes #641. 2019-08-05 11:22:55 +03:00			`from os.path import abspath, join`
Start tracking training metrics as Taskcluster artifacts (#604) Fixes #342 2019-06-23 00:18:08 +03:00
			`import requests`
Retrieve all metrics from a date and later (#734) Fixes #614 2019-08-01 18:05:59 +03:00			`import taskcluster`
Start tracking training metrics as Taskcluster artifacts (#604) Fixes #342 2019-06-23 00:18:08 +03:00
Retrieve all metrics from a date and later (#734) Fixes #614 2019-08-01 18:05:59 +03:00			`from bugbug.utils import get_taskcluster_options`

			`ROOT_URI = "train_{}.per_date"`
			`DATE_URI = "train_{}.per_date.{}"`
Update .taskcluster.yml for community cluster (#1076) 2019-11-09 00:13:10 +03:00			`BASE_URL = "https://community-tc.services.mozilla.com/api/index/v1/task/{}/artifacts/public/metrics.json"`
Update all index URLs, removing the 'relman' string Follow-up to 0ec86af61bbf1632e6a08e0e19b5e0ac19aace10 2020-09-01 15:44:49 +03:00			`NAMESPACE_URI = "project.bugbug.{}"`
Start tracking training metrics as Taskcluster artifacts (#604) Fixes #342 2019-06-23 00:18:08 +03:00
Move log messages to stderr (#635) As the retrieve script can output the metrics on the standard output, log messages would pollute the output and complicate scripts that would want to parse it. Use logging instead of passing stderr to the print statements as it's mostly the same amount of code. 2019-06-27 11:58:07 +03:00			`LOGGER = logging.getLogger(__name__)`

			`logging.basicConfig(level=logging.INFO)`

Start tracking training metrics as Taskcluster artifacts (#604) Fixes #342 2019-06-23 00:18:08 +03:00
Retrieve all metrics from a date and later (#734) Fixes #614 2019-08-01 18:05:59 +03:00			`def get_task_metrics_from_uri(index_uri):`
			`index_url = BASE_URL.format(index_uri)`
Refactored logging statements to use lazy % formatting (#3335) 2023-03-09 13:58:37 +03:00			`LOGGER.info("Retrieving metrics from %s", index_url)`
Retrieve all metrics from a date and later (#734) Fixes #614 2019-08-01 18:05:59 +03:00			`r = requests.get(index_url)`

			`if r.status_code == 404:`
			`LOGGER.error(f"File not found for URL {index_url}, check your arguments")`
			`sys.exit(1)`

			`r.raise_for_status()`

			`return r`


			`def get_namespaces(index, index_uri):`
			`index_namespaces = index.listNamespaces(index_uri)`

			`return index_namespaces["namespaces"]`


			`def is_later_or_equal(partial_date, from_date):`
			`for partial_date_part, from_date_part in zip(partial_date, from_date):`
			`if int(partial_date_part) > int(from_date_part):`
			`return True`
			`elif int(partial_date_part) < int(from_date_part):`
			`return False`
			`else:`
			`continue`

			`return True`


Check metrics evolution (#836) Fixes #360 and fixes #641. 2019-08-05 11:22:55 +03:00			`def get_task_metrics_from_date(model, date, output_directory):`
Retrieve all metrics from a date and later (#734) Fixes #614 2019-08-01 18:05:59 +03:00			`options = get_taskcluster_options()`

			`index = taskcluster.Index(options)`

			`index.ping()`

			`# Split the date`
			`from_date = date.split(".")`

			`namespaces = []`

			`# Start at the root level`
			`# We need an empty list in order to append namespaces part to it`
			`namespaces.append([])`

			`# Recursively list all namespaces greater or equals than the given date`
			`while namespaces:`
			`current_ns = namespaces.pop()`

			`# Handle version level namespaces`
			`if not current_ns:`
			`ns_uri = ROOT_URI.format(model)`
			`else:`
			`current_ns_date = ".".join(current_ns)`
			`ns_uri = DATE_URI.format(model, current_ns_date)`

			`ns_full_uri = NAMESPACE_URI.format(ns_uri)`

			`tasks = index.listTasks(ns_full_uri)`
			`for task in tasks["tasks"]:`
			`task_uri = task["namespace"]`
			`r = get_task_metrics_from_uri(task_uri)`

			`# Write the file on disk`
Check metrics evolution (#836) Fixes #360 and fixes #641. 2019-08-05 11:22:55 +03:00			`file_name = f"metric_{'_'.join(task_uri.split('.'))}.json"`
			`file_path = abspath(join(output_directory, file_name))`
Retrieve all metrics from a date and later (#734) Fixes #614 2019-08-01 18:05:59 +03:00			`with open(file_path, "w") as metric_file:`
			`metric_file.write(r.text)`
Refactor logging statement to use % lazy format (#3401) 2023-03-31 20:03:51 +03:00			`LOGGER.info("Metrics saved to %r", file_path)`
Retrieve all metrics from a date and later (#734) Fixes #614 2019-08-01 18:05:59 +03:00
			`for namespace in get_namespaces(index, ns_full_uri):`
			`new_ns = current_ns.copy()`
			`new_ns.append(namespace["name"])`

			`if not is_later_or_equal(new_ns, from_date):`
			`LOGGER.debug("NEW namespace %s is before %s", new_ns, from_date)`
			`continue`

			# Might not be efficient but size of `namespaces` shouldn't be too
			`# big as we are doing a depth-first traversal`
			`if new_ns not in namespaces:`
			`namespaces.append(new_ns)`


Start tracking training metrics as Taskcluster artifacts (#604) Fixes #342 2019-06-23 00:18:08 +03:00			`def main():`
			`description = "Retrieve a model training metrics"`
			`parser = argparse.ArgumentParser(description=description)`

Check metrics evolution (#836) Fixes #360 and fixes #641. 2019-08-05 11:22:55 +03:00			`parser.add_argument(`
			`"-d",`
			`"--output-directory",`
			`default=os.getcwd(),`
			`help="In which directory the script should save the metrics file. The directory must exists",`
			`)`
Start tracking training metrics as Taskcluster artifacts (#604) Fixes #342 2019-06-23 00:18:08 +03:00			`parser.add_argument("model", help="Which model to retrieve training metrics from.")`
			`parser.add_argument(`
			`"date",`
			`nargs="?",`
			`help="Which date should we retrieve training metrics from. Default to latest",`
			`)`

			`args = parser.parse_args()`

Check metrics evolution (#836) Fixes #360 and fixes #641. 2019-08-05 11:22:55 +03:00			`get_task_metrics_from_date(args.model, args.date, args.output_directory)`
Start tracking training metrics as Taskcluster artifacts (#604) Fixes #342 2019-06-23 00:18:08 +03:00

			`if __name__ == "__main__":`
			`main()`