Retrieve all metrics from a date and later (#734)

Fixes #614
2019-08-01 17:05:59 +02:00 · 2019-08-01 17:05:59 +02:00 · f0ef378e49
--- a/scripts/retrieve_training_metrics.py
+++ b/scripts/retrieve_training_metrics.py
@ -3,53 +3,23 @@
 import argparse
 import logging
 import sys
-from os.path import abspath

 import requests
+import taskcluster

-LATEST_URI = "train_{}.latest"
-VERSIONED_URI = "train_{}.{}"
-DATED_VERSIONED_URI = "train_{}.{}.{}"
-BASE_URL = "https://index.taskcluster.net/v1/task/project.relman.bugbug.{}/artifacts/public/metrics.json"
+from bugbug.utils import get_taskcluster_options
+
+# Index path of a model's per-date root; formatted with the model name.
+ROOT_URI = "train_{}.per_date"
+# Index path below the per-date root; formatted with the model name and the
+# dot-joined date parts collected so far.
+DATE_URI = "train_{}.per_date.{}"
+# URL of the metrics.json artifact; formatted with a full index path.
+BASE_URL = "https://index.taskcluster.net/v1/task/{}/artifacts/public/metrics.json"
+# Project prefix; formatted with a ROOT_URI or DATE_URI result to obtain the
+# full index path that is passed to the index client.
+NAMESPACE_URI = "project.relman.bugbug.{}"

 LOGGER = logging.getLogger(__name__)

 logging.basicConfig(level=logging.INFO)


-def main():
-    description = "Retrieve a model training metrics"
-    parser = argparse.ArgumentParser(description=description)
-
-    parser.add_argument("model", help="Which model to retrieve training metrics from.")
-    parser.add_argument(
-        "version",
-        nargs="?",
-        help="Which bugbug version should we retrieve training metrics from.",
-        default=None,
-    )
-    parser.add_argument(
-        "date",
-        nargs="?",
-        help="Which date should we retrieve training metrics from. Default to latest",
-        default=None,
-    )
-    parser.add_argument(
-        "--output",
-        "-o",
-        help="Where to output the metrics.json file. Default to printing its content",
-        default=None,
-    )
-
-    args = parser.parse_args()
-
-    if not args.version:
-        index_uri = LATEST_URI.format(args.model)
-    elif not args.date:
-        index_uri = VERSIONED_URI.format(args.model, args.version)
-    else:
-        index_uri = DATED_VERSIONED_URI.format(args.model, args.version, args.date)
-
+def get_task_metrics_from_uri(index_uri):
    index_url = BASE_URL.format(index_uri)
    LOGGER.info(f"Retrieving metrics from {index_url}")
    r = requests.get(index_url)
@ -60,13 +30,95 @@ def main():

    r.raise_for_status()

-    if args.output:
-        file_path = abspath(args.output)
-        with open(file_path, "w") as output_file:
-            output_file.write(r.text)
-        LOGGER.info(f"Metrics saved to {file_path!r}")
-    else:
-        print(r.text)
+    return r
+
+
+def get_namespaces(index, index_uri):
+    """Return the namespaces listed directly under `index_uri`.
+
+    `index` is the index client; its `listNamespaces` response is expected to
+    carry the entries under the "namespaces" key.
+    """
+    return index.listNamespaces(index_uri)["namespaces"]
+
+
+def is_later_or_equal(partial_date, from_date):
+    """Tell whether `partial_date` is on or after `from_date`.
+
+    Both arguments are sequences of date parts, most significant first,
+    compared pairwise as integers. Only the parts present in both sequences
+    are looked at, so a shorter `partial_date` that matches the start of
+    `from_date` counts as later-or-equal.
+    """
+    for candidate_part, bound_part in zip(partial_date, from_date):
+        candidate_value = int(candidate_part)
+        bound_value = int(bound_part)
+
+        # The first differing part settles the comparison
+        if candidate_value != bound_value:
+            return candidate_value > bound_value
+
+    # Every compared part was equal
+    return True
+
+
+def get_task_metrics_from_date(model, date):
+    """Download every metrics file indexed for `model` on `date` or later.
+
+    Walks the per-date index namespaces depth-first, starting at the root and
+    descending only into namespaces that are not before `date`. The metrics of
+    each indexed task found on the way are written to the current working
+    directory as `metric_<task namespace, dots replaced by underscores>.json`.
+
+    Args:
+        model: Model name used to build the index namespaces.
+        date: Dot-separated lower bound; a leading portion of a full date is
+            accepted. When None or empty, no lower bound is applied and all
+            namespaces are visited.
+
+    Raises:
+        ValueError: If a part of `date`, or a namespace name compared against
+            it, is not an integer.
+        requests.HTTPError: Propagated from `get_task_metrics_from_uri` when a
+            metrics file cannot be downloaded.
+    """
+    index = taskcluster.Index(get_taskcluster_options())
+
+    # Check that the index service answers before walking it
+    index.ping()
+
+    # The date is optional on the command line. Without it there is no lower
+    # bound: an empty list makes `is_later_or_equal` accept every namespace.
+    from_date = date.split(".") if date else []
+
+    # Stack of namespaces still to visit, each one a list of date parts.
+    # The empty list stands for the root level.
+    pending = [[]]
+
+    # Iteratively list all namespaces greater than or equal to the given date
+    while pending:
+        current_ns = pending.pop()
+
+        if current_ns:
+            ns_uri = DATE_URI.format(model, ".".join(current_ns))
+        else:
+            # Root level: no date part collected yet
+            ns_uri = ROOT_URI.format(model)
+
+        ns_full_uri = NAMESPACE_URI.format(ns_uri)
+
+        # NOTE(review): the index API may paginate these listings; only the
+        # first response is read here — confirm whether that matters.
+        for task in index.listTasks(ns_full_uri)["tasks"]:
+            task_uri = task["namespace"]
+            r = get_task_metrics_from_uri(task_uri)
+
+            # Write the file on disk, with an explicit encoding so the result
+            # does not depend on the locale
+            file_path = f"metric_{task_uri.replace('.', '_')}.json"
+            with open(file_path, "w", encoding="utf-8") as metric_file:
+                metric_file.write(r.text)
+            LOGGER.info(f"Metrics saved to {file_path!r}")
+
+        for namespace in get_namespaces(index, ns_full_uri):
+            new_ns = [*current_ns, namespace["name"]]
+
+            if not is_later_or_equal(new_ns, from_date):
+                LOGGER.debug("NEW namespace %s is before %s", new_ns, from_date)
+                continue
+
+            # Might not be efficient but size of `pending` shouldn't be too
+            # big as we are doing a depth-first traversal
+            if new_ns not in pending:
+                pending.append(new_ns)
+
+
+def main():
+    """Parse the command line and retrieve the metrics it asks for.
+
+    Takes a positional `model` and an optional positional `date`, and hands
+    both to `get_task_metrics_from_date`.
+    """
+    parser = argparse.ArgumentParser(description="Retrieve a model training metrics")
+    parser.add_argument("model", help="Which model to retrieve training metrics from.")
+    parser.add_argument(
+        "date",
+        nargs="?",
+        help="Which date should we retrieve training metrics from. Default to latest",
+    )
+
+    cli_args = parser.parse_args()
+
+    get_task_metrics_from_date(model=cli_args.model, date=cli_args.date)


 if __name__ == "__main__":