Refactor push_health.py recipe logic into a reusable 'Push' class

2019-05-23 09:43:36 -04:00 · 2019-05-23 09:43:36 -04:00 · b5e4451eae
--- a/ci_info/__init__.py
+++ b/ci_info/__init__.py
@@ -0,0 +1 @@
+from .push import Push
--- a/ci_info/push.py
+++ b/ci_info/push.py
@@ -0,0 +1,268 @@
+from argparse import Namespace
+from collections import defaultdict
+from dataclasses import dataclass
+from enum import Enum
+from typing import List
+
+import requests
+from adr.query import run_query
+from adr.util.memoize import memoize, memoized_property
+
+# Template for the hg.mozilla.org JSON view of a single revision. The path is
+# fixed to the "integration/" prefix, so only branches hosted there resolve.
+HGMO_JSON_URL = "https://hg.mozilla.org/integration/{branch}/rev/{rev}?style=json"
+
+
+class Status(Enum):
+    """Overall outcome of a task label once all of its tasks are considered."""
+    # Every task under the label had a non-failing result.
+    PASS = 0
+    # Every task under the label had a failing result.
+    FAIL = 1
+    # Tasks under the label disagreed (some failing, some not).
+    INTERMITTENT = 2
+
+
+@dataclass
+class Task:
+    """Contains information pertaining to a single task."""
+    # Label of the task; tasks sharing a label are summarized together.
+    label: str
+    # Runtime of the task. Presumably in seconds, since totals are divided by
+    # 3600 to report hours -- TODO confirm against the 'push_results' query.
+    duration: int
+    # Result string; 'busted', 'exception' and 'testfailed' are treated as failures.
+    result: str
+    # Classification string, e.g. 'not classified' or 'fixed by commit'.
+    classification: str
+
+
+@dataclass
+class LabelSummary:
+    """Aggregated view of every task that ran under one label (retriggers included)."""
+    label: str
+    tasks: List[Task]
+
+    def __post_init__(self):
+        # Every task handed in must carry the label being summarized.
+        assert all(task.label == self.label for task in self.tasks)
+
+    @property
+    def classifications(self):
+        """The distinct classifications found across the tasks.
+
+        Returns:
+            set: A set of classification strings.
+        """
+        return {task.classification for task in self.tasks}
+
+    @property
+    def results(self):
+        """The distinct results found across the tasks.
+
+        Returns:
+            set: A set of result strings.
+        """
+        return {task.result for task in self.tasks}
+
+    @memoized_property
+    def status(self):
+        """The combined status of all tasks under this label.
+
+        Returns:
+            Status or None: `Status.FAIL` if every task failed, `Status.PASS` if
+                none did, `Status.INTERMITTENT` if the tasks disagree, and None
+                when there are no tasks at all.
+        """
+        # One boolean per task: True when its result counts as a failure.
+        failure_flags = {
+            task.result in ('busted', 'exception', 'testfailed')
+            for task in self.tasks
+        }
+
+        if not failure_flags:
+            return None
+
+        # Both True and False present means the tasks did not agree.
+        if len(failure_flags) > 1:
+            return Status.INTERMITTENT
+
+        return Status.FAIL if failure_flags.pop() else Status.PASS
+
+
+class Push:
+    """A single push to a repository and the CI results associated with it.
+
+    Constructing a `Push` performs no I/O. Each property issues the query or
+    HTTP request it needs when it is accessed.
+    """
+
+    def __init__(self, rev, branch='autoland'):
+        """A representation of a single push.
+
+        Args:
+            rev (str): Revision of the top-most commit in the push.
+            branch (str): Branch to look on (default: autoland).
+        """
+        self.rev = rev
+        self.branch = branch
+
+    @property
+    def backedoutby(self):
+        """The revision of the commit which backs out this one or None.
+
+        Returns:
+            str or None: The commit revision which backs this push out (or None).
+        """
+        return self._hgmo.get('backedoutby') or None
+
+    @property
+    def backedout(self):
+        """Whether the push was backed out or not.
+
+        Returns:
+            bool: True if this push was backed out.
+        """
+        return bool(self.backedoutby)
+
+    @property
+    def pushid(self):
+        """The push id.
+
+        Returns:
+            int: The push id.
+        """
+        return self._hgmo['pushid']
+
+    @property
+    def parent(self):
+        """Returns the parent push of this push.
+
+        Walks the commit ancestry reported by hg.mozilla.org, starting at this
+        push's revision, until it reaches a commit whose push id differs from
+        this push's id.
+
+        Returns:
+            Push or None: A `Push` instance (on the same branch) representing
+                the parent push, or None if a commit without parents is
+                reached before a different push id is found.
+        """
+        other = self
+        while True:
+            candidate = None
+            for rev in other._hgmo['parents']:
+                # Stay on this push's branch rather than the default one.
+                candidate = Push(rev, branch=self.branch)
+                if candidate.pushid != self.pushid:
+                    return candidate
+
+            if candidate is None:
+                # No parents left to inspect; bail out instead of looping forever.
+                return None
+
+            # Every parent still belongs to this push, so keep walking from the
+            # last one that was checked.
+            other = candidate
+
+    @memoized_property
+    def tasks(self):
+        """All tasks that ran on the push, including retriggers and backfills.
+
+        Returns:
+            list: A list of `Task` objects.
+        """
+        args = Namespace(rev=self.rev)
+        data = run_query('push_results', args)['data']
+        return [Task(**kwargs) for kwargs in data]
+
+    @property
+    def task_labels(self):
+        """The set of task labels that ran on this push.
+
+        Returns:
+            set: A set of task labels (str).
+        """
+        return {t.label for t in self.tasks}
+
+    @memoized_property
+    def target_task_labels(self):
+        """The set of all task labels that could possibly run on this push.
+
+        Returns:
+            set: A set of task labels.
+        """
+        return set(self._get_decision_artifact('target-tasks.json'))
+
+    @memoized_property
+    def scheduled_task_labels(self):
+        """The set of task labels that were originally scheduled to run on this push.
+
+        This excludes retriggers and backfills.
+
+        Returns:
+            set: A set of task labels (str).
+        """
+        tasks = self._get_decision_artifact('task-graph.json').values()
+        return {t['label'] for t in tasks}
+
+    @property
+    def unscheduled_task_labels(self):
+        """The set of task labels from tasks that were not originally scheduled on
+        the push (i.e they were scheduled via backfill or Add New Jobs).
+
+        Returns:
+            set: A set of task labels (str).
+        """
+        return self.task_labels - self.scheduled_task_labels
+
+    @memoized_property
+    def label_summaries(self):
+        """All label summaries combining retriggers.
+
+        Returns:
+            dict: A dictionary of the form {<label>: <LabelSummary>}.
+        """
+        grouped = defaultdict(list)
+        for task in self.tasks:
+            grouped[task.label].append(task)
+        return {label: LabelSummary(label, tasks) for label, tasks in grouped.items()}
+
+    @memoized_property
+    def duration(self):
+        """The total duration of all tasks that ran on the push.
+
+        Returns:
+            int: Runtime in hours.
+        """
+        return int(sum(t.duration for t in self.tasks) / 3600)
+
+    @memoized_property
+    def scheduled_duration(self):
+        """The total runtime of tasks, counting each label only once.
+
+        Only the first task seen for a label contributes, so retriggers of an
+        already counted label are ignored. Labels are not filtered against
+        `scheduled_task_labels`.
+
+        Returns:
+            int: Runtime in hours.
+        """
+        seen = set()
+        duration = 0
+        for task in self.tasks:
+            if task.label not in seen:
+                seen.add(task.label)
+                duration += task.duration
+        return int(duration / 3600)
+
+    @memoized_property
+    def regressions(self):
+        """The set of all task labels that were regressed by this push.
+
+        A label counts as regressed when its summary status is not
+        `Status.PASS` and at least one of its classifications is
+        'not classified' or 'fixed by commit'.
+
+        Returns:
+            set: Set of task labels (str).
+        """
+        regressions = set()
+        for label, summary in self.label_summaries.items():
+            if summary.status == Status.PASS:
+                continue
+
+            if any(c in ('not classified', 'fixed by commit') for c in summary.classifications):
+                regressions.add(label)
+        return regressions
+
+    @property
+    def regressions_missed(self):
+        """The set of all task labels that were regressed by this push and were
+        not caught by a task that was initially scheduled. E.g the regression was
+        a retrigger/backfill.
+
+        Returns:
+            set: Set of task labels (str).
+        """
+        return self.regressions - self.scheduled_task_labels
+
+    @property
+    def regressions_caught(self):
+        """The set of all task labels that were regressed by this push and were
+        caught by a task that was initially scheduled.
+
+        Returns:
+            set: Set of task labels (str).
+        """
+        return self.regressions & self.scheduled_task_labels
+
+    @memoized_property
+    def _decision_artifact_urls(self):
+        """All artifact urls from the Decision task of this push.
+
+        Returns:
+            list: A list of urls.
+        """
+        return run_query('decision_artifacts', Namespace(rev=self.rev))['data'][0]['artifacts']
+
+    @memoize
+    def _get_decision_artifact(self, name):
+        """Get an artifact from Decision task of this push.
+
+        Args:
+            name (str): Name of the artifact to fetch.
+
+        Returns:
+            dict or None: JSON representation of the artifact, or None when no
+                artifact url ends with `name`.
+
+        Raises:
+            requests.HTTPError: If the artifact request returns an error status.
+        """
+        for url in self._decision_artifact_urls:
+            # The artifact name is the final path component of its url.
+            if url.rsplit('/', 1)[-1] == name:
+                response = requests.get(url)
+                # Fail loudly on an HTTP error instead of parsing an error page.
+                response.raise_for_status()
+                return response.json()
+        return None
+
+    @memoized_property
+    def _hgmo(self):
+        """A JSON dict obtained from hg.mozilla.org.
+
+        Returns:
+            dict: Information regarding this push.
+
+        Raises:
+            requests.HTTPError: If hg.mozilla.org returns an error status.
+        """
+        url = HGMO_JSON_URL.format(branch=self.branch, rev=self.rev)
+        response = requests.get(url)
+        # Fail loudly on an HTTP error instead of parsing an error payload.
+        response.raise_for_status()
+        return response.json()
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,6 +4,7 @@ version = "0.1.0"
 description = "Collection of CI related ActiveData recipes."

 packages = [
+    { include = "ci_info" },
    { include = "recipes" },
 ]
 include = [
--- a/recipes/push_health.py
+++ b/recipes/push_health.py
@@ -1,119 +1,35 @@
 """
-Show information related to how "healthy" a push looks. Only works with
-autoland for now.
+Show information related to how "healthy" a push looks (autoland only).

 .. code-block:: bash

    adr push_health -r <revision>
 """

-from argparse import Namespace
-from collections import defaultdict, namedtuple
-from dataclasses import dataclass
-from enum import Enum
-
-import requests
-
-from adr.query import run_query
-from adr.util import memoize
-
-
-class Status(Enum):
-    PASS = 0
-    FAIL = 1
-    INTERMITTENT = 2
-
-
-@dataclass
-class Task:
-    label: str
-    duration: int
-    result: Status
-    classification: str
-
-
-@memoize
-def get_decision_artifact_urls(rev):
-    """Return all artifact urls from the Decision task of the given revision.
-
-    This function is memoized, so it will only run the 'decision_artifacts'
-    query a single time for any given revision.
-
-    Args:
-        rev (str): Revision associated with the push on treeherder.
-
-    Returns:
-        list: List of artifact urls.
-    """
-    return run_query('decision_artifacts', Namespace(rev=rev))['data'][0]['artifacts']
-
-
-def get_decision_artifact(rev, name):
-    """Get an artifact from Decision task of the given revision.
-
-    Args:
-        rev (str): Revision associated with the push on treeherder.
-        name (str): Name of the artifact fetch.
-
-    Returns:
-        dict: JSON representation of the artifact.
-    """
-    for url in get_decision_artifact_urls(rev):
-        if url.rsplit('/', 1)[1] == name:
-            return requests.get(url).json()
+from ci_info import Push


 def run(args):
+    """Build the push health table for a single push.
+
+    Args:
+        args: Parsed recipe arguments; only `args.rev` is read here.
+
+    Returns:
+        list: Two rows, a header row followed by one row of values for the push.
+    """
-    target_task_set = set(get_decision_artifact(args.rev, 'target-tasks.json'))
-    task_set = set([v['label'] for v in get_decision_artifact(args.rev, 'task-graph.json').values()])
+    push = Push(args.rev)

-    data = run_query('push_results', args)['data']
-    tasks = [Task(**kwargs) for kwargs in data]
+    # Share of the target task labels that were actually scheduled, in percent.
+    num_scheduled = len(push.scheduled_task_labels)
+    num_total = len(push.target_task_labels)
+    percentage = round(float(num_scheduled) / num_total * 100, 1)

-    labels = defaultdict(lambda: {'status': None, 'classifications': set()})
-    duration = reg_caught = reg_missed = 0
-    for task in tasks:
-        label = labels[task.label]
-        s = Status.FAIL if task.result in ('busted', 'exception', 'testfailed') else Status.PASS
-
-        if label['status'] is None:
-            label['status'] = s
-            # Don't count retriggers in total duration as it is beyond the
-            # scheduler's control.
-            duration += task.duration
-        elif label['status'] != s:
-            label['status'] = Status.INTERMITTENT
-
-        label['classifications'].add(task.classification)
-
-
-    for label, value in labels.items():
-        status = value['status']
-        classifications = value['classifications']
-
-        if status == Status.PASS:
-            continue
-
-        if any(c in ('not classified', 'fixed by commit') for c in classifications):
-            if label in task_set:
-                reg_caught += 1
-            else:
-                reg_missed += 1
-
-
-    header = [
+    return [[
        'Tasks Scheduled',
        'Tasks Total',
        'Percentage',
        'Total Hours',
+        'Backed Out',
        'Regressions Caught',
        'Regressions Missed',
-    ]
-    num_scheduled = len(task_set)
-    num_total = len(target_task_set)
-    percentage = round(float(num_scheduled) / num_total * 100, 1)
-    hours = int(duration / 3600)
-
-    result = [[num_scheduled, num_total, percentage, hours, reg_caught, reg_missed]]
-    result.insert(0, header)
-    return result
+    ], [
+        num_scheduled,
+        num_total,
+        percentage,
+        push.scheduled_duration,
+        push.backedout,
+        len(push.regressions_caught),
+        len(push.regressions_missed),
+    ]]