mozperftest-tools/task_processor.py


import os
import re
import glob
import json

TESTING = 0
SILENT = False


def log(msg):
    # Output the message if we are not running in silent mode
    if not SILENT:
        print(msg)


def pattern_match(name, artifacts_to_get):
    """
    Match an artifact that was requested with the name we have.
    """
    if not artifacts_to_get:
        return None
    for aname in artifacts_to_get:
        if aname in name:
            return aname
    return None
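
# Illustrative calls (file names are hypothetical, not from a real task):
#   pattern_match("perfherder-data.json", ["perfherder-data"])
#       -> "perfherder-data"
#   pattern_match("cold-video-1.mp4", ["perfherder-data"])
#       -> None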


def sorted_nicely(data):
    """
    Sort the given iterable in the way that humans expect.
    """
    convert = lambda text: int(text) if text.isdigit() else text
    alphanum_key = lambda key: [convert(c) for c in re.split("([0-9]+)", key)]
    return sorted(data, key=alphanum_key)
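
# For example (hypothetical run folder names), a plain sorted() would put
# "10" before "2", while the natural ordering here does not:
#   sorted(["1", "10", "2"])        -> ["1", "10", "2"]
#   sorted_nicely(["1", "10", "2"]) -> ["1", "2", "10"]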


def match_vismets_with_videos(task_group_id, path, vismet_task_ids):
    """
    Returns a mapping from vismet task IDs to the browsertime
    tasks that produced the videos they processed.
    """
    task_dir = os.path.join(path, task_group_id)
    taskgraph_json = os.path.join(task_dir, "task-group-information.json")

    with open(taskgraph_json) as f:
        taskgraph = json.load(f)

    # Map each requested vismet task to the browsertime task it
    # fetches its videos from (found through MOZ_FETCHES)
    mapping = {task_id: None for task_id in vismet_task_ids}
    for task in taskgraph:
        task_id = task.get("status", {}).get("taskId", "")
        if task_id not in mapping:
            continue
        vismet_fetches = json.loads(task["task"]["payload"]["env"]["MOZ_FETCHES"])
        for fetch in vismet_fetches:
            if "browsertime-results" in fetch["artifact"]:
                mapping[task_id] = fetch["task"]
                break
        # Check the values, not the keys, so that we only stop once
        # every vismet task has been matched with a browsertime task
        if all(mapping.values()):
            break

    return mapping
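
# For reference, the MOZ_FETCHES environment variable parsed above holds a
# JSON list of fetch entries; the keys used here are "artifact" (the artifact
# path) and "task" (the ID of the task that produced it). A sketch with
# hypothetical values:
#   [{"artifact": "public/test_info/browsertime-results.tgz",
#     "task": "abc123DEFghi..."}]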


def get_task_data_paths(
    task_group_id,
    path,
    run_number=None,
    artifact=[],
    artifact_dir="",
    suite_matcher="",
    silent=False,
):
    """
    Opens a folder for a task group and returns the files
    contained within it.
    """
    global SILENT
    SILENT = silent

    if not isinstance(artifact, list):
        artifact = [artifact]

    data = {}

    # Get the directory to search
    task_dir = os.path.join(path, task_group_id)
    if not os.path.exists(task_dir):
        log("Cannot open task directory: %s" % task_dir)
        # Return an empty mapping so callers can iterate safely
        return data

    if run_number is None:
        # Use the highest-numbered run directory as the latest run,
        # skipping any folders that are not numbered
        max_num = 0
        for subdir in next(os.walk(task_dir))[1]:
            if not subdir.isdigit():
                continue
            run_num = int(subdir)
            if run_num > max_num:
                max_num = run_num
        run_number = max_num

        log(
            "No run number supplied. Using the latest one, run number %s"
            % run_number
        )
    run_dir = os.path.join(task_dir, str(run_number))
    all_suites = [
        f for f in os.listdir(run_dir) if os.path.isdir(os.path.join(run_dir, f))
    ]

    # Find all the data for this task group
    for suite in all_suites:
        for aname in artifact:
            if suite_matcher and suite_matcher not in suite:
                continue

            suite_dir = os.path.join(run_dir, suite)

            # Get the suite's data directory
            if not artifact_dir:
                artifact_dir = aname
            all_dirs = [
                f
                for f in os.listdir(suite_dir)
                if os.path.isdir(os.path.join(suite_dir, f))
            ]
            suite_data_dir = None
            for d in all_dirs:
                if pattern_match(d, [aname]) or (
                    not artifact_dir and d.endswith("_data")
                ):
                    suite_data_dir = os.path.join(suite_dir, d)
                    break
            if not suite_data_dir:
                log("Cannot find data directory in %s, skipping" % suite_dir)
                continue

            # Now find all data files and order them
            all_files = glob.glob(os.path.join(suite_data_dir, "**/*"), recursive=True)
            all_files = [
                file
                for file in all_files
                if artifact and pattern_match(os.path.split(file)[-1], [aname])
            ]

            if suite not in data:
                data[suite] = []
            data[suite].extend(all_files)
            data[suite] = sorted_nicely(data[suite])

    return data
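
# A usage sketch (the ID and path are hypothetical); the return value maps
# each suite folder name to a naturally-sorted list of matching file paths:
#   paths = get_task_data_paths(
#       "abc123DEFghi...",
#       "/tmp/task-data",
#       artifact="perfherder-data",
#   )
#   # e.g. {"browsertime-tp6-amazon": [".../perfherder-data.json", ...]}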


def get_task_data(
    task_group_id, path, run_number=None, artifact="", suite_matcher="", silent=False
):
    """
    Gets the task data paths and opens the data in a detected
    file format. By default, when an unknown file format is
    encountered, the lines will be read and returned.
    """
    global SILENT
    SILENT = silent

    data = {}
    data_paths = get_task_data_paths(
        task_group_id,
        path,
        run_number=run_number,
        artifact=artifact,
        suite_matcher=suite_matcher,
        silent=silent,
    )

    for suite, paths in data_paths.items():
        data[suite] = []
        # data_path avoids shadowing the `path` argument above
        for data_path in paths:
            tmpdata = None
            log("Opening %s..." % data_path)
            if data_path.endswith(".json"):
                with open(data_path, "r") as f:
                    tmpdata = json.load(f)
            else:
                # Unknown file format; fall back to reading raw lines
                with open(data_path, "r") as f:
                    tmpdata = f.readlines()
            data[suite].append({"data": tmpdata, "file": data_path})

    return data
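
# Each record in the returned lists pairs the parsed contents with the file
# it came from, e.g. {"data": {...}, "file": ".../perfherder-data.json"}
# (the path shown here is illustrative).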


if __name__ == "__main__":
    if TESTING:
        data = get_task_data_paths(
            "SssyewAFQiKm40PIouxo_g",
            "/home/sparky/mozilla-source/analysis-scripts/perfunct-testing-data",
            artifact="perfherder-data",
            run_number="4",
        )
        print(json.dumps(data, indent=4))
        data = get_task_data(
            "SssyewAFQiKm40PIouxo_g",
            "/home/sparky/mozilla-source/analysis-scripts/perfunct-testing-data",
            artifact="perfherder-data",
            run_number="4",
        )