Mirror of https://github.com/mozilla/gecko-dev.git
Backed out changeset 67e5d2425c75 (bug 1565316) for causing raptor wasm failures. CLOSED TREE
Parent: 21b23f7c9d
Commit: 2c2409c49e
@@ -195,19 +195,21 @@ def write_test_settings_json(args, test_details, oskey):
test_settings['raptor-options']['unit'] = test_details.get("unit", "ms")

test_settings['raptor-options']['lower_is_better'] = test_details.get("lower_is_better", True)
test_settings['raptor-options']['lower_is_better'] = bool_from_str(
    test_details.get("lower_is_better", "true"))

# support optional subtest unit/lower_is_better fields
val = test_details.get('subtest_unit', test_settings['raptor-options']['unit'])
test_settings['raptor-options']['subtest_unit'] = val
subtest_lower_is_better = test_details.get('subtest_lower_is_better')
subtest_lower_is_better = test_details.get('subtest_lower_is_better', None)

if subtest_lower_is_better is None:
    # default to main test values if not set
    test_settings['raptor-options']['subtest_lower_is_better'] = (
        test_settings['raptor-options']['lower_is_better'])
else:
    test_settings['raptor-options']['subtest_lower_is_better'] = subtest_lower_is_better
    test_settings['raptor-options']['subtest_lower_is_better'] = bool_from_str(
        subtest_lower_is_better)

if test_details.get("alert_change_type", None) is not None:
    test_settings['raptor-options']['alert_change_type'] = test_details['alert_change_type']
@@ -415,9 +417,6 @@ def get_raptor_test_list(args, oskey):
# remove the 'hero =' line since no longer measuring hero
del next_test['hero']

if next_test.get('lower_is_better') is not None:
    next_test['lower_is_better'] = bool_from_str(next_test.get('lower_is_better'))

# write out .json test setting files for the control server to read and send to web ext
if len(tests_to_run) != 0:
    for test in tests_to_run:
@@ -13,16 +13,13 @@ import filters
import json
import os

from abc import ABCMeta, abstractmethod
from logger.logger import RaptorLogger

LOG = RaptorLogger(component='perftest-output')
LOG = RaptorLogger(component='raptor-output')


class PerftestOutput(object):
    """Abstract base class to handle output of perftest results"""

    __metaclass__ = ABCMeta

class Output(object):
    """class for raptor output"""

    def __init__(self, results, supporting_data, subtest_alert_on):
        """
@@ -35,290 +32,6 @@ class PerftestOutput(object):
|
|||
self.summarized_screenshots = []
|
||||
self.subtest_alert_on = subtest_alert_on
|
||||
|
||||
@abstractmethod
|
||||
def summarize(self, test_names):
|
||||
raise NotImplementedError()
|
||||
|
||||
def summarize_supporting_data(self):
|
||||
'''
|
||||
Supporting data was gathered outside of the main raptor test; it will be kept
|
||||
separate from the main raptor test results. Summarize it appropriately.
|
||||
|
||||
supporting_data = {'type': 'data-type',
|
||||
'test': 'raptor-test-ran-when-data-was-gathered',
|
||||
'unit': 'unit that the values are in',
|
||||
'values': {
|
||||
'name': value,
|
||||
'nameN': valueN}}
|
||||
|
||||
More specifically, power data will look like this:
|
||||
|
||||
supporting_data = {'type': 'power',
|
||||
'test': 'raptor-speedometer-geckoview',
|
||||
'unit': 'mAh',
|
||||
'values': {
|
||||
'cpu': cpu,
|
||||
'wifi': wifi,
|
||||
'screen': screen,
|
||||
'proportional': proportional}}
|
||||
|
||||
We want to treat each value as a 'subtest'; and for the overall aggregated
|
||||
test result, we'll sum together all subtest values.
|
||||
'''
|
||||
if self.supporting_data is None:
|
||||
return
|
||||
|
||||
self.summarized_supporting_data = []
|
||||
|
||||
for data_set in self.supporting_data:
|
||||
suites = []
|
||||
test_results = {
|
||||
'framework': {
|
||||
'name': 'raptor',
|
||||
},
|
||||
'suites': suites,
|
||||
}
|
||||
|
||||
data_type = data_set['type']
|
||||
LOG.info("summarizing %s data" % data_type)
|
||||
|
||||
# suite name will be name of the actual raptor test that ran, plus the type of
|
||||
# supporting data i.e. 'raptor-speedometer-geckoview-power'
|
||||
vals = []
|
||||
subtests = []
|
||||
suite = {
|
||||
'name': data_set['test'] + "-" + data_set['type'],
|
||||
'type': data_set['type'],
|
||||
'subtests': subtests,
|
||||
'lowerIsBetter': True,
|
||||
'unit': data_set['unit'],
|
||||
'alertThreshold': 2.0
|
||||
}
|
||||
|
||||
suites.append(suite)
|
||||
|
||||
# each supporting data measurement becomes a subtest, with the measurement type
|
||||
# used for the subtest name. i.e. 'raptor-speedometer-geckoview-power-cpu'
|
||||
# the overall 'suite' value for supporting data will be the sum of all measurements
|
||||
for measurement_name, value in data_set['values'].iteritems():
|
||||
new_subtest = {}
|
||||
new_subtest['name'] = data_set['test'] + "-" + data_type + "-" + measurement_name
|
||||
new_subtest['value'] = value
|
||||
new_subtest['lowerIsBetter'] = True
|
||||
new_subtest['alertThreshold'] = 2.0
|
||||
new_subtest['unit'] = data_set['unit']
|
||||
subtests.append(new_subtest)
|
||||
vals.append([new_subtest['value'], new_subtest['name']])
|
||||
|
||||
if len(subtests) > 1:
|
||||
suite['value'] = self.construct_summary(vals, testname="supporting_data")
|
||||
|
||||
subtests.sort(key=lambda subtest: subtest['name'])
|
||||
suites.sort(key=lambda suite: suite['name'])
|
||||
|
||||
self.summarized_supporting_data.append(test_results)
|
||||
|
||||
return
|
||||
|
||||
def output(self, test_names):
|
||||
"""output to file and perfherder data json"""
|
||||
if os.getenv('MOZ_UPLOAD_DIR'):
|
||||
# i.e. testing/mozharness/build/raptor.json locally; in production it will
|
||||
# be at /tasks/task_*/build/ (where it will be picked up by mozharness later
|
||||
# and made into a tc artifact accessible in treeherder as perfherder-data.json)
|
||||
results_path = os.path.join(os.path.dirname(os.environ['MOZ_UPLOAD_DIR']),
|
||||
'raptor.json')
|
||||
screenshot_path = os.path.join(os.path.dirname(os.environ['MOZ_UPLOAD_DIR']),
|
||||
'screenshots.html')
|
||||
else:
|
||||
results_path = os.path.join(os.getcwd(), 'raptor.json')
|
||||
screenshot_path = os.path.join(os.getcwd(), 'screenshots.html')
|
||||
|
||||
if self.summarized_results == {}:
|
||||
LOG.error("no summarized raptor results found for %s" %
|
||||
', '.join(test_names))
|
||||
else:
|
||||
with open(results_path, 'w') as f:
|
||||
for result in self.summarized_results:
|
||||
f.write("%s\n" % result)
|
||||
|
||||
if len(self.summarized_screenshots) > 0:
|
||||
with open(screenshot_path, 'w') as f:
|
||||
for result in self.summarized_screenshots:
|
||||
f.write("%s\n" % result)
|
||||
LOG.info("screen captures can be found locally at: %s" % screenshot_path)
|
||||
|
||||
# now that we've checked for screen captures too, if there were no actual
|
||||
# test results we can bail out here
|
||||
if self.summarized_results == {}:
|
||||
return False, 0
|
||||
|
||||
# when gecko_profiling, we don't want results ingested by Perfherder
|
||||
extra_opts = self.summarized_results['suites'][0].get('extraOptions', [])
|
||||
test_type = self.summarized_results['suites'][0].get('type', '')
|
||||
|
||||
output_perf_data = True
|
||||
not_posting = '- not posting regular test results for perfherder'
|
||||
if 'gecko_profile' in extra_opts:
|
||||
LOG.info("gecko profiling enabled %s" % not_posting)
|
||||
output_perf_data = False
|
||||
elif test_type == 'scenario':
|
||||
# if a resource-usage flag was supplied the perfherder data
|
||||
# will still be output from output_supporting_data
|
||||
LOG.info("scenario test type was run %s" % not_posting)
|
||||
output_perf_data = False
|
||||
|
||||
total_perfdata = 0
|
||||
if output_perf_data:
|
||||
# if we have supporting data i.e. power, we ONLY want those measurements
|
||||
# dumped out. TODO: Bug 1515406 - Add option to output both supplementary
|
||||
# data (i.e. power) and the regular Raptor test result
|
||||
# Both are already available as separate PERFHERDER_DATA json blobs
|
||||
if len(self.summarized_supporting_data) == 0:
|
||||
LOG.info("PERFHERDER_DATA: %s" % json.dumps(self.summarized_results))
|
||||
total_perfdata = 1
|
||||
else:
|
||||
LOG.info("supporting data measurements exist - only posting those to perfherder")
|
||||
|
||||
json.dump(self.summarized_results, open(results_path, 'w'), indent=2,
|
||||
sort_keys=True)
|
||||
LOG.info("results can also be found locally at: %s" % results_path)
|
||||
|
||||
return True, total_perfdata
|
||||
|
||||
def output_supporting_data(self, test_names):
|
||||
'''
|
||||
Supporting data was gathered outside of the main raptor test; it has already
|
||||
been summarized, now output it appropriately.
|
||||
|
||||
We want to output supporting data in a completely separate perfherder json blob and
|
||||
in a corresponding file artifact. This way, supporting data can be ingested as its own
|
||||
test suite in perfherder and alerted upon if desired; kept outside of the test results
|
||||
from the actual Raptor test which was run when the supporting data was gathered.
|
||||
'''
|
||||
if len(self.summarized_supporting_data) == 0:
|
||||
LOG.error("no summarized supporting data found for %s" %
|
||||
', '.join(test_names))
|
||||
return False, 0
|
||||
|
||||
total_perfdata = 0
|
||||
for next_data_set in self.summarized_supporting_data:
|
||||
data_type = next_data_set['suites'][0]['type']
|
||||
|
||||
if os.environ['MOZ_UPLOAD_DIR']:
|
||||
# i.e. testing/mozharness/build/raptor.json locally; in production it will
|
||||
# be at /tasks/task_*/build/ (where it will be picked up by mozharness later
|
||||
# and made into a tc artifact accessible in treeherder as perfherder-data.json)
|
||||
results_path = os.path.join(os.path.dirname(os.environ['MOZ_UPLOAD_DIR']),
|
||||
'raptor-%s.json' % data_type)
|
||||
else:
|
||||
results_path = os.path.join(os.getcwd(), 'raptor-%s.json' % data_type)
|
||||
|
||||
# dump data to raptor-data.json artifact
|
||||
json.dump(next_data_set, open(results_path, 'w'), indent=2, sort_keys=True)
|
||||
|
||||
# the output that treeherder expects to find
|
||||
LOG.info("PERFHERDER_DATA: %s" % json.dumps(next_data_set))
|
||||
LOG.info("%s results can also be found locally at: %s" % (data_type, results_path))
|
||||
total_perfdata += 1
|
||||
|
||||
return True, total_perfdata
|
||||
|
||||
def construct_summary(self, vals, testname):
|
||||
|
||||
def _filter(vals, value=None):
|
||||
if value is None:
|
||||
return [i for i, j in vals]
|
||||
return [i for i, j in vals if j == value]
|
||||
|
||||
if testname.startswith('raptor-v8_7'):
|
||||
return 100 * filters.geometric_mean(_filter(vals))
|
||||
|
||||
if testname.startswith('raptor-speedometer'):
|
||||
correctionFactor = 3
|
||||
results = _filter(vals)
|
||||
# speedometer has 16 tests, each of these are made of up 9 subtests
|
||||
# and a sum of the 9 values. We receive 160 values, and want to use
|
||||
# the 16 test values, not the sub test values.
|
||||
if len(results) != 160:
|
||||
raise Exception("Speedometer has 160 subtests, found: %s instead" % len(results))
|
||||
|
||||
results = results[9::10]
|
||||
score = 60 * 1000 / filters.geometric_mean(results) / correctionFactor
|
||||
return score
|
||||
|
||||
if testname.startswith('raptor-stylebench'):
|
||||
# see https://bug-172968-attachments.webkit.org/attachment.cgi?id=319888
|
||||
correctionFactor = 3
|
||||
results = _filter(vals)
|
||||
|
||||
# stylebench has 5 tests, each of these are made of up 5 subtests
|
||||
#
|
||||
# * Adding classes.
|
||||
# * Removing classes.
|
||||
# * Mutating attributes.
|
||||
# * Adding leaf elements.
|
||||
# * Removing leaf elements.
|
||||
#
|
||||
# which are made of two subtests each (sync/async) and repeated 5 times
|
||||
# each, thus, the list here looks like:
|
||||
#
|
||||
# [Test name/Adding classes - 0/ Sync; <x>]
|
||||
# [Test name/Adding classes - 0/ Async; <y>]
|
||||
# [Test name/Adding classes - 0; <x> + <y>]
|
||||
# [Test name/Removing classes - 0/ Sync; <x>]
|
||||
# [Test name/Removing classes - 0/ Async; <y>]
|
||||
# [Test name/Removing classes - 0; <x> + <y>]
|
||||
# ...
|
||||
# [Test name/Adding classes - 1 / Sync; <x>]
|
||||
# [Test name/Adding classes - 1 / Async; <y>]
|
||||
# [Test name/Adding classes - 1 ; <x> + <y>]
|
||||
# ...
|
||||
# [Test name/Removing leaf elements - 4; <x> + <y>]
|
||||
# [Test name; <sum>] <- This is what we want.
|
||||
#
|
||||
# So, 5 (subtests) *
|
||||
# 5 (repetitions) *
|
||||
# 3 (entries per repetition (sync/async/sum)) =
|
||||
# 75 entries for test before the sum.
|
||||
#
|
||||
# We receive 76 entries per test, which ads up to 380. We want to use
|
||||
# the 5 test entries, not the rest.
|
||||
if len(results) != 380:
|
||||
raise Exception("StyleBench has 380 entries, found: %s instead" % len(results))
|
||||
results = results[75::76]
|
||||
return 60 * 1000 / filters.geometric_mean(results) / correctionFactor
|
||||
|
||||
if testname.startswith(('raptor-kraken', 'raptor-sunspider', 'supporting_data')):
|
||||
return sum(_filter(vals))
|
||||
|
||||
if testname.startswith(('raptor-unity-webgl', 'raptor-webaudio')):
|
||||
# webaudio_score and unity_webgl_score: self reported as 'Geometric Mean'
|
||||
return filters.mean(_filter(vals, 'Geometric Mean'))
|
||||
|
||||
if testname.startswith('raptor-assorted-dom'):
|
||||
return round(filters.geometric_mean(_filter(vals)), 2)
|
||||
|
||||
if testname.startswith('raptor-wasm-misc'):
|
||||
# wasm_misc_score: self reported as '__total__'
|
||||
return filters.mean(_filter(results, '__total__'))
|
||||
|
||||
if testname.startswith('raptor-wasm-godot'):
|
||||
# wasm_godot_score: first-interactive mean
|
||||
return filters.mean(_filter(vals, 'first-interactive'))
|
||||
|
||||
if testname.startswith('raptor-youtube-playback'):
|
||||
return round(filters.mean(_filter(vals)), 2)
|
||||
|
||||
if len(vals) > 1:
|
||||
return round(filters.geometric_mean(_filter(vals)), 2)
|
||||
|
||||
return round(filters.mean(_filter(vals)), 2)
|
||||
|
||||
|
||||
class RaptorOutput(PerftestOutput):
|
||||
"""class for raptor output"""
|
||||
|
||||
def summarize(self, test_names):
|
||||
suites = []
|
||||
test_results = {
|
||||
|
@@ -460,10 +173,6 @@ class RaptorOutput(PerftestOutput):
if len(subtests) > 1:
    suite['value'] = self.construct_summary(vals, testname=test.name)

subtests.sort(key=lambda subtest: subtest['name'])

suites.sort(key=lambda suite: suite['name'])

self.summarized_results = test_results

def combine_browser_cycles(self):
@@ -585,6 +294,95 @@ class RaptorOutput(PerftestOutput):
|
|||
self.summarized_results['suites'] = [item for item in self.summarized_results['suites']
|
||||
if item.get('to_be_deleted') is not True]
|
||||
|
||||
def summarize_supporting_data(self):
|
||||
'''
|
||||
Supporting data was gathered outside of the main raptor test; it will be kept
|
||||
separate from the main raptor test results. Summarize it appropriately.
|
||||
|
||||
supporting_data = {'type': 'data-type',
|
||||
'test': 'raptor-test-ran-when-data-was-gathered',
|
||||
'unit': 'unit that the values are in',
|
||||
'values': {
|
||||
'name': value,
|
||||
'nameN': valueN}}
|
||||
|
||||
More specifically, power data will look like this:
|
||||
|
||||
supporting_data = {'type': 'power',
|
||||
'test': 'raptor-speedometer-geckoview',
|
||||
'unit': 'mAh',
|
||||
'values': {
|
||||
'cpu': cpu,
|
||||
'wifi': wifi,
|
||||
'screen': screen,
|
||||
'proportional': proportional}}
|
||||
|
||||
We want to treat each value as a 'subtest'; and for the overall aggregated
|
||||
test result we will add all of these subtest values togther.
|
||||
'''
|
||||
if self.supporting_data is None:
|
||||
return
|
||||
|
||||
self.summarized_supporting_data = []
|
||||
support_data_by_type = {}
|
||||
|
||||
for data_set in self.supporting_data:
|
||||
|
||||
data_type = data_set['type']
|
||||
LOG.info("summarizing %s data" % data_type)
|
||||
|
||||
if data_type not in support_data_by_type:
|
||||
support_data_by_type[data_type] = {
|
||||
'framework': {
|
||||
'name': 'raptor',
|
||||
},
|
||||
'suites': [],
|
||||
}
|
||||
|
||||
# suite name will be name of the actual raptor test that ran, plus the type of
|
||||
# supporting data i.e. 'raptor-speedometer-geckoview-power'
|
||||
vals = []
|
||||
subtests = []
|
||||
suite = {
|
||||
'name': data_set['test'] + "-" + data_set['type'],
|
||||
'type': data_set['type'],
|
||||
'subtests': subtests,
|
||||
'lowerIsBetter': True,
|
||||
'unit': data_set['unit'],
|
||||
'alertThreshold': 2.0
|
||||
}
|
||||
|
||||
support_data_by_type[data_type]['suites'].append(suite)
|
||||
|
||||
# each supporting data measurement becomes a subtest, with the measurement type
|
||||
# used for the subtest name. i.e. 'power-cpu'
|
||||
# the overall 'suite' value for supporting data is dependent on
|
||||
# the unit of the values, by default the sum of all measurements
|
||||
# is taken.
|
||||
for measurement_name, value in data_set['values'].iteritems():
|
||||
new_subtest = {}
|
||||
new_subtest['name'] = data_type + "-" + measurement_name
|
||||
new_subtest['value'] = value
|
||||
new_subtest['lowerIsBetter'] = True
|
||||
new_subtest['alertThreshold'] = 2.0
|
||||
new_subtest['unit'] = data_set['unit']
|
||||
subtests.append(new_subtest)
|
||||
vals.append([new_subtest['value'], new_subtest['name']])
|
||||
|
||||
if len(subtests) >= 1:
|
||||
suite['value'] = self.construct_summary(
|
||||
vals,
|
||||
testname="supporting_data",
|
||||
unit=data_set['unit']
|
||||
)
|
||||
|
||||
# split the supporting data by type, there will be one
|
||||
# perfherder output per type
|
||||
for data_type in support_data_by_type:
|
||||
self.summarized_supporting_data.append(support_data_by_type[data_type])
|
||||
|
||||
return
|
||||
|
||||
def parseSpeedometerOutput(self, test):
|
||||
# each benchmark 'index' becomes a subtest; each pagecycle / iteration
|
||||
# of the test has multiple values per index/subtest
|
||||
|
@@ -1089,8 +887,8 @@ class RaptorOutput(PerftestOutput):
for pagecycle in data:
    for _sub, _value in pagecycle[0].iteritems():
        try:
            percent_dropped = (float(_value['droppedFrames']) /
                               _value['decodedFrames'] * 100.0)
            percent_dropped = float(_value['droppedFrames']) / _value['decodedFrames'] \
                * 100.0
        except ZeroDivisionError:
            # if no frames have been decoded the playback failed completely
            percent_dropped = 100.0
@@ -1160,126 +958,281 @@ class RaptorOutput(PerftestOutput):
|
|||
|
||||
self.summarized_screenshots.append("""</table></body> </html>""")
|
||||
|
||||
def output(self, test_names):
|
||||
"""output to file and perfherder data json """
|
||||
if os.getenv('MOZ_UPLOAD_DIR'):
|
||||
# i.e. testing/mozharness/build/raptor.json locally; in production it will
|
||||
# be at /tasks/task_*/build/ (where it will be picked up by mozharness later
|
||||
# and made into a tc artifact accessible in treeherder as perfherder-data.json)
|
||||
results_path = os.path.join(os.path.dirname(os.environ['MOZ_UPLOAD_DIR']),
|
||||
'raptor.json')
|
||||
screenshot_path = os.path.join(os.path.dirname(os.environ['MOZ_UPLOAD_DIR']),
|
||||
'screenshots.html')
|
||||
else:
|
||||
results_path = os.path.join(os.getcwd(), 'raptor.json')
|
||||
screenshot_path = os.path.join(os.getcwd(), 'screenshots.html')
|
||||
|
||||
class BrowsertimeOutput(PerftestOutput):
|
||||
"""class for browsertime output"""
|
||||
|
||||
def summarize(self, test_names):
|
||||
"""
|
||||
Summarize the parsed browsertime test output, and format accordingly so the output can
|
||||
be ingested by Perfherder.
|
||||
|
||||
At this point each entry in self.results for browsertime-pageload tests is in this format:
|
||||
|
||||
{'statistics':{'fcp': {u'p99': 932, u'mdev': 10.0941, u'min': 712, u'p90': 810, u'max':
|
||||
932, u'median': 758, u'p10': 728, u'stddev': 50, u'mean': 769}, 'dcf': {u'p99': 864,
|
||||
u'mdev': 11.6768, u'min': 614, u'p90': 738, u'max': 864, u'median': 670, u'p10': 632,
|
||||
u'stddev': 58, u'mean': 684}, 'fnbpaint': {u'p99': 830, u'mdev': 9.6851, u'min': 616,
|
||||
u'p90': 719, u'max': 830, u'median': 668, u'p10': 642, u'stddev': 48, u'mean': 680},
|
||||
'loadtime': {u'p99': 5818, u'mdev': 111.7028, u'min': 3220, u'p90': 4450, u'max': 5818,
|
||||
u'median': 3476, u'p10': 3241, u'stddev': 559, u'mean': 3642}}, 'name':
|
||||
'raptor-tp6-guardian-firefox', 'url': 'https://www.theguardian.co.uk', 'lower_is_better':
|
||||
True, 'measurements': {'fcp': [932, 744, 744, 810, 712, 775, 759, 744, 777, 739, 809, 906,
|
||||
734, 742, 760, 758, 728, 792, 757, 759, 742, 759, 775, 726, 730], 'dcf': [864, 679, 637,
|
||||
662, 652, 651, 710, 679, 646, 689, 686, 845, 670, 694, 632, 703, 670, 738, 633, 703, 614,
|
||||
703, 650, 622, 670], 'fnbpaint': [830, 648, 666, 704, 616, 683, 678, 650, 685, 651, 719,
|
||||
820, 634, 664, 681, 664, 642, 703, 668, 670, 669, 668, 681, 652, 642], 'loadtime': [4450,
|
||||
3592, 3770, 3345, 3453, 3220, 3434, 3621, 3511, 3416, 3430, 5818, 4729, 3406, 3506, 3588,
|
||||
3245, 3381, 3707, 3241, 3595, 3483, 3236, 3390, 3476]}, 'subtest_unit': 'ms', 'bt_ver':
|
||||
'4.9.2-android', 'alert_threshold': 2, 'cold': True, 'type': 'browsertime-pageload',
|
||||
'unit': 'ms', 'browser': "{u'userAgent': u'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13;
|
||||
rv:70.0) Gecko/20100101 Firefox/70.0', u'windowSize': u'1366x694'}"}
|
||||
|
||||
Now we must process this further and prepare the result for output suitable for perfherder
|
||||
ingestion.
|
||||
|
||||
Note: For the overall subtest values/results (i.e. for each measurement type) we will use
|
||||
the Browsertime-provided statistics, instead of calcuating our own geomeans from the
|
||||
replicates.
|
||||
"""
|
||||
LOG.info("preparing browsertime results for output")
|
||||
|
||||
suites = []
|
||||
test_results = {
|
||||
'framework': {
|
||||
'name': 'browsertime',
|
||||
},
|
||||
'suites': suites,
|
||||
}
|
||||
|
||||
# check if we actually have any results
|
||||
if len(self.results) == 0:
|
||||
LOG.error("no browsertime test results found for %s" %
|
||||
if self.summarized_results == {}:
|
||||
LOG.error("no summarized raptor results found for %s" %
|
||||
', '.join(test_names))
|
||||
return
|
||||
else:
|
||||
with open(results_path, 'w') as f:
|
||||
for result in self.summarized_results:
|
||||
f.write("%s\n" % result)
|
||||
|
||||
for test in self.results:
|
||||
vals = []
|
||||
subtests = []
|
||||
suite = {
|
||||
'name': test['name'],
|
||||
'type': test['type'],
|
||||
'extraOptions': test['extra_options'],
|
||||
'subtests': subtests,
|
||||
'lowerIsBetter': test['lower_is_better'],
|
||||
'unit': test['unit'],
|
||||
'alertThreshold': float(test['alert_threshold'])
|
||||
}
|
||||
if len(self.summarized_screenshots) > 0:
|
||||
with open(screenshot_path, 'w') as f:
|
||||
for result in self.summarized_screenshots:
|
||||
f.write("%s\n" % result)
|
||||
LOG.info("screen captures can be found locally at: %s" % screenshot_path)
|
||||
|
||||
# Check if the test has set optional properties
|
||||
if hasattr(test, "alert_change_type"):
|
||||
suite['alertChangeType'] = test['alert_change_type']
|
||||
# now that we've checked for screen captures too, if there were no actual
|
||||
# test results we can bail out here
|
||||
if self.summarized_results == {}:
|
||||
return False, 0
|
||||
|
||||
# process results for pageloader type of tests
|
||||
if test["type"] != "browsertime-pageload":
|
||||
LOG.error("output.summarize received unsupported test results type for %s" %
|
||||
test['name'])
|
||||
continue
|
||||
# when gecko_profiling, we don't want results ingested by Perfherder
|
||||
extra_opts = self.summarized_results['suites'][0].get('extraOptions', [])
|
||||
test_type = self.summarized_results['suites'][0].get('type', '')
|
||||
|
||||
suites.append(suite)
|
||||
output_perf_data = True
|
||||
not_posting = '- not posting regular test results for perfherder'
|
||||
if 'gecko_profile' in extra_opts:
|
||||
LOG.info("gecko profiling enabled %s" % not_posting)
|
||||
output_perf_data = False
|
||||
elif test_type == 'scenario':
|
||||
# if a resource-usage flag was supplied the perfherder data
|
||||
# will still be output from output_supporting_data
|
||||
LOG.info("scenario test type was run %s" % not_posting)
|
||||
output_perf_data = False
|
||||
|
||||
for measurement_name, replicates in test['measurements'].iteritems():
|
||||
new_subtest = {}
|
||||
new_subtest['name'] = measurement_name
|
||||
new_subtest['replicates'] = replicates
|
||||
new_subtest['lowerIsBetter'] = test['subtest_lower_is_better']
|
||||
new_subtest['alertThreshold'] = float(test['alert_threshold'])
|
||||
new_subtest['value'] = 0
|
||||
new_subtest['unit'] = test['subtest_unit']
|
||||
total_perfdata = 0
|
||||
if output_perf_data:
|
||||
# if we have supporting data i.e. power, we ONLY want those measurements
|
||||
# dumped out. TODO: Bug 1515406 - Add option to output both supplementary
|
||||
# data (i.e. power) and the regular Raptor test result
|
||||
# Both are already available as separate PERFHERDER_DATA json blobs
|
||||
if len(self.summarized_supporting_data) == 0:
|
||||
LOG.info("PERFHERDER_DATA: %s" % json.dumps(self.summarized_results))
|
||||
total_perfdata = 1
|
||||
else:
|
||||
LOG.info("supporting data measurements exist - only posting those to perfherder")
|
||||
|
||||
# if 'alert_on' is set for this particular measurement, then we want to set the
|
||||
# flag in the perfherder output to turn on alerting for this subtest
|
||||
if self.subtest_alert_on is not None:
|
||||
if measurement_name in self.subtest_alert_on:
|
||||
LOG.info("turning on subtest alerting for measurement type: %s"
|
||||
% measurement_name)
|
||||
new_subtest['shouldAlert'] = True
|
||||
json.dump(self.summarized_results, open(results_path, 'w'), indent=2,
|
||||
sort_keys=True)
|
||||
LOG.info("results can also be found locally at: %s" % results_path)
|
||||
|
||||
# for the subtest (page-load measurement type) overall score/result/value, we
|
||||
# want to use the median of the replicates - now instead of calculating this
|
||||
# ourselves, we will take this value from the browsertime results themselves
|
||||
# as browsertime calculates the mean (and other values) automatically for us
|
||||
bt_measurement_median = test['statistics'][measurement_name]['median']
|
||||
new_subtest['value'] = bt_measurement_median
|
||||
return True, total_perfdata
|
||||
|
||||
# we have a vals list that contains all the top level results for each of the
|
||||
# measurement types; this will be used to calculate an overall test result
|
||||
# which will be the geomean of all of the top level results of each type
|
||||
vals.append([new_subtest['value'], new_subtest['name']])
|
||||
subtests.append(new_subtest)
|
||||
def output_supporting_data(self, test_names):
|
||||
'''
|
||||
Supporting data was gathered outside of the main raptor test; it has already
|
||||
been summarized, now output it appropriately.
|
||||
|
||||
# for pageload tests, if there are > 1 subtests here, that means there
|
||||
# were multiple measurement types captured in each single pageload; we want
|
||||
# to get the mean of those values and report 1 overall 'suite' value
|
||||
# for the page; all replicates will still be available in the JSON artifact
|
||||
We want to output supporting data in a completely separate perfherder json blob and
|
||||
in a corresponding file artifact. This way supporting data can be ingested as it's own
|
||||
test suite in perfherder and alerted upon if desired. Kept outside of the test results
|
||||
from the actual Raptor test that was ran when the supporting data was gathered.
|
||||
'''
|
||||
if len(self.summarized_supporting_data) == 0:
|
||||
LOG.error("no summarized supporting data found for %s" %
|
||||
', '.join(test_names))
|
||||
return False, 0
|
||||
|
||||
# summarize results to get top overall suite result
|
||||
if len(subtests) > 1:
|
||||
suite['value'] = self.construct_summary(vals,
|
||||
testname=test['name'])
|
||||
total_perfdata = 0
|
||||
for next_data_set in self.summarized_supporting_data:
|
||||
data_type = next_data_set['suites'][0]['type']
|
||||
|
||||
subtests.sort(key=lambda subtest: subtest['name'])
|
||||
if os.environ['MOZ_UPLOAD_DIR']:
|
||||
# i.e. testing/mozharness/build/raptor.json locally; in production it will
|
||||
# be at /tasks/task_*/build/ (where it will be picked up by mozharness later
|
||||
# and made into a tc artifact accessible in treeherder as perfherder-data.json)
|
||||
results_path = os.path.join(os.path.dirname(os.environ['MOZ_UPLOAD_DIR']),
|
||||
'raptor-%s.json' % data_type)
|
||||
else:
|
||||
results_path = os.path.join(os.getcwd(), 'raptor-%s.json' % data_type)
|
||||
|
||||
suites.sort(key=lambda suite: suite['name'])
|
||||
# dump data to raptor-data.json artifact
|
||||
json.dump(next_data_set, open(results_path, 'w'), indent=2, sort_keys=True)
|
||||
|
||||
self.summarized_results = test_results
|
||||
# the output that treeherder expects to find
|
||||
LOG.info("PERFHERDER_DATA: %s" % json.dumps(next_data_set))
|
||||
LOG.info("%s results can also be found locally at: %s" % (data_type, results_path))
|
||||
total_perfdata += 1
|
||||
|
||||
return True, total_perfdata
|
||||
|
||||
@classmethod
|
||||
def v8_Metric(cls, val_list):
|
||||
results = [i for i, j in val_list]
|
||||
score = 100 * filters.geometric_mean(results)
|
||||
return score
|
||||
|
||||
@classmethod
|
||||
def JS_Metric(cls, val_list):
|
||||
"""v8 benchmark score"""
|
||||
results = [i for i, j in val_list]
|
||||
return sum(results)
|
||||
|
||||
@classmethod
|
||||
def speedometer_score(cls, val_list):
|
||||
"""
|
||||
speedometer_score: https://bug-172968-attachments.webkit.org/attachment.cgi?id=319888
|
||||
"""
|
||||
correctionFactor = 3
|
||||
results = [i for i, j in val_list]
|
||||
# speedometer has 16 tests, each of these are made of up 9 subtests
|
||||
# and a sum of the 9 values. We receive 160 values, and want to use
|
||||
# the 16 test values, not the sub test values.
|
||||
if len(results) != 160:
|
||||
raise Exception("Speedometer has 160 subtests, found: %s instead" % len(results))
|
||||
|
||||
results = results[9::10]
|
||||
score = 60 * 1000 / filters.geometric_mean(results) / correctionFactor
|
||||
return score
|
||||
|
||||
@classmethod
|
||||
def benchmark_score(cls, val_list):
|
||||
"""
|
||||
benchmark_score: ares6/jetstream self reported as 'geomean'
|
||||
"""
|
||||
results = [i for i, j in val_list if j == 'geomean']
|
||||
return filters.mean(results)
|
||||
|
||||
@classmethod
|
||||
def webaudio_score(cls, val_list):
|
||||
"""
|
||||
webaudio_score: self reported as 'Geometric Mean'
|
||||
"""
|
||||
results = [i for i, j in val_list if j == 'Geometric Mean']
|
||||
return filters.mean(results)
|
||||
|
||||
@classmethod
|
||||
def unity_webgl_score(cls, val_list):
|
||||
"""
|
||||
unity_webgl_score: self reported as 'Geometric Mean'
|
||||
"""
|
||||
results = [i for i, j in val_list if j == 'Geometric Mean']
|
||||
return filters.mean(results)
|
||||
|
||||
@classmethod
|
||||
def wasm_misc_score(cls, val_list):
|
||||
"""
|
||||
wasm_misc_score: self reported as '__total__'
|
||||
"""
|
||||
results = [i for i, j in val_list if j == '__total__']
|
||||
return filters.mean(results)
|
||||
|
||||
@classmethod
|
||||
def wasm_godot_score(cls, val_list):
|
||||
"""
|
||||
wasm_godot_score: first-interactive mean
|
||||
"""
|
||||
results = [i for i, j in val_list if j == 'first-interactive']
|
||||
return filters.mean(results)
|
||||
|
||||
@classmethod
|
||||
def stylebench_score(cls, val_list):
|
||||
"""
|
||||
stylebench_score: https://bug-172968-attachments.webkit.org/attachment.cgi?id=319888
|
||||
"""
|
||||
correctionFactor = 3
|
||||
results = [i for i, j in val_list]
|
||||
|
||||
# stylebench has 5 tests, each of these are made of up 5 subtests
|
||||
#
|
||||
# * Adding classes.
|
||||
# * Removing classes.
|
||||
# * Mutating attributes.
|
||||
# * Adding leaf elements.
|
||||
# * Removing leaf elements.
|
||||
#
|
||||
# which are made of two subtests each (sync/async) and repeated 5 times
|
||||
# each, thus, the list here looks like:
|
||||
#
|
||||
# [Test name/Adding classes - 0/ Sync; <x>]
|
||||
# [Test name/Adding classes - 0/ Async; <y>]
|
||||
# [Test name/Adding classes - 0; <x> + <y>]
|
||||
# [Test name/Removing classes - 0/ Sync; <x>]
|
||||
# [Test name/Removing classes - 0/ Async; <y>]
|
||||
# [Test name/Removing classes - 0; <x> + <y>]
|
||||
# ...
|
||||
# [Test name/Adding classes - 1 / Sync; <x>]
|
||||
# [Test name/Adding classes - 1 / Async; <y>]
|
||||
# [Test name/Adding classes - 1 ; <x> + <y>]
|
||||
# ...
|
||||
# [Test name/Removing leaf elements - 4; <x> + <y>]
|
||||
# [Test name; <sum>] <- This is what we want.
|
||||
#
|
||||
# So, 5 (subtests) *
|
||||
# 5 (repetitions) *
|
||||
# 3 (entries per repetition (sync/async/sum)) =
|
||||
# 75 entries for test before the sum.
|
||||
#
|
||||
# We receive 76 entries per test, which ads up to 380. We want to use
|
||||
# the 5 test entries, not the rest.
|
||||
if len(results) != 380:
|
||||
raise Exception("StyleBench has 380 entries, found: %s instead" % len(results))
|
||||
|
||||
results = results[75::76]
|
||||
score = 60 * 1000 / filters.geometric_mean(results) / correctionFactor
|
||||
return score
|
||||
|
||||
@classmethod
|
||||
def sunspider_score(cls, val_list):
|
||||
results = [i for i, j in val_list]
|
||||
return sum(results)
|
||||
|
||||
@classmethod
|
||||
def assorted_dom_score(cls, val_list):
|
||||
results = [i for i, j in val_list]
|
||||
return round(filters.geometric_mean(results), 2)
|
||||
|
||||
@classmethod
|
||||
def youtube_playback_performance_score(cls, val_list):
|
||||
"""Calculate percentage of failed tests."""
|
||||
results = [i for i, j in val_list]
|
||||
return round(filters.mean(results), 2)
|
||||
|
||||
@classmethod
|
||||
def supporting_data_total(cls, val_list):
|
||||
results = [i for i, j in val_list]
|
||||
return sum(results)
|
||||
|
||||
@classmethod
|
||||
def supporting_data_average(cls, val_list):
|
||||
results = [i for i, j in val_list]
|
||||
return sum(results)/len(results)
|
||||
|
||||
def construct_summary(self, vals, testname, unit=None):
|
||||
if testname.startswith('raptor-v8_7'):
|
||||
return self.v8_Metric(vals)
|
||||
elif testname.startswith('raptor-kraken'):
|
||||
return self.JS_Metric(vals)
|
||||
elif testname.startswith('raptor-speedometer'):
|
||||
return self.speedometer_score(vals)
|
||||
elif testname.startswith('raptor-stylebench'):
|
||||
return self.stylebench_score(vals)
|
||||
elif testname.startswith('raptor-sunspider'):
|
||||
return self.sunspider_score(vals)
|
||||
elif testname.startswith('raptor-unity-webgl'):
|
||||
return self.unity_webgl_score(vals)
|
||||
elif testname.startswith('raptor-webaudio'):
|
||||
return self.webaudio_score(vals)
|
||||
elif testname.startswith('raptor-assorted-dom'):
|
||||
return self.assorted_dom_score(vals)
|
||||
elif testname.startswith('raptor-wasm-misc'):
|
||||
return self.wasm_misc_score(vals)
|
||||
elif testname.startswith('raptor-wasm-godot'):
|
||||
return self.wasm_godot_score(vals)
|
||||
elif testname.startswith('raptor-youtube-playback'):
|
||||
return self.youtube_playback_performance_score(vals)
|
||||
elif testname.startswith('supporting_data'):
|
||||
if unit and unit in ('%',):
|
||||
return self.supporting_data_average(vals)
|
||||
else:
|
||||
return self.supporting_data_total(vals)
|
||||
elif len(vals) > 1:
|
||||
return round(filters.geometric_mean([i for i, j in vals]), 2)
|
||||
else:
|
||||
return round(filters.mean([i for i, j in vals]), 2)
|
||||
|
|
|
@@ -60,7 +60,7 @@ from manifest import get_raptor_test_list
from memory import generate_android_memory_profile
from performance_tuning import tune_performance
from power import init_android_power_test, finish_android_power_test
from results import RaptorResultsHandler, BrowsertimeResultsHandler
from results import RaptorResultsHandler
from utils import view_gecko_profile, write_yml_file
from cpu import start_android_cpu_profiler
@@ -92,9 +92,7 @@ either Raptor or browsertime."""
gecko_profile=False, gecko_profile_interval=None, gecko_profile_entries=None,
symbols_path=None, host=None, power_test=False, cpu_test=False, memory_test=False,
is_release_build=False, debug_mode=False, post_startup_delay=None,
interrupt_handler=None, e10s=True, enable_webrender=False,
results_handler_class=RaptorResultsHandler,
**kwargs):
interrupt_handler=None, e10s=True, enable_webrender=False, **kwargs):

# Override the magic --host HOST_IP with the value of the environment variable.
if host == 'HOST_IP':
@@ -134,7 +132,6 @@ either Raptor or browsertime."""
self.profile_class = profile_class or app
self.firefox_android_apps = FIREFOX_ANDROID_APPS
self.interrupt_handler = interrupt_handler
self.results_handler = results_handler_class(self.config)

# debug mode is currently only supported when running locally
self.debug_mode = debug_mode if self.config['run_local'] else False
@@ -147,6 +144,9 @@ either Raptor or browsertime."""
LOG.info("main raptor init, config is: %s" % str(self.config))

# setup the control server
self.results_handler = RaptorResultsHandler(self.config)

self.build_browser_profile()

def build_browser_profile(self):
@@ -189,12 +189,6 @@ either Raptor or browsertime."""
def run_test_setup(self, test):
    LOG.info("starting test: %s" % test['name'])

    # if 'alert_on' was provided in the test INI, add to our config for results/output
    self.config['subtest_alert_on'] = test.get('alert_on')

    if test.get("preferences") is not None:
        self.set_browser_test_prefs(test['preferences'])

def run_tests(self, tests, test_names):
    try:
        for test in tests:
@@ -205,7 +199,7 @@ either Raptor or browsertime."""
LOG.error(e)
finally:
    self.run_test_teardown(test)
return self.process_results(tests, test_names)
return self.process_results(test_names)
finally:
    self.clean_up()
@@ -224,7 +218,7 @@ either Raptor or browsertime."""
LOG.info("cleaning up after gecko profiling")
self.gecko_profiler.clean()

def process_results(self, tests, test_names):
def process_results(self, test_names):
    # when running locally output results in build/raptor.json; when running
    # in production output to a local.json to be turned into tc job artifact
    raptor_json_path = os.path.join(self.artifact_dir, 'raptor.json')
@@ -232,11 +226,7 @@ either Raptor or browsertime."""
raptor_json_path = os.path.join(os.getcwd(), 'local.json')

self.config['raptor_json_path'] = raptor_json_path
return self.results_handler.summarize_and_output(self.config, tests, test_names)

@abstractmethod
def set_browser_test_prefs(self):
    pass
return self.results_handler.summarize_and_output(self.config, test_names)

@abstractmethod
def check_for_crashes(self):
@@ -336,12 +326,8 @@ class Browsertime(Perftest):
value = kwargs.pop(key)
setattr(self, key, value)

def klass(config):
    root_results_dir = os.path.join(os.environ.get('MOZ_UPLOAD_DIR', os.getcwd()),
                                    'browsertime-results')
    return BrowsertimeResultsHandler(config, root_results_dir=root_results_dir)
super(Browsertime, self).__init__(*args, **kwargs)

super(Browsertime, self).__init__(*args, results_handler_class=klass, **kwargs)
LOG.info("cwd: '{}'".format(os.getcwd()))

# For debugging.
@@ -356,11 +342,6 @@ class Browsertime(Perftest):
except Exception as e:
    LOG.info("{}: {}".format(k, e))

def set_browser_test_prefs(self, raw_prefs):
    # add test specific preferences
    LOG.info("setting test-specific Firefox preferences")
    self.profile.set_preferences(json.loads(raw_prefs))

def run_test_setup(self, test):
    super(Browsertime, self).run_test_setup(test)
@@ -374,6 +355,12 @@ class Browsertime(Perftest):
if self.browsertime_chromedriver:
    self.driver_paths.extend(['--chrome.chromedriverPath', self.browsertime_chromedriver])

self.resultdir = [
    '--resultDir',
    os.path.join(os.environ.get('MOZ_UPLOAD_DIR', os.getcwd()),
                 'browsertime-results', test['name']),
]

LOG.info('test: {}'.format(test))

def run_test_teardown(self, test):
@@ -389,26 +376,20 @@ class Browsertime(Perftest):
def clean_up(self):
    super(Browsertime, self).clean_up()

@property
def browsertime_args(self):
    binary_path = self.config['binary']
    LOG.info('binary_path: {}'.format(binary_path))

    return ['--browser', 'firefox', '--firefox.binaryPath', binary_path]

def run_test(self, test, timeout):

    self.run_test_setup(test)

    cmd = ([self.browsertime_node, self.browsertime_browsertimejs] +
           self.driver_paths +
           self.browsertime_args +
           ['--skipHar',
            '--video', 'true',
            '--visualMetrics', 'false',
            '-vv',
            '--resultDir', self.results_handler.result_dir_for_test(test),
            '-n', str(test.get('browser_cycles', 1)), test['test_url']])
    cmd = [self.browsertime_node, self.browsertime_browsertimejs, '--browser', 'firefox'] + \
        self.driver_paths + \
        ['--firefox.binaryPath', self.config['binary'],
         '--skipHar',
         '--video', 'true',
         '--visualMetrics', 'false',
         '-vv'] + \
        self.resultdir + \
        ['-n', str(test.get('browser_cycles', 1)),
         test['test_url']]

    # timeout is a single page-load timeout value in ms from the test INI
    # convert timeout to seconds and account for browser cycles
@@ -425,7 +406,6 @@ class Browsertime(Perftest):
LOG.info('timeout (s): {}'.format(timeout))
LOG.info('browsertime cwd: {}'.format(os.getcwd()))
LOG.info('browsertime cmd: {}'.format(cmd))
LOG.info('browsertime_ffmpeg: {}'.format(self.browsertime_ffmpeg))

# browsertime requires ffmpeg on the PATH for `--video=true`.
# It's easier to configure the PATH here than at the TC level.
@@ -439,8 +419,6 @@ class Browsertime(Perftest):
new_path = new_path.encode('utf-8', 'strict')
env['PATH'] = new_path

LOG.info('PATH: {}'.format(env['PATH']))

try:
    proc = mozprocess.ProcessHandler(cmd, env=env)
    proc.run(timeout=timeout,
@@ -448,38 +426,11 @@ class Browsertime(Perftest):
    proc.wait()

except Exception as e:
    LOG.critical("Error while attempting to run browsertime: %s" % str(e))
    raise
    raise Exception("Error while attempting to run browsertime: %s" % str(e))


class BrowsertimeAndroid(Browsertime):

    def __init__(self, app, binary, activity=None, intent=None, **kwargs):
        super(BrowsertimeAndroid, self).__init__(app, binary, profile_class="firefox", **kwargs)

        self.config.update({
            'activity': activity,
            'intent': intent,
        })

    @property
    def browsertime_args(self):
        return ['--browser', 'firefox', '--android',
                # Work around a `selenium-webdriver` issue where Browsertime
                # fails to find a Firefox binary even though we're going to
                # actually do things on an Android device.
                '--firefox.binaryPath', self.browsertime_node,
                '--firefox.android.package', self.config['binary'],
                '--firefox.android.activity', self.config['activity']]

    def build_browser_profile(self):
        super(BrowsertimeAndroid, self).build_browser_profile()

        # Merge in the Android profile.
        path = os.path.join(self.profile_data_dir, 'raptor-android')
        LOG.info("Merging profile: {}".format(path))
        self.profile.merge(path)
        self.profile.set_preferences({'browser.tabs.remote.autostart': self.config['e10s']})

    def process_results(self, test_names):
        # TODO - Bug 1565316 - Process browsertime results and dump out for perfherder
        LOG.info("TODO: Bug 1565316 - Process browsertime results and dump out for perfherder")


class Raptor(Perftest):
@@ -492,14 +443,6 @@ class Raptor(Perftest):
super(Raptor, self).__init__(*args, **kwargs)

# set up the results handler
self.results_handler = RaptorResultsHandler(
    gecko_profile=self.config.get('gecko_profile'),
    power_test=self.config.get('power_test'),
    cpu_test=self.config.get('cpu_test'),
    memory_test=self.config.get('memory_test'),
)

self.start_control_server()

def run_test_setup(self, test):
@@ -525,6 +468,12 @@ class Raptor(Perftest):
self.install_raptor_webext()

if test.get("preferences") is not None:
    self.set_browser_test_prefs(test['preferences'])

# if 'alert_on' was provided in the test INI, add to our config for results/output
self.config['subtest_alert_on'] = test.get('alert_on')

def wait_for_test_finish(self, test, timeout):
    # this is a 'back-stop' i.e. if for some reason Raptor doesn't finish for some
    # serious problem; i.e. the test was unable to send a 'page-timeout' to the control
@@ -728,9 +677,9 @@ class RaptorDesktop(Raptor):
os.mkdir(output_dir)
if not os.path.exists(test_dir):
    os.mkdir(test_dir)
except Exception:
except Exception as e:
    LOG.critical("Could not create directories to store power testing data.")
    raise
    raise e

# Start power measurements with IPG creating a power usage log
# every 30 seconds with 1 data point per second (or a 1000 milli-
@@ -1390,14 +1339,7 @@ def main(args=sys.argv[1:]):
value = outer_kwargs.pop(key)
inner_kwargs[key] = value

if args.app == "firefox":
    klass = Browsertime
elif args.app in CHROMIUM_DISTROS:
    klass = Browsertime
else:
    klass = BrowsertimeAndroid

return klass(*inner_args, **inner_kwargs)
return Browsertime(*inner_args, **inner_kwargs)

raptor = raptor_class(args.app,
                      args.binary,
@@ -9,32 +9,27 @@ from __future__ import absolute_import
import json
import os

from abc import ABCMeta, abstractmethod
from logger.logger import RaptorLogger
from output import RaptorOutput, BrowsertimeOutput
from output import Output

LOG = RaptorLogger(component='perftest-results-handler')
LOG = RaptorLogger(component='raptor-results-handler')


class PerftestResultsHandler(object):
    """Abstract base class to handle perftest results"""
class RaptorResultsHandler():
    """Handle Raptor test results"""

    __metaclass__ = ABCMeta

    def __init__(self, gecko_profile=False, power_test=False,
                 cpu_test=False, memory_test=False, **kwargs):
        self.gecko_profile = gecko_profile
        self.power_test = power_test
        self.cpu_test = cpu_test
        self.memory_test = memory_test
    def __init__(self, config=None):
        self.config = config
        self.results = []
        self.page_timeout_list = []
        self.images = []
        self.supporting_data = None

    @abstractmethod
    def add(self, new_result_json):
        raise NotImplementedError()
        # add to results
        LOG.info("received results in RaptorResultsHandler.add")
        new_result = RaptorTestResult(new_result_json)
        self.results.append(new_result)

    def add_image(self, screenshot, test_name, page_cycle):
        # add to results
@@ -82,7 +77,9 @@ class PerftestResultsHandler(object):
def _get_expected_perfherder(self, output):
    def is_resource_test():
        if self.power_test or self.cpu_test or self.memory_test:
        if self.config.get('power_test', None) or \
                self.config.get('cpu_test', None) or \
                self.config.get('memory_test', None):
            return True
        return False
@@ -109,11 +106,11 @@ class PerftestResultsHandler(object):
# for the regular raptor tests (i.e. speedometer) so we
# expect one per resource-type, starting with 0
expected_perfherder = 0
if self.power_test:
if self.config.get('power_test', None):
    expected_perfherder += 1
if self.memory_test:
if self.config.get('memory_test', None):
    expected_perfherder += 1
if self.cpu_test:
if self.config.get('cpu_test', None):
    expected_perfherder += 1

return expected_perfherder
@@ -152,24 +149,10 @@ class PerftestResultsHandler(object):
    return False
return True

@abstractmethod
def summarize_and_output(self, test_config, tests, test_names):
    raise NotImplementedError()


class RaptorResultsHandler(PerftestResultsHandler):
    """Process Raptor results"""

    def add(self, new_result_json):
        # add to results
        LOG.info("received results in RaptorResultsHandler.add")
        new_result = RaptorTestResult(new_result_json)
        self.results.append(new_result)

    def summarize_and_output(self, test_config, tests, test_names):
    def summarize_and_output(self, test_config, test_names):
        # summarize the result data, write to file and output PERFHERDER_DATA
        LOG.info("summarizing raptor test results")
        output = RaptorOutput(self.results, self.supporting_data, test_config['subtest_alert_on'])
        output = Output(self.results, self.supporting_data, test_config['subtest_alert_on'])
        output.summarize(test_names)
        # that has each browser cycle separate; need to check if there were multiple browser
        # cycles, and if so need to combine results from all cycles into one overall result
@@ -181,7 +164,7 @@ class RaptorResultsHandler(PerftestResultsHandler):
output.summarize_supporting_data()
res, out_sup_perfdata = output.output_supporting_data(test_names)
res, out_perfdata = output.output(test_names)
if not self.gecko_profile:
if not self.config['gecko_profile']:
    # res will remain True if no problems are encountered
    # during schema validation and perferder_data counting
    res = self._validate_treeherder_data(output, out_sup_perfdata + out_perfdata)
@@ -197,221 +180,3 @@ class RaptorTestResult():
|
|||
# convert test result json/dict (from control server) to test result object instance
|
||||
for key, value in test_result_json.iteritems():
|
||||
setattr(self, key, value)
|
||||
|
||||
|
||||
class BrowsertimeResultsHandler(PerftestResultsHandler):
|
||||
"""Process Browsertime results"""
|
||||
def __init__(self, config, root_results_dir=None):
|
||||
super(BrowsertimeResultsHandler, self).__init__(config)
|
||||
self._root_results_dir = root_results_dir
|
||||
|
||||
def result_dir_for_test(self, test):
|
||||
return os.path.join(self._root_results_dir, test['name'])
|
||||
|
||||
def add(self, new_result_json):
|
||||
# not using control server with bt
|
||||
pass
|
||||
|
||||
def parse_browsertime_json(self, raw_btresults):
|
||||
"""
|
||||
Receive a json blob that contains the results direct from the browsertime tool. Parse
|
||||
out the values that we wish to use and add those to our result object. That object will
|
||||
then be further processed in the BrowsertimeOutput class.
|
||||
|
||||
The values that we care about in the browsertime.json are structured as follows.
|
||||
The 'browserScripts' section has one entry for each page-load / browsertime cycle!
|
||||
|
||||
[
|
||||
{
|
||||
"info": {
|
||||
"browsertime": {
|
||||
"version": "4.9.2-android"
|
||||
},
|
||||
"url": "https://www.theguardian.co.uk",
|
||||
},
|
||||
"browserScripts": [
|
||||
{
|
||||
"browser": {
|
||||
"userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:70.0)
|
||||
Gecko/20100101 Firefox/70.0",
|
||||
"windowSize": "1366x694"
|
||||
},
|
||||
"timings": {
|
||||
"firstPaint": 830,
|
||||
"loadEventEnd": 4450,
|
||||
"timeToContentfulPaint": 932,
|
||||
"timeToDomContentFlushed": 864,
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
<repeated for every page-load cycle>
|
||||
},
|
||||
],
|
||||
"statistics": {
|
||||
"timings": {
|
||||
"firstPaint": {
|
||||
"median": 668,
|
||||
"mean": 680,
|
||||
"mdev": 9.6851,
|
||||
"stddev": 48,
|
||||
"min": 616,
|
||||
"p10": 642,
|
||||
"p90": 719,
|
||||
"p99": 830,
|
||||
"max": 830
|
||||
},
|
||||
"loadEventEnd": {
|
||||
"median": 3476,
|
||||
"mean": 3642,
|
||||
"mdev": 111.7028,
|
||||
"stddev": 559,
|
||||
"min": 3220,
|
||||
"p10": 3241,
|
||||
"p90": 4450,
|
||||
"p99": 5818,
|
||||
"max": 5818
|
||||
},
|
||||
"timeToContentfulPaint": {
|
||||
"median": 758,
|
||||
"mean": 769,
|
||||
"mdev": 10.0941,
|
||||
"stddev": 50,
|
||||
"min": 712,
|
||||
"p10": 728,
|
||||
"p90": 810,
|
||||
"p99": 932,
|
||||
"max": 932
|
||||
},
|
||||
"timeToDomContentFlushed": {
|
||||
"median": 670,
|
||||
"mean": 684,
|
||||
"mdev": 11.6768,
|
||||
"stddev": 58,
|
||||
"min": 614,
|
||||
"p10": 632,
|
||||
"p90": 738,
|
||||
"p99": 864,
|
||||
"max": 864
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
"""
|
||||
LOG.info("parsing results from browsertime json")
|
||||
|
||||
# For now, assume that browsertime loads only one site.
|
||||
if len(raw_btresults) != 1:
|
||||
raise ValueError("Browsertime did not measure exactly one site.")
|
||||
(_raw_bt_results,) = raw_btresults
|
||||
|
||||
if not _raw_bt_results['browserScripts']:
|
||||
raise ValueError("Browsertime produced no measurements.")
|
||||
bt_browser = _raw_bt_results['browserScripts'][0]['browser']
|
||||
|
||||
bt_ver = _raw_bt_results['info']['browsertime']['version']
|
||||
bt_url = _raw_bt_results['info']['url'],
|
||||
bt_result = {'bt_ver': bt_ver,
|
||||
'browser': bt_browser,
|
||||
'url': bt_url,
|
||||
'measurements': {},
|
||||
'statistics': {}}
|
||||
|
||||
# bt to raptor names
|
||||
conversion = (('fnbpaint', 'firstPaint'),
|
||||
('fcp', 'timeToContentfulPaint'),
|
||||
('dcf', 'timeToDomContentFlushed'),
|
||||
('loadtime', 'loadEventEnd'))
|
||||
|
||||
# extracting values from browserScripts and statistics
|
||||
for bt, raptor in conversion:
|
||||
# XXX looping several times in the list, could do better
|
||||
bt_result['measurements'][bt] = [cycle['timings'][raptor] for cycle in
|
||||
_raw_bt_results['browserScripts']]
|
||||
|
||||
# let's add the browsertime statistics; we'll use those for overall values instead
|
||||
# of calculating our own based on the replicates
|
||||
bt_result['statistics'][bt] = _raw_bt_results['statistics']['timings'][raptor]
|
||||
|
||||
return bt_result
|
||||
|
||||
def summarize_and_output(self, test_config, tests, test_names):
|
||||
"""
|
||||
Retrieve, process, and output the browsertime test results. Currently supports page-load
|
||||
type tests only.
|
||||
|
||||
The Raptor framework either ran a single page-load test (one URL) - or - an entire suite
|
||||
of page-load tests (multiple test URLs). Regardless, every test URL measured will
|
||||
have its own 'browsertime.json' results file, located in a sub-folder names after the
|
||||
Raptor test name, i.e.:
|
||||
|
||||
browsertime-results/
|
||||
raptor-tp6-amazon-firefox
|
||||
browsertime.json
|
||||
raptor-tp6-facebook-firefox
|
||||
browsertime.json
|
||||
raptor-tp6-google-firefox
|
||||
browsertime.json
|
||||
raptor-tp6-youtube-firefox
|
||||
browsertime.json
|
||||
|
||||
For each test URL that was measured, find the resulting 'browsertime.json' file, and pull
|
||||
out the values that we care about.
|
||||
"""
|
||||
# summarize the browsertime result data, write to file and output PERFHERDER_DATA
|
||||
LOG.info("retrieving browsertime test results")
|
||||
|
||||
for test in tests:
|
||||
bt_res_json = os.path.join(self.result_dir_for_test(test), 'browsertime.json')
|
||||
if os.path.exists(bt_res_json):
|
||||
LOG.info("found browsertime results at %s" % bt_res_json)
|
||||
else:
|
||||
LOG.critical("unable to find browsertime results at %s" % bt_res_json)
|
||||
return False
|
||||
|
||||
try:
|
||||
with open(bt_res_json, 'r') as f:
|
||||
raw_btresults = json.load(f)
|
||||
except Exception as e:
|
||||
LOG.error("Exception reading %s" % bt_res_json)
|
||||
# XXX this should be replaced by a traceback call
|
||||
LOG.error("Exception: %s %s" % (type(e).__name__, str(e)))
|
||||
raise
|
||||
|
||||
new_result = self.parse_browsertime_json(raw_btresults)
|
||||
|
||||
# add additional info not from the browsertime json
|
||||
for field in ('name', 'unit', 'lower_is_better',
|
||||
'alert_threshold', 'cold'):
|
||||
new_result[field] = test[field]
|
||||
|
||||
# Differentiate Raptor `pageload` tests from `browsertime-pageload`
|
||||
# tests while we compare and contrast.
|
||||
new_result['type'] = "browsertime-pageload"
|
||||
|
||||
# All Browsertime measurements are elapsed times in milliseconds.
|
||||
new_result['subtest_lower_is_better'] = True
|
||||
new_result['subtest_unit'] = 'ms'
|
||||
LOG.info("parsed new result: %s" % str(new_result))
|
||||
|
||||
# `extra_options` will be populated with Gecko profiling flags in
|
||||
# the future.
|
||||
new_result['extra_options'] = []
|
||||
|
||||
self.results.append(new_result)
|
||||
|
||||
# now have all results gathered from all browsertime test URLs; format them for output
|
||||
output = BrowsertimeOutput(self.results,
|
||||
self.supporting_data,
|
||||
test_config['subtest_alert_on'])
|
||||
|
||||
output.summarize(test_names)
|
||||
res, out_perfdata = output.output(test_names)
|
||||
|
||||
if not self.gecko_profile:
|
||||
# res will remain True if no problems are encountered
|
||||
# during schema validation and perferder_data counting
|
||||
res = self._validate_treeherder_data(output, out_perfdata)
|
||||
|
||||
return res