Bug 1363104 - Fix perf-reftest to compare perf numbers of basic vs ref pages; r=jmaher

MozReview-Commit-ID: JMtaa9I0atY

--HG--
extra : rebase_source : c0bee15606940ab8fe0df544a8fc6b24c988803f
Rob Wood 2017-06-29 18:18:45 -04:00
parent 7efba61aa8
commit 747ea49209
6 changed files with 78 additions and 24 deletions

View File

@@ -82,10 +82,10 @@
         "tests": ["tsvgx", "tsvgr_opacity", "tart", "tscrollx", "cart", "tsvg_static"]
     },
     "perf-reftest": {
-        "tests": ["bloom_basic", "bloom_basic_ref"]
+        "tests": ["bloom_basic"]
     },
     "perf-reftest-e10s": {
-        "tests": ["bloom_basic", "bloom_basic_ref"]
+        "tests": ["bloom_basic"]
     },
     "tp5o": {
         "tests": ["tp5o"],

View File

@@ -42,6 +42,7 @@ DEFAULTS = dict(
     firstpaint=False,
     userready=False,
     testeventmap=[],
+    base_vs_ref=False,
     tpdisable_e10s=False,
     tpnoisy=True,
     tppagecycles=1,
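
Worth noting what the default buys: any test that never mentions base_vs_ref keeps the normal single-result path, and only tests that opt in (bloom_basic, below) take the comparison path. A minimal sketch of that fallback behaviour, where apply_defaults and the test dicts are hypothetical stand-ins for the actual Talos config merging:

# Hypothetical illustration of DEFAULTS-style merging; apply_defaults is
# not part of Talos, it just mimics "missing keys fall back to defaults".
DEFAULTS = dict(base_vs_ref=False, tpcycles=1, tppagecycles=25)

def apply_defaults(test_config, defaults=DEFAULTS):
    merged = dict(defaults)      # start from the suite-wide defaults
    merged.update(test_config)   # explicit per-test settings win
    return merged

assert apply_defaults({'name': 'a11yr'})['base_vs_ref'] is False
assert apply_defaults({'name': 'bloom_basic', 'base_vs_ref': True})['base_vs_ref'] is True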

View File

@@ -54,8 +54,7 @@ class Output(object):
             vals = []
             replicates = {}

-            # TODO: counters!!!! we don't have any, but they suffer the
-            # same
+            # TODO: counters!!!! we don't have any, but they suffer the same
             for result in test.results:
                 # XXX this will not work for manifests which list
                 # the same page name twice. It also ignores cycles
@@ -88,6 +87,14 @@
                     'value': val['filtered'],
                     'replicates': replicates[page],
                 }
+                # if results are from a comparison test, i.e. perf-reftest, they will
+                # also contain replicates for 'base' and 'reference'; we keep those for
+                # reference, since the actual results were calculated as their difference
+                base_runs = result.results[0].get('base_runs', None)
+                ref_runs = result.results[0].get('ref_runs', None)
+                if base_runs and ref_runs:
+                    subtest['base_replicates'] = base_runs
+                    subtest['ref_replicates'] = ref_runs
                 subtests.append(subtest)
                 if test.test_config.get('lower_is_better') is not None:
                     subtest['lowerIsBetter'] = \
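
The effect on the uploaded payload: a comparison subtest keeps the raw per-cycle runs from both pages alongside the filtered difference. Roughly, with shape and values borrowed from the make_comparison_result docstring further down (the '...' entries stand in for the remaining replicates):

# Illustrative only - the approximate shape of one comparison subtest entry
subtest = {
    'name': '',                        # comparison results use an empty page name
    'value': 16.705,                   # filtered difference of base vs ref
    'replicates': [16.705, ...],       # per-cycle |ref - base| differences
    'base_replicates': [586.52, ...],  # raw runs from the base page
    'ref_replicates': [603.225, ...],  # raw runs from the reference page
}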

View File

@@ -93,7 +93,6 @@ def run_tests(config, browser_config):
     tests = useBaseTestDefaults(config.get('basetest', {}), tests)
     paths = ['profile_path', 'tpmanifest', 'extensions', 'setup', 'cleanup']
     for test in tests:
         # Check for profile_path, tpmanifest and interpolate based on Talos
         # root https://bugzilla.mozilla.org/show_bug.cgi?id=727711
         # Build command line from config
@@ -255,10 +254,18 @@
                 # now we have three separate test results, store them
                 for test_result in separate_results_list:
                     talos_results.add(test_result)
+            # some tests, like bloom_basic, run two separate tests and then compare those values;
+            # we want the results in perfherder to be only the actual difference between the two,
+            # and we store the base and reference test replicates in results.json for upload
+            elif test.get('base_vs_ref', False):
+                # run the test; results will be reported for each page, like two tests in the suite
+                base_and_reference_results = mytest.runTest(browser_config, test)
+                # now compare the two tests, and create a new test object for the comparison
+                talos_results.add(make_comparison_result(base_and_reference_results))
             else:
                 # just expecting a regular test - one result value per iteration
                 talos_results.add(mytest.runTest(browser_config, test))
             LOG.test_end(testname, status='OK')
     except TalosRegression as exc:
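
Taken together, the loop above now dispatches each test one of three ways. A condensed sketch of that control flow; store_results is a hypothetical wrapper (the real code inlines this inside run_tests), and the two helpers are the functions defined in this file:

def store_results(mytest, test, browser_config, talos_results):
    if test.get('testeventmap', []):
        # one run fans out into several synthetic test results
        results = mytest.runTest(browser_config, test)
        for test_result in convert_to_separate_test_results(results, test['testeventmap']):
            talos_results.add(test_result)
    elif test.get('base_vs_ref', False):
        # one run covering two pages collapses into a single comparison result
        talos_results.add(make_comparison_result(mytest.runTest(browser_config, test)))
    else:
        # the common case: one result per test
        talos_results.add(mytest.runTest(browser_config, test))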
@@ -298,6 +305,56 @@
     return 0


+def make_comparison_result(base_and_reference_results):
+    ''' Receive a test result object meant to be used as a base vs reference test. The result
+    object will have one test with two subtests; instead of traditional subtests we want to
+    treat them as separate tests, comparing them against each other and reporting the
+    comparison results.
+
+    Results with multiple pages used as subtests would normally look like this, with the
+    overall result value being the mean of the pages/subtests:
+
+    PERFHERDER_DATA: {"framework": {"name": "talos"}, "suites": [{"extraOptions": ["e10s"],
+    "name": "bloom_basic", "lowerIsBetter": true, "alertThreshold": 5.0, "value": 594.81,
+    "subtests": [{"name": ".html", "lowerIsBetter": true, "alertThreshold": 5.0, "replicates":
+    [586.52, ...], "value": 586.52, "unit": "ms"}, {"name": "-ref.html", "lowerIsBetter": true,
+    "alertThreshold": 5.0, "replicates": [603.225, ...], "value": 603.225, "unit": "ms"}]}]}
+
+    We want to compare the subtests against each other (base vs ref) and create a new,
+    single-test results object with the comparison results, which will look like traditional
+    single-test results, like this:
+
+    PERFHERDER_DATA: {"framework": {"name": "talos"}, "suites": [{"lowerIsBetter": true,
+    "subtests": [{"name": "", "lowerIsBetter": true, "alertThreshold": 5.0, "replicates":
+    [16.705, ...], "value": 16.705, "unit": "ms"}], "extraOptions": ["e10s"], "name":
+    "bloom_basic", "alertThreshold": 5.0}]}
+    '''
+    # separate the 'base' and 'reference' result run values
+    base_result_runs = base_and_reference_results.results[0].results[0]['runs']
+    ref_result_runs = base_and_reference_results.results[0].results[1]['runs']
+
+    # create a new results object for the comparison result; keep replicates from both pages
+    comparison_result = copy.deepcopy(base_and_reference_results)
+
+    # remove the original results from our copy, as they will be replaced by one comparison result
+    comparison_result.results[0].results = []
+
+    # populate our new comparison result with the 'base' and 'ref' replicates
+    comparison_result.results[0].results.append({'index': 0,
+                                                 'runs': [],
+                                                 'page': '',
+                                                 'base_runs': base_result_runs,
+                                                 'ref_runs': ref_result_runs})
+
+    # now step through each result, compare 'base' vs 'ref', and store the difference in 'runs'
+    _index = 0
+    for next_ref in comparison_result.results[0].results[0]['ref_runs']:
+        diff = abs(next_ref - comparison_result.results[0].results[0]['base_runs'][_index])
+        comparison_result.results[0].results[0]['runs'].append(round(diff, 3))
+        _index += 1
+    return comparison_result
+
+
 def convert_to_separate_test_results(multi_value_result, test_event_map):
     ''' Receive a test result that actually contains multiple values in a single iteration,
     and parse it out in order to 'fake' three separate test results.
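
The arithmetic at the core of make_comparison_result is easy to verify in isolation. A self-contained sketch (compare_runs is an invented name, not part of Talos):

def compare_runs(base_runs, ref_runs):
    # per-cycle absolute differences, rounded to 3 places, exactly as the
    # loop in make_comparison_result computes them
    return [round(abs(ref - base), 3) for base, ref in zip(base_runs, ref_runs)]

print(compare_runs([586.52], [603.225]))  # -> [16.705], the docstring's example value

Note that abs() discards the sign, so a shift in either direction between the base and reference pages surfaces as a positive difference.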

View File

@@ -107,6 +107,7 @@ class TsBase(Test):
         'firstpaint',
         'userready',
         'testeventmap',
+        'base_vs_ref',
         'extensions',
         'filters',
         'setup',
@@ -251,7 +252,7 @@
     timeout = None
     keys = ['tpmanifest', 'tpcycles', 'tppagecycles', 'tprender', 'tpchrome',
             'tpmozafterpaint', 'tploadnocache', 'firstpaint', 'userready',
-            'testeventmap', 'rss', 'mainthread', 'resolution', 'cycles',
+            'testeventmap', 'base_vs_ref', 'rss', 'mainthread', 'resolution', 'cycles',
             'gecko_profile', 'gecko_profile_interval', 'gecko_profile_entries',
             'tptimeout', 'win_counters', 'w7_counters', 'linux_counters', 'mac_counters',
             'tpscrolltest', 'xperf_counters', 'timeout', 'shutdown', 'responsiveness',
@@ -801,8 +802,9 @@ class a11yr(PageloaderTest):
 @register_test()
 class bloom_basic(PageloaderTest):
     """
-    Stylo bloom_basic test
+    Stylo bloom_basic: runs bloom_basic and bloom_basic_ref and reports the difference
     """
+    base_vs_ref = True  # compare the two test pages with each other and report the comparison
     tpmanifest = '${talos}/tests/perf-reftest/bloom_basic.manifest'
     tpcycles = 1
     tppagecycles = 25
@@ -814,22 +816,6 @@
     alert_threshold = 5.0


-@register_test()
-class bloom_basic_ref(PageloaderTest):
-    """
-    Stylo bloom_basic_ref test
-    """
-    tpmanifest = '${talos}/tests/perf-reftest/bloom_basic_ref.manifest'
-    tpcycles = 1
-    tppagecycles = 25
-    gecko_profile_interval = 1
-    gecko_profile_entries = 2000000
-    filters = filter.ignore_first.prepare(5) + filter.median.prepare()
-    unit = 'ms'
-    lower_is_better = True
-    alert_threshold = 5.0
-
-
 @register_test()
 class quantum_pageload_google(QuantumPageloadTest):
     """

View File

@@ -1 +1,4 @@
+# base_vs_ref is set in test.py for this test, so each of these pages is run as a separate
+# test; the two are then compared against each other, and the reported results are the comparison
 % http://localhost/tests/perf-reftest/bloom-basic.html
+% http://localhost/tests/perf-reftest/bloom-basic-ref.html
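
One ordering detail follows from the code above: make_comparison_result reads results[0] as the base runs and results[1] as the reference runs, so the base page must come first in the manifest. A small sketch of that pairing (parse_manifest is invented for illustration):

def parse_manifest(lines):
    # keep only the '%'-prefixed page lines, in file order
    pages = [line.split('%', 1)[1].strip()
             for line in lines if line.lstrip().startswith('%')]
    base, ref = pages  # order matters: base page first, then its -ref page
    return base, ref

print(parse_manifest([
    '% http://localhost/tests/perf-reftest/bloom-basic.html',
    '% http://localhost/tests/perf-reftest/bloom-basic-ref.html',
]))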