Bug 1272176 - Emit Perfherder data for system resource utilization; r=wlach

This commit teaches the resource monitor in mozharness to emit
Perfherder data for system metrics and step times. This will
allow us to see when the timing or resource characteristics
of jobs in automation change.

The recorded data includes overall CPU percent usage and I/O.
Each step has its time and CPU percent recorded. There is
certainly more data we could record. However, the immediate
goal of this change is to see if the data provides any benefit.
I'd rather start small and expand reporting once value from
this data is proved.

The wonkiest part of this patch is likely the mechanism to
define the Perfherder "test" names. We don't appear to have
an identifier in mozharness suitable for distinguishing
between job types. e.g. the "desktop_unittest.py" script is
responsible for running a few dozen jobs. So we invent code
for creating an identifier from the script config options.
I /think/ Treeherder will automatically assign the
project/branch, platform, and build type, which is why these
aren't included in the identifier.

MozReview-Commit-ID: HjhtXfxOvzJ

--HG--
extra : rebase_source : a3f0f2de4a091cde10c5a6815f1b4646bb5dc2f2
This commit is contained in:
Gregory Szorc 2016-05-12 13:55:35 -07:00
parent ba1cccb1c4
commit dc56a5c952
2 changed files with 111 additions and 0 deletions

View File

@@ -14,6 +14,7 @@ import time
import json
import traceback
import mozharness
from mozharness.base.script import (
PostScriptAction,
PostScriptRun,
@@ -24,6 +25,11 @@ from mozharness.base.errors import VirtualenvErrorList
from mozharness.base.log import WARNING, FATAL
from mozharness.mozilla.proxxy import Proxxy
external_tools_path = os.path.join(
os.path.abspath(os.path.dirname(os.path.dirname(mozharness.__file__))),
'external_tools',
)
def get_tlsv1_post():
# Monkeypatch to work around SSL errors in non-bleeding-edge Python.
# Taken from https://lukasa.co.uk/2013/01/Choosing_SSL_Version_In_Requests/
@@ -458,8 +464,15 @@ class ResourceMonitoringMixin(object):
optional=True)
self.register_virtualenv_module('mozsystemmonitor==0.3',
method='pip', optional=True)
self.register_virtualenv_module('jsonschema==2.5.1',
method='pip')
self._resource_monitor = None
# 2-tuple of (name, options) to assign Perfherder resource monitor
# metrics to. This needs to be assigned by a script in order for
# Perfherder metrics to be reported.
self.resource_monitor_perfherder_id = None
@PostScriptAction('create-virtualenv')
def _start_resource_monitoring(self, action, success=None):
self.activate_virtualenv()
@@ -522,6 +535,9 @@ class ResourceMonitoringMixin(object):
traceback.format_exc())
def _log_resource_usage(self):
# Delay import because not available until virtualenv is populated.
import jsonschema
rm = self._resource_monitor
if rm.start_time is None:
@@ -565,6 +581,72 @@ class ResourceMonitoringMixin(object):
cpu_percent, cpu_times, io, (swap_in, swap_out) = resources(None)
duration = rm.end_time - rm.start_time
# Write out Perfherder data if configured.
if self.resource_monitor_perfherder_id:
perfherder_name, perfherder_options = self.resource_monitor_perfherder_id
suites = []
overall = []
if cpu_percent:
overall.append({
'name': 'cpu_percent',
'value': cpu_percent,
})
overall.extend([
{'name': 'io_write_bytes', 'value': io.write_bytes},
{'name': 'io_read_bytes', 'value': io.read_bytes},
{'name': 'io_write_time', 'value': io.write_time},
{'name': 'io_read_time', 'value': io.read_time},
])
suites.append({
'name': '%s.overall' % perfherder_name,
'extraOptions': perfherder_options,
'subtests': overall,
})
for phase in rm.phases.keys():
phase_duration = rm.phases[phase][1] - rm.phases[phase][0]
subtests = [
{
'name': 'time',
'value': phase_duration,
},
{
'name': 'cpu_percent',
'value': rm.aggregate_cpu_percent(phase=phase,
per_cpu=False),
}
]
# We don't report I/O during each step because measured I/O
# is system I/O and that I/O can be delayed (e.g. writes will
# buffer before being flushed and recorded in our metrics).
suites.append({
'name': '%s.%s' % (perfherder_name, phase),
'subtests': subtests,
})
data = {
'framework': {'name': 'job_resource_usage'},
'suites': suites,
}
try:
schema_path = os.path.join(external_tools_path,
'performance-artifact-schema.json')
with open(schema_path, 'rb') as fh:
schema = json.load(fh)
self.info('Validating Perfherder data against %s' % schema_path)
jsonschema.validate(data, schema)
except Exception:
self.exception('error while validating Perfherder data; ignoring')
else:
self.info('PERFHERDER_DATA: %s' % json.dumps(data))
log_usage('Total resource usage', duration, cpu_percent, cpu_times, io)
# Print special messages so usage shows up in Treeherder.

View File

@@ -171,6 +171,35 @@ class DesktopUnittest(TestingMixin, MercurialScript, BlobUploadMixin, MozbaseMix
self.abs_app_dir = None
self.abs_res_dir = None
# Construct an identifier to be used to identify Perfherder data
# for resource monitoring recording. This attempts to uniquely
# identify this test invocation configuration.
perfherder_parts = []
perfherder_options = []
suites = (
('specified_mochitest_suites', 'mochitest'),
('specified_reftest_suites', 'reftest'),
('specified_xpcshell_suites', 'xpcshell'),
('specified_cppunittest_suites', 'cppunit'),
('specified_gtest_suites', 'gtest'),
('specified_jittest_suites', 'jittest'),
('specified_mozbase_suites', 'mozbase'),
('specified_mozmill_suites', 'mozmill'),
)
for s, prefix in suites:
if s in c:
perfherder_parts.append(prefix)
perfherder_parts.extend(c[s])
if 'this_chunk' in c:
perfherder_parts.append(c['this_chunk'])
if c['e10s']:
perfherder_options.append('e10s')
self.resource_monitor_perfherder_id = ('.'.join(perfherder_parts),
perfherder_options)
# helper methods {{{2
def _pre_config_lock(self, rw_config):
super(DesktopUnittest, self)._pre_config_lock(rw_config)