Bug 1834866 - change isolate failures to confirm failures. r=taskgraph-reviewers,gbrown,bhearsum

Differential Revision: https://phabricator.services.mozilla.com/D178989
Joel Maher 2023-05-29 15:50:16 +00:00
parent f70ce93a5c
commit eaedb702ec


@@ -5,7 +5,6 @@
 import json
 import logging
 import os
-import re
 
 from taskgraph.util.parameterization import resolve_task_references
@@ -74,6 +73,7 @@ def get_failures(task_id):
             break
         return test_path
 
+    # collect dirs that don't have a specific manifest
     dirs = set()
     tests = set()
     artifacts = list_artifacts(task_id)
@@ -85,44 +85,41 @@ def get_failures(task_id):
         if not stream:
             continue
 
-        # The number of tasks created is determined by the
-        # `times` value and the number of distinct tests and
-        # directories as: times * (1 + len(tests) + len(dirs)).
-        # Since the maximum value of `times` specifiable in the
-        # Treeherder UI is 100, the number of tasks created can
-        # reach a very large value depending on the number of
-        # unique tests. During testing, it was found that 10
-        # distinct tests were sufficient to cause the action task
-        # to exceed the maxRunTime of 1800 seconds resulting in it
-        # being aborted. We limit the number of distinct tests
-        # and thereby the number of distinct test directories to a
-        # maximum of 5 to keep the action task from timing out.
-
         # We handle the stream as raw bytes because it may contain invalid
         # UTF-8 characters in portions other than those containing the error
         # messages we're looking for.
         for line in stream.read().split(b"\n"):
-            test_path = munge_test_path(line.strip())
             if not line.strip():
                 continue
-            if test_path:
+            l = json.loads(line)
+            if "group_results" in l.keys() and l["status"] != "OK":
+                dirs.add(l["group"])
+            elif "test" in l.keys():
+                test_path = munge_test_path(line.strip())
                 tests.add(test_path.decode("utf-8"))
-                test_dir = os.path.dirname(test_path)
-                if test_dir:
-                    dirs.add(test_dir.decode("utf-8"))
-            if len(tests) > 4:
-                break
+                # only run the failing test not both test + dir
+                if l["group"] in dirs:
+                    dirs.remove(l["group"])
+
+    # turn group into dir by stripping off leafname
+    dirs = set(["/".join(d.split("/")[0:-1]) for d in dirs])
 
     return {"dirs": sorted(dirs), "tests": sorted(tests)}
 
-def create_isolate_failure_tasks(task_definition, failures, level, times):
+def create_confirm_failure_tasks(task_definition, failures, level):
     """
     Create tasks to re-run the original task plus tasks to test
     each failing test directory and individual path.
     """
-    logger.info(f"Isolate task:\n{json.dumps(task_definition, indent=2)}")
+    logger.info(f"Confirm Failures task:\n{json.dumps(task_definition, indent=2)}")
 
     # Operate on a copy of the original task_definition
     task_definition = copy_task(task_definition)
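
For readers skimming the new get_failures() logic above: each errorsummary.log line is now parsed as JSON and classified as either a failed group (manifest) or a failed test, and a group is dropped once a specific failing test within it is known. Below is a minimal, self-contained sketch of that classification under stated assumptions: the sample records are invented, the real errorsummary field set may differ, and munge_test_path() is elided by using the test path verbatim.

    import json

    # Invented records in the spirit of errorsummary.log; field names beyond
    # "group", "test", "status" and "group_results" are assumptions.
    lines = [
        b'{"group_results": "ERROR", "group": "dom/tests/mochitest/general.ini", "status": "ERROR"}',
        b'{"test": "dom/tests/mochitest/general/test_focus.html", "group": "dom/tests/mochitest/general.ini", "status": "FAIL"}',
        b"",
    ]

    dirs, tests = set(), set()
    for line in lines:
        if not line.strip():
            continue
        l = json.loads(line)
        if "group_results" in l.keys() and l["status"] != "OK":
            dirs.add(l["group"])      # a whole group/manifest failed
        elif "test" in l.keys():
            tests.add(l["test"])      # a single test failed
            if l["group"] in dirs:    # prefer the specific test over its group
                dirs.remove(l["group"])

    # turn each remaining group (a manifest path) into a directory
    dirs = set(["/".join(d.split("/")[0:-1]) for d in dirs])
    print(sorted(dirs), sorted(tests))
    # -> [] ['dom/tests/mochitest/general/test_focus.html']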
@@ -153,12 +150,9 @@ def create_isolate_failure_tasks(task_definition, failures, level, times):
     command = copy_task(task_definition["payload"]["command"])
 
-    th_dict["groupSymbol"] = th_dict["groupSymbol"] + "-I"
+    th_dict["groupSymbol"] = th_dict["groupSymbol"] + "-cf"
     th_dict["tier"] = 3
 
-    for i in range(times):
-        create_task_from_def(task_definition, level)
-
     if repeatable_task:
         task_definition["payload"]["maxRunTime"] = 3600 * 3
@@ -175,7 +169,7 @@ def create_isolate_failure_tasks(task_definition, failures, level, times):
         repeat_args = ["--repeat=19"] if repeatable_task else []
     else:
         logger.error(
-            "create_isolate_failure_tasks: Unknown failure_group {}".format(
+            "create_confirm_failure_tasks: Unknown failure_group {}".format(
                 failure_group
             )
         )
@@ -203,33 +197,23 @@ def create_isolate_failure_tasks(task_definition, failures, level, times):
                     failure_path, task_definition["payload"]["command"]
                 )
             )
 
-            for i in range(times):
-                create_task_from_def(task_definition, level)
+            create_task_from_def(task_definition, level)
 
 
 @register_callback_action(
-    name="isolate-test-failures",
-    title="Isolate test failures in job",
-    symbol="it",
-    description="Re-run Tests for original manifest, directories and tests for failing tests.",
+    name="confirm-failures",
+    title="Confirm failures in job",
+    symbol="cf",
+    description="Re-run Tests for original manifest, directories or tests for failing tests.",
     order=150,
     context=[{"kind": "test"}],
     schema={
         "type": "object",
-        "properties": {
-            "times": {
-                "type": "integer",
-                "default": 1,
-                "minimum": 1,
-                "maximum": 100,
-                "title": "Times",
-                "description": "How many times to run each task.",
-            }
-        },
+        "properties": {},
         "additionalProperties": False,
     },
 )
-def isolate_test_failures(parameters, graph_config, input, task_group_id, task_id):
+def confirm_failures(parameters, graph_config, input, task_group_id, task_id):
     task = get_task_definition(task_id)
     decision_task_id, full_task_graph, label_to_taskid = fetch_graph_and_labels(
         parameters, graph_config
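
The schema change is the user-visible half of the rename: with "properties": {} and "additionalProperties": False, the only valid action input is an empty object, so the old Times field no longer appears when triggering the action from Treeherder. A minimal sketch of that validation behavior using the stock jsonschema package (not a taskgraph-specific API):

    import jsonschema

    schema = {
        "type": "object",
        "properties": {},
        "additionalProperties": False,
    }

    jsonschema.validate({}, schema)  # the empty object is the only valid input
    try:
        jsonschema.validate({"times": 5}, schema)
    except jsonschema.ValidationError as err:
        print(err.message)  # 'times' is rejected as an additional property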
@@ -249,7 +233,5 @@ def isolate_test_failures(parameters, graph_config, input, task_group_id, task_id):
     task_definition.setdefault("dependencies", []).extend(dependencies.values())
 
     failures = get_failures(task_id)
-    logger.info("isolate_test_failures: %s" % failures)
-    create_isolate_failure_tasks(
-        task_definition, failures, parameters["level"], input["times"]
-    )
+    logger.info("confirm_failures: %s" % failures)
+    create_confirm_failure_tasks(task_definition, failures, parameters["level"])