Bug 1834866 - change isolate failures to confirm failures. r=taskgraph-reviewers,gbrown,bhearsum

Differential Revision: https://phabricator.services.mozilla.com/D178989
Joel Maher 2023-05-29 15:50:16 +00:00
parent f70ce93a5c
commit eaedb702ec


@@ -5,7 +5,6 @@
 import json
 import logging
 import os
-import re
 
 from taskgraph.util.parameterization import resolve_task_references
@@ -74,6 +73,7 @@ def get_failures(task_id):
             break
         return test_path
 
+    # collect dirs that don't have a specific manifest
     dirs = set()
     tests = set()
     artifacts = list_artifacts(task_id)
@@ -85,44 +85,41 @@ def get_failures(task_id):
         if not stream:
             continue
 
-        # The number of tasks created is determined by the
-        # `times` value and the number of distinct tests and
-        # directories as: times * (1 + len(tests) + len(dirs)).
-        # Since the maximum value of `times` specifiable in the
-        # Treeherder UI is 100, the number of tasks created can
-        # reach a very large value depending on the number of
-        # unique tests. During testing, it was found that 10
-        # distinct tests were sufficient to cause the action task
-        # to exceed the maxRunTime of 1800 seconds resulting in it
-        # being aborted. We limit the number of distinct tests
-        # and thereby the number of distinct test directories to a
-        # maximum of 5 to keep the action task from timing out.
-
         # We handle the stream as raw bytes because it may contain invalid
         # UTF-8 characters in portions other than those containing the error
         # messages we're looking for.
         for line in stream.read().split(b"\n"):
-            test_path = munge_test_path(line.strip())
             if not line.strip():
                 continue
-            if test_path:
+            l = json.loads(line)
+            if "group_results" in l.keys() and l["status"] != "OK":
+                dirs.add(l["group"])
+            elif "test" in l.keys():
+                test_path = munge_test_path(line.strip())
                 tests.add(test_path.decode("utf-8"))
-                test_dir = os.path.dirname(test_path)
-                if test_dir:
-                    dirs.add(test_dir.decode("utf-8"))
-            if len(tests) > 4:
-                break
+                # only run the failing test not both test + dir
+                if l["group"] in dirs:
+                    dirs.remove(l["group"])
+
+    # turn group into dir by stripping off leafname
+    dirs = set(["/".join(d.split("/")[0:-1]) for d in dirs])
 
     return {"dirs": sorted(dirs), "tests": sorted(tests)}
 
-def create_isolate_failure_tasks(task_definition, failures, level, times):
+def create_confirm_failure_tasks(task_definition, failures, level):
     """
     Create tasks to re-run the original task plus tasks to test
     each failing test directory and individual path.
     """
-    logger.info(f"Isolate task:\n{json.dumps(task_definition, indent=2)}")
+    logger.info(f"Confirm Failures task:\n{json.dumps(task_definition, indent=2)}")
 
     # Operate on a copy of the original task_definition
     task_definition = copy_task(task_definition)
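
For readers skimming the new get_failures() logic above: each errorsummary.log line is now parsed as JSON and classified as either a failed group (manifest) or a failed test, and a group is dropped once a specific failing test within it is known. Below is a minimal, self-contained sketch of that classification under stated assumptions: the sample records are invented, the real errorsummary field set may differ, and munge_test_path() is elided by using the test path verbatim.

    import json

    # Invented records in the spirit of errorsummary.log; field names beyond
    # "group", "test", "status" and "group_results" are assumptions.
    lines = [
        b'{"group_results": "ERROR", "group": "dom/tests/mochitest/general.ini", "status": "ERROR"}',
        b'{"test": "dom/tests/mochitest/general/test_focus.html", "group": "dom/tests/mochitest/general.ini", "status": "FAIL"}',
        b"",
    ]

    dirs, tests = set(), set()
    for line in lines:
        if not line.strip():
            continue
        l = json.loads(line)
        if "group_results" in l.keys() and l["status"] != "OK":
            dirs.add(l["group"])      # a whole group/manifest failed
        elif "test" in l.keys():
            tests.add(l["test"])      # a single test failed
            if l["group"] in dirs:    # prefer the specific test over its group
                dirs.remove(l["group"])

    # turn each remaining group (a manifest path) into a directory
    dirs = set(["/".join(d.split("/")[0:-1]) for d in dirs])
    print(sorted(dirs), sorted(tests))
    # -> [] ['dom/tests/mochitest/general/test_focus.html']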
@@ -153,12 +150,9 @@ def create_isolate_failure_tasks(task_definition, failures, level, times):
     command = copy_task(task_definition["payload"]["command"])
 
-    th_dict["groupSymbol"] = th_dict["groupSymbol"] + "-I"
+    th_dict["groupSymbol"] = th_dict["groupSymbol"] + "-cf"
     th_dict["tier"] = 3
 
-    for i in range(times):
-        create_task_from_def(task_definition, level)
-
     if repeatable_task:
         task_definition["payload"]["maxRunTime"] = 3600 * 3
@@ -175,7 +169,7 @@ def create_isolate_failure_tasks(task_definition, failures, level, times):
         repeat_args = ["--repeat=19"] if repeatable_task else []
     else:
         logger.error(
-            "create_isolate_failure_tasks: Unknown failure_group {}".format(
+            "create_confirm_failure_tasks: Unknown failure_group {}".format(
                 failure_group
             )
         )
@@ -203,33 +197,23 @@ def create_isolate_failure_tasks(task_definition, failures, level, times):
                     failure_path, task_definition["payload"]["command"]
                 )
             )
 
-            for i in range(times):
-                create_task_from_def(task_definition, level)
+            create_task_from_def(task_definition, level)
 
 
 @register_callback_action(
-    name="isolate-test-failures",
-    title="Isolate test failures in job",
-    symbol="it",
-    description="Re-run Tests for original manifest, directories and tests for failing tests.",
+    name="confirm-failures",
+    title="Confirm failures in job",
+    symbol="cf",
+    description="Re-run Tests for original manifest, directories or tests for failing tests.",
     order=150,
     context=[{"kind": "test"}],
     schema={
         "type": "object",
-        "properties": {
-            "times": {
-                "type": "integer",
-                "default": 1,
-                "minimum": 1,
-                "maximum": 100,
-                "title": "Times",
-                "description": "How many times to run each task.",
-            }
-        },
+        "properties": {},
         "additionalProperties": False,
     },
 )
-def isolate_test_failures(parameters, graph_config, input, task_group_id, task_id):
+def confirm_failures(parameters, graph_config, input, task_group_id, task_id):
     task = get_task_definition(task_id)
     decision_task_id, full_task_graph, label_to_taskid = fetch_graph_and_labels(
         parameters, graph_config
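
The schema change is the user-visible half of the rename: with "properties": {} and "additionalProperties": False, the only valid action input is an empty object, so the old Times field no longer appears when triggering the action from Treeherder. A minimal sketch of that validation behavior using the stock jsonschema package (not a taskgraph-specific API):

    import jsonschema

    schema = {
        "type": "object",
        "properties": {},
        "additionalProperties": False,
    }

    jsonschema.validate({}, schema)  # the empty object is the only valid input
    try:
        jsonschema.validate({"times": 5}, schema)
    except jsonschema.ValidationError as err:
        print(err.message)  # 'times' is rejected as an additional property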
@@ -249,7 +233,5 @@ def isolate_test_failures(parameters, graph_config, input, task_group_id, task_id):
     task_definition.setdefault("dependencies", []).extend(dependencies.values())
 
     failures = get_failures(task_id)
-    logger.info("isolate_test_failures: %s" % failures)
-    create_isolate_failure_tasks(
-        task_definition, failures, parameters["level"], input["times"]
-    )
+    logger.info("confirm_failures: %s" % failures)
+    create_confirm_failure_tasks(task_definition, failures, parameters["level"])