Bug 1409876 - Record cache history to aid failure analysis; r=dustin

We introduce a per-cache .cachelog file containing important events
in the cache's history such as creation, requirements adjusting,
and utilization. If cache validation fails, we print the cache log.
If a previous task was responsible for getting the cache in a bad
state, its TASK_ID should be printed, allowing us to more easily
identify mis-configured tasks.

MozReview-Commit-ID: BJun5Hi5w0s

--HG--
extra : rebase_source : f4758741ee294a0de53882b6891b473c01463e28
This commit is contained in:
Gregory Szorc 2017-10-25 16:58:37 -07:00
parent cf842bfc0d
commit 926bb25028

View File

@ -295,6 +295,9 @@ def main(args):
# execution in aggregate. Without monitoring for this, people may not notice # execution in aggregate. Without monitoring for this, people may not notice
# the problem and tasks would be slower than they could be. We follow the # the problem and tasks would be slower than they could be. We follow the
# principle of "fail fast" to ensure optimal task execution. # principle of "fail fast" to ensure optimal task execution.
#
# We also write an audit log of who used the caches. This log is printed
# during failures to help aid debugging.
if 'TASKCLUSTER_CACHES' in os.environ: if 'TASKCLUSTER_CACHES' in os.environ:
caches = os.environ['TASKCLUSTER_CACHES'].split(';') caches = os.environ['TASKCLUSTER_CACHES'].split(';')
@ -321,6 +324,12 @@ def main(args):
b'gid=%d' % gid, b'gid=%d' % gid,
} }
def write_audit_entry(path, msg):
now = datetime.datetime.utcnow().isoformat()
with open(path, 'ab') as fh:
fh.write(b'[%sZ %s] %s\n' % (
now, os.environ.get('TASK_ID', 'UNKNOWN'), msg))
for cache in caches: for cache in caches:
if not os.path.isdir(cache): if not os.path.isdir(cache):
print('error: cache %s is not a directory; this should never ' print('error: cache %s is not a directory; this should never '
@ -328,6 +337,7 @@ def main(args):
return 1 return 1
requires_path = os.path.join(cache, '.cacherequires') requires_path = os.path.join(cache, '.cacherequires')
audit_path = os.path.join(cache, '.cachelog')
# The cache is empty. Configure it. # The cache is empty. Configure it.
if not os.listdir(cache): if not os.listdir(cache):
@ -342,6 +352,10 @@ def main(args):
# And make it read-only as a precaution against deletion. # And make it read-only as a precaution against deletion.
os.chmod(requires_path, stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH) os.chmod(requires_path, stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)
write_audit_entry(audit_path,
'created; requirements: %s' %
', '.join(sorted(our_requirements)))
set_dir_permissions(cache, uid, gid) set_dir_permissions(cache, uid, gid)
# The cache has content and we have a requirements file. Validate # The cache has content and we have a requirements file. Validate
@ -374,6 +388,10 @@ def main(args):
with open(requires_path, 'wb') as fh: with open(requires_path, 'wb') as fh:
fh.write(b'\n'.join(sorted(our_requirements))) fh.write(b'\n'.join(sorted(our_requirements)))
write_audit_entry(audit_path,
'chown; requirements: %s' %
', '.join(sorted(our_requirements)))
elif missing: elif missing:
print('error: requirements for populated cache %s differ from ' print('error: requirements for populated cache %s differ from '
'this task' % cache) 'this task' % cache)
@ -383,7 +401,19 @@ def main(args):
our_requirements))) our_requirements)))
if any(s.startswith(('uid=', 'gid=')) for s in missing): if any(s.startswith(('uid=', 'gid=')) for s in missing):
print(CACHE_UID_GID_MISMATCH) print(CACHE_UID_GID_MISMATCH)
write_audit_entry(audit_path,
'requirements mismatch; wanted: %s' %
', '.join(sorted(our_requirements)))
print('')
print('audit log:')
with open(audit_path, 'rb') as fh:
print(fh.read())
return 1 return 1
else:
write_audit_entry(audit_path, 'used')
# We don't need to adjust permissions here because the cache is # We don't need to adjust permissions here because the cache is
# associated with a uid/gid and the first task should have set # associated with a uid/gid and the first task should have set
@ -397,6 +427,8 @@ def main(args):
'.cacherequires file; the cache names for this task are ' '.cacherequires file; the cache names for this task are '
'likely mis-configured or TASKCLUSTER_CACHES is not set ' 'likely mis-configured or TASKCLUSTER_CACHES is not set '
'properly' % cache) 'properly' % cache)
write_audit_entry(audit_path, 'missing .cacherequires')
return 1 return 1
if 'TASKCLUSTER_VOLUMES' in os.environ: if 'TASKCLUSTER_VOLUMES' in os.environ: