Bug 875562 - Part 4: Add Support for crash event files to CrashManager; r=Yoric

This patch introduces the concepts of the "crash data store" and "crash
event files." The "crash data store" is a data store containing
information about crashes. Data is added to this store directly through
a JavaScript API or by the presence of "crash event files." A "crash
event file" is simply an individual file containing information about
a crash event. These files are periodically scanned and their contents
are merged into the store.

Currently, no specific event file types are defined. This patch merely
begins to implement the infrastructure for dealing with them. Support
for specific crash events will be added in subsequent patches.

--HG--
extra : rebase_source : c58017b514f31c2823bc8ef4158a42bba758a9ab
This commit is contained in:
Gregory Szorc 2014-01-27 15:49:11 -08:00
parent d9998bc1b6
commit 19738668e0
11 changed files with 1141 additions and 101 deletions

View File

@ -4,16 +4,33 @@
"use strict";
const {interfaces: Ci, utils: Cu} = Components;
const {classes: Cc, interfaces: Ci, utils: Cu} = Components;
Cu.import("resource://gre/modules/Log.jsm", this);
Cu.import("resource://gre/modules/osfile.jsm", this)
Cu.import("resource://gre/modules/Promise.jsm", this);
Cu.import("resource://gre/modules/Services.jsm", this);
Cu.import("resource://gre/modules/Task.jsm", this);
Cu.import("resource://gre/modules/Timer.jsm", this);
Cu.import("resource://gre/modules/XPCOMUtils.jsm", this);
Cu.import("resource://services-common/utils.js", this);
this.EXPORTED_SYMBOLS = [
"CrashManager",
];
/**
* How long to wait after application startup before crash event files are
* automatically aggregated.
*
* We defer aggregation for performance reasons, as we don't want too many
* services competing for I/O immediately after startup.
*/
const AGGREGATE_STARTUP_DELAY_MS = 57000;
// Number of milliseconds in one day. Used to turn a day count (such as
// PURGE_OLDER_THAN_DAYS) into a millisecond offset.
const MILLISECONDS_IN_DAY = 24 * 60 * 60 * 1000;
/**
* A gateway to crash-related data.
*
@ -30,15 +47,31 @@ this.EXPORTED_SYMBOLS = [
*
* submittedDumpsDir (string) (required)
* Where records of uploaded dumps are located.
*
* eventsDirs (array)
* Directories (defined as strings) where events files are written. This
* instance will collect events from files in the directories specified.
*
* storeDir (string)
* Directory we will use for our data store. This instance will write
* data files into the directory specified.
*
* telemetryStoreSizeKey (string)
* Telemetry histogram to report store size under.
*/
this.CrashManager = function (options) {
for (let k of ["pendingDumpsDir", "submittedDumpsDir"]) {
for (let k of ["pendingDumpsDir", "submittedDumpsDir", "eventsDirs",
"storeDir"]) {
if (!(k in options)) {
throw new Error("Required key not present in options: " + k);
}
}
for (let [k, v] of Iterator(options)) {
this._log = Log.repository.getLogger("Crashes.CrashManager");
for (let k in options) {
let v = options[k];
switch (k) {
case "pendingDumpsDir":
this._pendingDumpsDir = v;
@ -48,15 +81,57 @@ this.CrashManager = function (options) {
this._submittedDumpsDir = v;
break;
case "eventsDirs":
this._eventsDirs = v;
break;
case "storeDir":
this._storeDir = v;
break;
case "telemetryStoreSizeKey":
this._telemetryStoreSizeKey = v;
break;
default:
throw new Error("Unknown property in options: " + k);
}
}
// Promise for in-progress aggregation operation. We store it on the
// object so it can be returned for in-progress operations.
this._aggregatePromise = null;
// The CrashStore currently attached to this object.
this._store = null;
// The timer controlling the expiration of the CrashStore instance.
this._storeTimer = null;
// This is a semaphore that prevents the store from being freed by our
// timer-based resource freeing mechanism.
this._storeProtectedCount = 0;
};
this.CrashManager.prototype = Object.freeze({
DUMP_REGEX: /^([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})\.dmp$/i,
SUBMITTED_REGEX: /^bp-(?:hr-)?([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})\.txt$/i,
ALL_REGEX: /^(.*)$/,
// How long the store object should persist in memory before being
// automatically garbage collected.
STORE_EXPIRATION_MS: 60 * 1000,
// Number of days after which a crash with no activity will get purged.
PURGE_OLDER_THAN_DAYS: 180,
// The following are return codes for individual event file processing.
// File processed OK.
EVENT_FILE_SUCCESS: "ok",
// The event appears to be malformed.
EVENT_FILE_ERROR_MALFORMED: "malformed",
// The type of event is unknown.
EVENT_FILE_ERROR_UNKNOWN_EVENT: "unknown-event",
/**
* Obtain a list of all dumps pending upload.
@ -108,8 +183,186 @@ this.CrashManager.prototype = Object.freeze({
},
/**
* Helper to obtain all directory entries in a path that match a regexp.
* Aggregates "loose" events files into the unified "database."
*
* This function should be called periodically to collect metadata from
* all events files into the central data store maintained by this manager.
*
* Once events have been stored in the backing store the corresponding
* source files are deleted.
*
* Only one aggregation operation is allowed to occur at a time. If this
* is called when an existing aggregation is in progress, the promise for
* the original call will be returned.
*
* @return promise<int> The number of event files that were examined.
*/
aggregateEventsFiles: function () {
if (this._aggregatePromise) {
return this._aggregatePromise;
}
return this._aggregatePromise = Task.spawn(function* () {
if (this._aggregatePromise) {
return this._aggregatePromise;
}
try {
let unprocessedFiles = yield this._getUnprocessedEventsFiles();
let deletePaths = [];
let needsSave = false;
this._storeProtectedCount++;
for (let entry of unprocessedFiles) {
try {
let result = yield this._processEventFile(entry);
switch (result) {
case this.EVENT_FILE_SUCCESS:
needsSave = true;
// Fall through.
case this.EVENT_FILE_ERROR_MALFORMED:
deletePaths.push(entry.path);
break;
case this.EVENT_FILE_ERROR_UNKNOWN_EVENT:
break;
default:
Cu.reportError("Unhandled crash event file return code. Please " +
"file a bug: " + result);
}
} catch (ex if ex instanceof OS.File.Error) {
this._log.warn("I/O error reading " + entry.path + ": " +
CommonUtils.exceptionStr(ex));
} catch (ex) {
// We should never encounter an exception. This likely represents
// a coding error because all errors should be detected and
// converted to return codes.
//
// If we get here, report the error and delete the source file
// so we don't see it again.
Cu.reportError("Exception when processing crash event file: " +
CommonUtils.exceptionStr(ex));
deletePaths.push(entry.path);
}
}
if (needsSave) {
let store = yield this._getStore();
yield store.save();
}
for (let path of deletePaths) {
try {
yield OS.File.remove(path);
} catch (ex) {
this._log.warn("Error removing event file (" + path + "): " +
CommonUtils.exceptionStr(ex));
}
}
return unprocessedFiles.length;
} finally {
this._aggregatePromise = false;
this._storeProtectedCount--;
}
}.bind(this));
},
/**
* Prune old crash data.
*
* @param date
* (Date) The cutoff point for pruning. Crashes without data newer
* than this will be pruned.
*/
pruneOldCrashes: function (date) {
return Task.spawn(function* () {
let store = yield this._getStore();
store.pruneOldCrashes(date);
yield store.save();
}.bind(this));
},
/**
* Run tasks that should be periodically performed.
*/
runMaintenanceTasks: function () {
return Task.spawn(function* () {
yield this.aggregateEventsFiles();
let offset = this.PURGE_OLDER_THAN_DAYS * MILLISECONDS_IN_DAY;
yield this.pruneOldCrashes(new Date(Date.now() - offset));
}.bind(this));
},
/**
* Schedule maintenance tasks for some point in the future.
*
* @param delay
* (integer) Delay in milliseconds when maintenance should occur.
*/
scheduleMaintenance: function (delay) {
let deferred = Promise.defer();
setTimeout(() => {
this.runMaintenanceTasks().then(deferred.resolve, deferred.reject);
}, delay);
return deferred.promise;
},
/**
* Obtain the paths of all unprocessed events files.
*
* The promise-resolved array is sorted by file mtime, oldest to newest.
*/
_getUnprocessedEventsFiles: function () {
return Task.spawn(function* () {
let entries = [];
for (let dir of this._eventsDirs) {
for (let e of yield this._getDirectoryEntries(dir, this.ALL_REGEX)) {
entries.push(e);
}
}
entries.sort((a, b) => { return a.date - b.date; });
return entries;
}.bind(this));
},
// See docs/crash-events.rst for the file format specification.
_processEventFile: function (entry) {
return Task.spawn(function* () {
let data = yield OS.File.read(entry.path);
let store = yield this._getStore();
let decoder = new TextDecoder();
data = decoder.decode(data);
let sepIndex = data.indexOf("\n");
if (sepIndex == -1) {
return this.EVENT_FILE_ERROR_MALFORMED;
}
let type = data.substring(0, sepIndex);
let payload = data.substring(sepIndex + 1);
return this._handleEventFilePayload(entry, type, payload);
}.bind(this));
},
_handleEventFilePayload: function (entry, type, payload) {
return this.EVENT_FILE_ERROR_UNKNOWN_EVENT;
},
/**
* The resolved promise is an array of objects with the properties:
*
* path -- String filename
@ -155,10 +408,318 @@ this.CrashManager.prototype = Object.freeze({
return entries;
}.bind(this));
},
_getStore: function () {
return Task.spawn(function* () {
if (!this._store) {
let store = new CrashStore(this._storeDir, this._telemetryStoreSizeKey);
yield store.load();
this._store = store;
this._storeTimer = Cc["@mozilla.org/timer;1"].createInstance(Ci.nsITimer);
}
// The application can go long periods without interacting with the
// store. Since the store takes up resources, we automatically "free"
// the store after inactivity so resources can be returned to the system.
// We do this via a timer and a mechanism that tracks when the store
// is being accessed.
this._storeTimer.cancel();
// This callback frees resources from the store unless the store
// is protected from freeing by some other process.
let timerCB = function () {
if (this._storeProtectedCount) {
this._storeTimer.initWithCallback(timerCB, this.STORE_EXPIRATION_MS,
this._storeTimer.TYPE_ONE_SHOT);
return;
}
// We kill the reference that we hold. GC will kill it later. If
// someone else holds a reference, that will prevent GC until that
// reference is gone.
this._store = null;
this._storeTimer = null;
}.bind(this);
this._storeTimer.initWithCallback(timerCB, this.STORE_EXPIRATION_MS,
this._storeTimer.TYPE_ONE_SHOT);
return this._store;
}.bind(this));
},
/**
* Obtain information about all known crashes.
*
* Returns a promise that resolves to an array of CrashRecord instances.
* Instances are read-only.
*/
getCrashes: function () {
return Task.spawn(function* () {
let store = yield this._getStore();
return store.crashes;
}.bind(this));
},
});
// The application-wide CrashManager instance. It is assigned by the
// CrashManager.Singleton lazy getter below.
let gCrashManager;
/**
* Interface to storage of crash data.
*
* This type handles storage of crash metadata. It exists as a separate type
* from the crash manager for performance reasons: since all crash metadata
* needs to be loaded into memory for access, we wish to easily dispose of all
* associated memory when this data is no longer needed. Having an isolated
* object whose references can easily be lost facilitates that simple disposal.
*
* When metadata is updated, the caller must explicitly persist the changes
* to disk. This prevents excessive I/O during updates.
*
* @param storeDir (string)
* Directory the store should be located in.
* @param telemetrySizeKey (string)
* The telemetry histogram that should be used to store the size
* of the data file.
*/
function CrashStore(storeDir, telemetrySizeKey) {
  this._storeDir = storeDir;
  this._telemetrySizeKey = telemetrySizeKey;

  // Single LZ4-compressed JSON file holding all crash metadata.
  this._storePath = OS.Path.join(storeDir, "store.json.mozlz4");

  // Holds the read data from disk. Stays null until load() is called.
  this._data = null;
}

CrashStore.prototype = Object.freeze({
  /**
   * Load data from disk.
   *
   * Any previously held in-memory data is discarded first. A missing file
   * is treated as an empty store. Any other failure is swallowed and
   * recorded as `corruptDate`.
   *
   * @return Promise<null>
   */
  load: function () {
    return Task.spawn(function* () {
      this._data = {
        v: 1,
        crashes: new Map(),
        corruptDate: null,
      };

      try {
        let decoder = new TextDecoder();
        let data = yield OS.File.read(this._storePath, null,
                                      {compression: "lz4"});
        data = JSON.parse(decoder.decode(data));

        if (data.corruptDate) {
          this._data.corruptDate = new Date(data.corruptDate);
        }

        for (let id in data.crashes) {
          let crash = data.crashes[id];
          let denormalized = this._denormalize(crash);
          this._data.crashes.set(id, denormalized);
        }
      } catch (ex) {
        if (ex instanceof OS.File.Error && ex.becauseNoSuchFile) {
          // Missing files (first use) are allowed.
        } else {
          // If we can't load for any reason, mark a corrupt date in the
          // instance and swallow the error.
          //
          // The marking of a corrupted file is intentionally not persisted to
          // disk yet. Instead, we wait until the next save(). This is to give
          // non-permanent failures the opportunity to recover on their own.
          this._data.corruptDate = new Date();
        }
      }
    }.bind(this));
  },

  /**
   * Save data to disk.
   *
   * Does nothing if load() has never been called. When a telemetry key was
   * supplied, the value resolved by the write is added to that histogram.
   *
   * @return Promise<null>
   */
  save: function () {
    return Task.spawn(function* () {
      if (!this._data) {
        return;
      }

      let normalized = {
        v: 1,
        crashes: {},
        corruptDate: null,
      };

      if (this._data.corruptDate) {
        normalized.corruptDate = this._data.corruptDate.getTime();
      }

      for (let [id, crash] of this._data.crashes) {
        let c = this._normalize(crash);
        normalized.crashes[id] = c;
      }

      let encoder = new TextEncoder();
      let data = encoder.encode(JSON.stringify(normalized));
      let size = yield OS.File.writeAtomic(this._storePath, data, {
                                           tmpPath: this._storePath + ".tmp",
                                           compression: "lz4"});
      if (this._telemetrySizeKey) {
        Services.telemetry.getHistogramById(this._telemetrySizeKey).add(size);
      }
    }.bind(this));
  },

  /**
   * Normalize an object into one fit for serialization.
   *
   * This function along with _denormalize() serve to hack around the
   * default handling of Date JSON serialization because Date serialization
   * is undefined by JSON.
   *
   * Fields ending with "Date" are assumed to contain Date instances.
   * We convert these to milliseconds since epoch on output and back to
   * Date on input. Falsy "Date" fields are copied through unchanged.
   *
   * @param o
   *        (object) In-memory crash entry.
   * @return (object) A shallow copy suitable for JSON.stringify().
   */
  _normalize: function (o) {
    let normalized = {};

    for (let k in o) {
      let v = o[k];
      if (v && k.endsWith("Date")) {
        normalized[k] = v.getTime();
      } else {
        normalized[k] = v;
      }
    }

    return normalized;
  },

  /**
   * Convert a serialized object back to its native form.
   *
   * This is the inverse of _normalize(): truthy fields whose name ends with
   * "Date" are turned back into Date instances.
   *
   * @param o
   *        (object) Crash entry as read from JSON.
   * @return (object) A shallow copy with Date fields restored.
   */
  _denormalize: function (o) {
    let n = {};

    for (let k in o) {
      let v = o[k];
      if (v && k.endsWith("Date")) {
        n[k] = new Date(parseInt(v, 10));
      } else {
        n[k] = v;
      }
    }

    return n;
  },

  /**
   * Prune old crash data.
   *
   * Crashes without recent activity are pruned from the store so the
   * size of the store is not unbounded. If there is activity on a crash,
   * that activity will keep the crash and all its data around for longer.
   *
   * Only the in-memory data is changed; callers must save() to persist.
   *
   * @param date
   *        (Date) The cutoff at which data will be pruned. If an entry
   *        doesn't have data newer than this, it will be pruned.
   */
  pruneOldCrashes: function (date) {
    // `this.crashes` is a snapshot array, so deleting from the underlying
    // Map while iterating is safe.
    for (let crash of this.crashes) {
      let newest = crash.newestDate;
      if (!newest || newest.getTime() < date.getTime()) {
        this._data.crashes.delete(crash.id);
      }
    }
  },

  /**
   * Date the store was last corrupted and required a reset.
   *
   * May be null (no corruption has ever occurred) or a Date instance.
   */
  get corruptDate() {
    return this._data.corruptDate;
  },

  /**
   * The number of distinct crashes tracked.
   */
  get crashesCount() {
    return this._data.crashes.size;
  },

  /**
   * All crashes tracked.
   *
   * This is an array of CrashRecord. A new array and new wrappers are
   * created on every access.
   */
  get crashes() {
    let crashes = [];
    for (let crash of this._data.crashes.values()) {
      crashes.push(new CrashRecord(crash));
    }

    return crashes;
  },

  /**
   * Obtain a particular crash from its ID.
   *
   * A CrashRecord will be returned if the crash exists. null will be returned
   * if the crash is unknown.
   */
  getCrash: function (id) {
    // Scan the raw entries and wrap only the match, rather than building a
    // CrashRecord for every entry just to discard all but one.
    for (let crash of this._data.crashes.values()) {
      if (crash.id == id) {
        return new CrashRecord(crash);
      }
    }

    return null;
  },
});
/**
* Represents an individual crash with metadata.
*
* This is a wrapper around the low-level anonymous JS objects that define
* crashes. It exposes a consistent and helpful API.
*
* Instances of this type should only be constructed inside this module,
* not externally. The constructor is not considered a public API.
*
* @param o (object)
* The crash's entry from the CrashStore.
*/
function CrashRecord(o) {
  // Keep a reference to the store's raw entry; nothing is copied.
  this._o = o;
}

let crashRecordAccessors = {
  // Identifier of the crash, as stored on the underlying entry.
  get id() {
    let entry = this._o;
    return entry.id;
  },

  // When the crash occurred, as stored on the underlying entry.
  get crashDate() {
    let entry = this._o;
    return entry.crashDate;
  },

  /**
   * The most recent date associated with this record.
   *
   * The store uses this value to decide when a record has expired. The
   * crash date is the only date tracked, so that is what is returned.
   */
  get newestDate() {
    let entry = this._o;
    return entry.crashDate;
  },
};

CrashRecord.prototype = Object.freeze(crashRecordAccessors);
/**
* Obtain the global CrashManager instance used by the running application.
*
@ -172,10 +733,26 @@ XPCOMUtils.defineLazyGetter(this.CrashManager, "Singleton", function () {
let crPath = OS.Path.join(OS.Constants.Path.userApplicationDataDir,
"Crash Reports");
let storePath = OS.Path.join(OS.Constants.Path.profileDir, "crashes");
gCrashManager = new CrashManager({
pendingDumpsDir: OS.Path.join(crPath, "pending"),
submittedDumpsDir: OS.Path.join(crPath, "submitted"),
eventsDirs: [OS.Path.join(crPath, "events"), OS.Path.join(storePath, "events")],
storeDir: storePath,
telemetryStoreSizeKey: "CRASH_STORE_COMPRESSED_BYTES",
});
// Automatically aggregate event files shortly after startup. This
// ensures it happens with some frequency.
//
// There are performance considerations here. While this is doing
// work and could negatively impact performance, the amount of work
// is kept small per run by periodically aggregating event files.
// Furthermore, well-behaving installs should not have much work
// here to do. If there is a lot of work, that install has bigger
// issues beyond reduced performance near startup.
gCrashManager.scheduleMaintenance(AGGREGATE_STARTUP_DELAY_MS);
return gCrashManager;
});

View File

@ -0,0 +1,159 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
/*
* This file provides common and shared functionality to facilitate
* testing of the Crashes component (CrashManager.jsm).
*/
"use strict";
const {classes: Cc, interfaces: Ci, utils: Cu} = Components;
this.EXPORTED_SYMBOLS = [
"getManager",
"sleep",
"TestingCrashManager",
];
Cu.import("resource://gre/modules/CrashManager.jsm", this);
Cu.import("resource://gre/modules/osfile.jsm", this);
Cu.import("resource://gre/modules/Promise.jsm", this);
Cu.import("resource://gre/modules/Task.jsm", this);
Cu.import("resource://gre/modules/Timer.jsm", this);
// Returns a promise that resolves (with no value) once `wait` milliseconds
// have elapsed.
this.sleep = function (wait) {
  let timer = Promise.defer();

  let wake = () => {
    timer.resolve();
  };
  setTimeout(wake, wait);

  return timer.promise;
};
/**
 * A CrashManager with extra helpers for producing test fixtures on disk.
 *
 * @param options
 *        (object) Passed through unchanged to the CrashManager constructor.
 */
this.TestingCrashManager = function (options) {
  CrashManager.call(this, options);
};

this.TestingCrashManager.prototype = {
  __proto__: CrashManager.prototype,

  /**
   * Create an empty file that looks like a crash dump.
   *
   * @param submitted
   *        (bool) Write a "bp-<uuid>.txt" file into the submitted dumps
   *        directory instead of "<uuid>.dmp" into the pending one.
   * @param date
   *        (Date) Access and modification time to set on the file.
   * @param hr
   *        (bool) For submitted dumps, use the "bp-hr-" filename prefix.
   *
   * @return promise<string> The UUID used in the filename.
   */
  createDummyDump: function (submitted=false, date=new Date(), hr=false) {
    let uuid = Cc["@mozilla.org/uuid-generator;1"]
                .getService(Ci.nsIUUIDGenerator)
                .generateUUID()
                .toString();
    // Drop the first and last characters (presumably the surrounding
    // braces of the UUID's string form).
    uuid = uuid.substring(1, uuid.length - 1);

    let path;
    let mode;
    if (submitted) {
      if (hr) {
        path = OS.Path.join(this._submittedDumpsDir, "bp-hr-" + uuid + ".txt");
      } else {
        path = OS.Path.join(this._submittedDumpsDir, "bp-" + uuid + ".txt");
      }
      mode = OS.Constants.libc.S_IRUSR | OS.Constants.libc.S_IWUSR |
             OS.Constants.libc.S_IRGRP | OS.Constants.libc.S_IROTH;
    } else {
      path = OS.Path.join(this._pendingDumpsDir, uuid + ".dmp");
      mode = OS.Constants.libc.S_IRUSR | OS.Constants.libc.S_IWUSR;
    }

    return Task.spawn(function* () {
      let f = yield OS.File.open(path, {create: true}, {unixMode: mode});
      try {
        yield f.setDates(date, date);
      } finally {
        // Always release the handle, even if setting the dates failed.
        yield f.close();
      }
      dump("Created fake crash: " + path + "\n");

      return uuid;
    });
  },

  /**
   * Create an empty file whose name does not look like a dump, so the
   * manager is expected to ignore it.
   *
   * @param filename
   *        (string) Leaf name of the file to create.
   * @param submitted
   *        (bool) Create it in the submitted dumps directory instead of the
   *        pending one.
   *
   * @return promise<undefined>
   */
  createIgnoredDumpFile: function (filename, submitted=false) {
    let path;
    if (submitted) {
      path = OS.Path.join(this._submittedDumpsDir, filename);
    } else {
      path = OS.Path.join(this._pendingDumpsDir, filename);
    }

    return Task.spawn(function* () {
      let mode = OS.Constants.libc.S_IRUSR | OS.Constants.libc.S_IWUSR;
      let f = yield OS.File.open(path, {create: true}, {unixMode: mode});
      // The file only needs to exist; close the handle so it is not leaked.
      yield f.close();
      dump ("Create ignored dump file: " + path + "\n");
    });
  },

  /**
   * Write a crash event file.
   *
   * The file content is `name`, a newline, then `content`.
   *
   * @param filename
   *        (string) Leaf name of the file to write.
   * @param name
   *        (string) Event type, written as the first line.
   * @param content
   *        (string) Event payload, written after the first newline.
   * @param index
   *        (integer) Which entry of the manager's events directories to
   *        write into.
   * @param date
   *        (Date) Access and modification time to set on the file.
   *
   * @return promise<undefined>
   */
  createEventsFile: function (filename, name, content, index=0, date=new Date()) {
    let path = OS.Path.join(this._eventsDirs[index], filename);

    let data = name + "\n" + content;
    let encoder = new TextEncoder();
    let array = encoder.encode(data);

    return Task.spawn(function* () {
      yield OS.File.writeAtomic(path, array);
      yield OS.File.setDates(path, date, date);
    });
  },

  /**
   * Overwrite event file handling to process our test file type.
   *
   * We can probably delete this once we have actual events defined.
   */
  _handleEventFilePayload: function (entry, type, payload) {
    if (type == "test.1") {
      if (payload == "malformed") {
        return this.EVENT_FILE_ERROR_MALFORMED;
      } else if (payload == "success") {
        return this.EVENT_FILE_SUCCESS;
      } else {
        // Payload is crash ID. Create a dummy record.
        this._store._data.crashes.set(payload, {id: payload, crashDate: entry.date});

        return this.EVENT_FILE_SUCCESS;
      }
    }

    // Forward all three arguments; the base implementation has the same
    // (entry, type, payload) signature.
    return CrashManager.prototype._handleEventFilePayload.call(this, entry,
                                                               type, payload);
  },
};
// Monotonic counter giving every directory created by getManager() a
// distinct name.
let DUMMY_DIR_COUNT = 0;

/**
 * Create a TestingCrashManager backed by five freshly created directories
 * inside the profile directory (pending dumps, submitted dumps, two events
 * directories and the store directory, created in that order).
 *
 * @return promise<TestingCrashManager>
 */
this.getManager = function () {
  return Task.spawn(function* () {
    const ownerOnlyMode = OS.Constants.libc.S_IRWXU;
    let profileDir = OS.Constants.Path.profileDir;

    // Creates the next "dummy-dir-N" directory and resolves to its path.
    let createNextDir = function () {
      return Task.spawn(function* () {
        let index = DUMMY_DIR_COUNT++;
        let dirPath = OS.Path.join(profileDir, "dummy-dir-" + index);
        dump("Creating directory: " + dirPath + "\n");
        yield OS.File.makeDir(dirPath, {unixMode: ownerOnlyMode});

        return dirPath;
      });
    };

    // Directories are created strictly one after another so numbering and
    // creation order stay deterministic.
    let created = [];
    for (let i = 0; i < 5; i++) {
      created.push(yield createNextDir());
    }
    let [pendingD, submittedD, eventsD1, eventsD2, storeD] = created;

    return new TestingCrashManager({
      pendingDumpsDir: pendingD,
      submittedDumpsDir: submittedD,
      eventsDirs: [eventsD1, eventsD2],
      storeDir: storeD,
    });
  });
};

View File

@ -0,0 +1,5 @@
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
TESTING_JS_MODULES := CrashManagerTest.jsm

View File

@ -0,0 +1,133 @@
============
Crash Events
============
**Crash Events** refers to a special subsystem of Gecko that aims to capture
events of interest related to process crashing and hanging.
When an event worthy of recording occurs, a file containing that event's
information is written to a well-defined location on the filesystem. The Gecko
process periodically scans for produced files and consolidates information
into a more unified and efficient backend store.
Crash Event Files
=================
When a crash-related event occurs, a file describing that event is written
to a well-defined directory. That directory is likely in the directory of
the currently-active profile. However, if a profile is not yet active in
the Gecko process, that directory likely resides in the user's *app data*
directory (*UAppData* from the directory service).
The filename of the event file is not relevant. However, producers need
to choose a filename intelligently to avoid name collisions and race
conditions. Since file locking is potentially dangerous at crash time,
the convention of generating a UUID and using it as a filename has been
adopted.
File Format
-----------
All crash event files share the same high-level file format: the file
consists of an event name string followed by a UNIX newline (*\n*)
followed by its payload data. This allows consumers to read the first line
from the file and then dispatch based on its contents. If an unknown event
type is encountered, the event can safely be ignored until later. This helps
ensure that application downgrades (potentially due to elevated crash rate)
don't result in data loss.
The format and semantics of each event type are meant to be constant once
that event type is committed to the main Firefox repository. If new metadata
needs to be captured or the meaning of data captured in an event changes,
that change should be expressed through the invention of a new event type.
For this reason, event names are highly recommended to contain a version.
e.g. instead of a *Gecko process crashed* event, we prefer a *Gecko process
crashed v1* event.
Event Types
-----------
Each subsection documents the different types of crash events that may be
produced. Each section name corresponds to the first line of the crash
event file.
**No event types are yet defined.**
Aggregated Event Log
====================
Crash events are aggregated together into a unified event *log*. Currently,
this *log* is really a JSON file. However, this is an implementation detail
and it could change at any time. The interface to crash data provided by
the JavaScript API is the only supported interface.
Design Considerations
=====================
There are many considerations influencing the design of this subsystem.
We attempt to document them in this section.
Decoupling of Event Files from Final Data Structure
---------------------------------------------------
While it is certainly possible for the Gecko process to write directly to
the final data structure on disk, there is an intentional decoupling between
the production of events and their transition into final storage. Along the
same vein, the choice to have events written to multiple files by producers
is deliberate.
Some recorded events are written immediately after a process crash. This is
a very uncertain time for the host system. There is a high likelihood the
system is in an exceptional state, such as memory exhaustion. Therefore, any
action taken after crashing needs to be very deliberate about what it does.
Excessive memory allocation and certain system calls may cause the system
to crash again or the machine's condition to worsen. This means that the act
of recording a crash event must be very light weight. Writing a new file from
nothing is very light weight. This is one reason we write separate files.
Another reason we write separate files is because if the main Gecko process
itself crashes (as opposed to say a plugin process), the crash reporter (not
Gecko) is running and the crash reporter needs to handle the writing of the
event info. If this writing is involved (say loading, parsing, updating, and
reserializing back to disk), this logic would need to be implemented in both
Gecko and the crash reporter or would need to be implemented in such a way
that both could use. Neither of these is very practical from a software
lifecycle management perspective. It's much easier to have separate processes
write a simple file and to let a single implementation do all the complex
work.
Idempotent Event Processing
===========================
Processing of event files has been designed such that the result is
idempotent regardless of what order those files are processed in. This is
not only a good design decision, but it is arguably necessary. While event
files are processed in order by file mtime, filesystem times may not have
the resolution required for proper sorting. Therefore, processing order is
merely an optimistic assumption.
Aggregated Storage Format
=========================
Crash events are aggregated into a unified data structure on disk. That data
structure is currently LZ4-compressed JSON and is represented by a single file.
The choice of a single JSON file was initially driven by time and complexity
concerns. Before changing the format or adding significant amounts of new
data, some considerations must be taken into account.
First, in well-behaving installs, crash data should be minimal. Crashes and
hangs will be rare and thus the size of the crash data should remain small
over time.
The choice of a single JSON file has larger implications as the amount of
crash data grows. As new data is accumulated, we need to read and write
an entire file to make small updates. LZ4 compression helps reduce I/O.
But, there is a potential for unbounded file growth. We establish a
limit for the max age of records. Anything older than that limit is
pruned. Future patches will also limit the maximum number of records. This
will establish a hard limit on the size of the file, at least in terms of
crashes.
Care must be taken when new crash data is recorded, as this will increase
the size of the file and make I/O a larger concern.

View File

@ -0,0 +1,22 @@
=============
Crash Manager
=============
The **Crash Manager** is a service and interface for managing crash
data within the Gecko application.
From JavaScript, the service can be accessed via::
Cu.import("resource://gre/modules/Services.jsm");
let crashManager = Services.crashmanager;
That will give you an instance of ``CrashManager`` from ``CrashManager.jsm``.
From there, you can access and manipulate crash data.
Other Documents
===============
.. toctree::
:maxdepth: 1
crash-events

View File

@ -4,6 +4,8 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
SPHINX_TREES['crash-manager'] = 'docs'
EXTRA_COMPONENTS += [
'CrashService.js',
'CrashService.manifest',

View File

@ -6,84 +6,14 @@
const {classes: Cc, interfaces: Ci, utils: Cu} = Components;
Cu.import("resource://gre/modules/CrashManager.jsm", this);
Cu.import("resource://gre/modules/Promise.jsm", this);
Cu.import("resource://gre/modules/Task.jsm", this);
Cu.import("resource://gre/modules/osfile.jsm", this);
let DUMMY_DIR_COUNT = 0;
// Synchronous test helper: builds a CrashManager over two newly created
// directories and attaches fixture-creating helpers to the instance.
function getManager() {
// Create the directory for the given nsIFile unless it already exists.
function mkdir(f) {
if (f.exists()) {
return;
}
dump("Creating directory: " + f.path + "\n");
f.create(Ci.nsIFile.DIRECTORY_TYPE, dirMode);
}
const dirMode = OS.Constants.libc.S_IRWXU;
// Both directories live under the directory returned by do_get_tempdir().
let baseFile = do_get_tempdir();
let pendingD = baseFile.clone();
let submittedD = baseFile.clone();
// DUMMY_DIR_COUNT is incremented per directory so names never repeat.
pendingD.append("dummy-dir-" + DUMMY_DIR_COUNT++);
submittedD.append("dummy-dir-" + DUMMY_DIR_COUNT++);
mkdir(pendingD);
mkdir(submittedD);
let m = new CrashManager({
pendingDumpsDir: pendingD.path,
submittedDumpsDir: submittedD.path,
});
// Creates an empty dump-like file and returns the UUID used in its name.
// submitted: write "bp-<uuid>.txt" into the submitted directory instead of
// "<uuid>.dmp" into the pending one. date: modification time to set.
// hr: use the "bp-hr-" prefix for submitted files.
m.create_dummy_dump = function (submitted=false, date=new Date(), hr=false) {
let uuid = Cc["@mozilla.org/uuid-generator;1"]
.getService(Ci.nsIUUIDGenerator)
.generateUUID()
.toString();
// Drop the first and last characters (presumably the surrounding braces).
uuid = uuid.substring(1, uuid.length - 1);
let file;
let mode;
if (submitted) {
file = submittedD.clone();
if (hr) {
file.append("bp-hr-" + uuid + ".txt");
} else {
file.append("bp-" + uuid + ".txt");
}
mode = OS.Constants.libc.S_IRUSR | OS.Constants.libc.S_IWUSR |
OS.Constants.libc.S_IRGRP | OS.Constants.libc.S_IROTH;
} else {
file = pendingD.clone();
file.append(uuid + ".dmp");
mode = OS.Constants.libc.S_IRUSR | OS.Constants.libc.S_IWUSR;
}
file.create(file.NORMAL_FILE_TYPE, mode);
file.lastModifiedTime = date.getTime();
dump("Created fake crash: " + file.path + "\n");
return uuid;
};
// Creates an empty file with an arbitrary name in the pending (default) or
// submitted directory; tests use it for files the manager should skip.
m.create_ignored_dump_file = function (filename, submitted=false) {
let file;
if (submitted) {
file = submittedD.clone();
} else {
file = pendingD.clone();
}
file.append(filename);
file.create(file.NORMAL_FILE_TYPE,
OS.Constants.libc.S_IRUSR | OS.Constants.libc.S_IWUSR);
dump("Created ignored dump file: " + file.path + "\n");
};
return m;
}
Cu.import("resource://testing-common/CrashManagerTest.jsm", this);
function run_test() {
do_get_profile();
run_next_test();
}
@ -91,8 +21,10 @@ add_task(function* test_constructor_ok() {
let m = new CrashManager({
pendingDumpsDir: "/foo",
submittedDumpsDir: "/bar",
eventsDirs: [],
storeDir: "/baz",
});
Assert.ok(m);
Assert.ok(m, "CrashManager can be created.");
});
add_task(function* test_constructor_invalid() {
@ -102,23 +34,26 @@ add_task(function* test_constructor_invalid() {
});
add_task(function* test_get_manager() {
let m = getManager();
Assert.ok(m);
let m = yield getManager();
Assert.ok(m, "CrashManager obtained.");
m.create_dummy_dump(true);
m.create_dummy_dump(false);
yield m.createDummyDump(true);
yield m.createDummyDump(false);
run_next_test();
});
// Unsubmitted dump files on disk are detected properly.
add_task(function* test_pending_dumps() {
let m = getManager();
let m = yield getManager();
let now = Date.now();
let ids = [];
const COUNT = 5;
for (let i = 0; i < COUNT; i++) {
ids.push(m.create_dummy_dump(false, new Date(now - i * 86400000)));
ids.push(yield m.createDummyDump(false, new Date(now - i * 86400000)));
}
m.create_ignored_dump_file("ignored", false);
yield m.createIgnoredDumpFile("ignored", false);
let entries = yield m.pendingDumps();
Assert.equal(entries.length, COUNT, "proper number detected.");
@ -136,19 +71,20 @@ add_task(function* test_pending_dumps() {
}
});
// Submitted dump files on disk are detected properly.
add_task(function* test_submitted_dumps() {
let m = getManager();
let m = yield getManager();
let COUNT = 5;
for (let i = 0; i < COUNT; i++) {
m.create_dummy_dump(true);
yield m.createDummyDump(true);
}
m.create_ignored_dump_file("ignored", true);
yield m.createIgnoredDumpFile("ignored", true);
let entries = yield m.submittedDumps();
Assert.equal(entries.length, COUNT, "proper number detected.");
let hrID = m.create_dummy_dump(true, new Date(), true);
let hrID = yield m.createDummyDump(true, new Date(), true);
entries = yield m.submittedDumps();
Assert.equal(entries.length, COUNT + 1, "hr- in filename detected.");
@ -156,18 +92,111 @@ add_task(function* test_submitted_dumps() {
Assert.ok(gotIDs.has(hrID));
});
add_task(function* test_submitted_and_pending() {
let m = getManager();
let pendingIDs = [];
let submittedIDs = [];
// The store should expire after inactivity.
add_task(function* test_store_expires() {
let m = yield getManager();
pendingIDs.push(m.create_dummy_dump(false));
pendingIDs.push(m.create_dummy_dump(false));
submittedIDs.push(m.create_dummy_dump(true));
Object.defineProperty(m, "STORE_EXPIRATION_MS", {
value: 250,
});
let submitted = yield m.submittedDumps();
let pending = yield m.pendingDumps();
let store = yield m._getStore();
Assert.ok(store);
Assert.equal(store, m._store);
Assert.equal(submitted.length, submittedIDs.length);
Assert.equal(pending.length, pendingIDs.length);
yield sleep(300);
Assert.ok(!m._store, "Store has gone away.");
});
// Every events file that has been created but not yet processed should be
// reported by _getUnprocessedEventsFiles().
add_task(function* test_unprocessed_events_files() {
  let manager = yield getManager();

  // Each entry is the argument list for one createEventsFile() call.
  let fixtures = [
    ["1", "test.1", "foo", 0],
    ["2", "test.1", "bar", 0],
    ["1", "test.1", "baz", 1],
  ];
  for (let args of fixtures) {
    yield manager.createEventsFile(...args);
  }

  let unprocessed = yield manager._getUnprocessedEventsFiles();
  Assert.equal(unprocessed.length, 3);
});
// Ensure only 1 aggregateEventsFiles() is allowed at a time: two calls made
// back to back must hand out the very same promise object.
add_task(function* test_aggregate_events_locking() {
  let m = yield getManager();

  let p1 = m.aggregateEventsFiles();
  let p2 = m.aggregateEventsFiles();

  Assert.strictEqual(p1, p2, "Same promise should be returned.");

  // Wait for the shared operation to settle. Without this the task would
  // finish while aggregation may still be in flight, and a rejection would
  // never be observed by the test.
  yield p1;
});
// A malformed events file is consumed by aggregation without producing a
// crash, and is gone by the time aggregation runs again.
add_task(function* test_malformed_files_deleted() {
  let manager = yield getManager();

  yield manager.createEventsFile("1", "test.1", "malformed");

  // First pass: the file is seen...
  let firstPass = yield manager.aggregateEventsFiles();
  Assert.equal(firstPass, 1);
  // ...but nothing is recorded from it.
  Assert.equal((yield manager.getCrashes()).length, 0);

  // Second pass: nothing is left to look at.
  let secondPass = yield manager.aggregateEventsFiles();
  Assert.equal(secondPass, 0);
});
// Events files of an unrecognized type are not consumed by aggregation.
add_task(function* test_aggregate_ignore_unknown_events() {
  let manager = yield getManager();

  yield manager.createEventsFile("1", "test.1", "success");
  yield manager.createEventsFile("2", "foobar.1", "dummy");

  // The first pass sees both files; every later pass still sees one
  // (presumably the "foobar.1" file, which is left in place).
  for (let expected of [2, 1, 1]) {
    let seen = yield manager.aggregateEventsFiles();
    Assert.equal(seen, expected);
  }
});
// pruneOldCrashes() drops crashes older than the date it is given and keeps
// newer ones.
add_task(function* test_prune_old() {
  let manager = yield getManager();

  let dayOld = new Date(Date.now() - 86400000);
  let recent = new Date(Date.now() - 10000);
  yield manager.createEventsFile("1", "test.1", "id1", 0, dayOld);
  yield manager.createEventsFile("2", "test.1", "id2", 0, recent);
  yield manager.aggregateEventsFiles();

  Assert.equal((yield manager.getCrashes()).length, 2);

  // A cutoff just after the older crash removes only that one.
  yield manager.pruneOldCrashes(new Date(dayOld.getTime() + 10000));
  let remaining = yield manager.getCrashes();
  Assert.equal(remaining.length, 1, "Old crash has been pruned.");
  let survivor = remaining[0];
  Assert.equal(survivor.id, "id2", "Proper crash was pruned.");

  // Dates coming back from the filesystem don't carry the same guarantees
  // as JS dates, so prune with some slack instead of probing the exact
  // boundary.
  yield manager.pruneOldCrashes(new Date(recent.getTime() + 5000));
  Assert.equal((yield manager.getCrashes()).length, 0);
});
// After scheduleMaintenance() resolves, a recent crash event is in the store
// while one dated twice PURGE_OLDER_THAN_DAYS in the past is not.
add_task(function* test_schedule_maintenance() {
  let manager = yield getManager();

  yield manager.createEventsFile("1", "test.1", "id1");

  let msPerDay = 24 * 60 * 60 * 1000;
  let staleAge = manager.PURGE_OLDER_THAN_DAYS * 2 * msPerDay;
  let staleDate = new Date(Date.now() - staleAge);
  yield manager.createEventsFile("2", "test.1", "id2", 0, staleDate);

  // NOTE(review): 25 is presumably a delay in milliseconds - confirm against
  // scheduleMaintenance().
  yield manager.scheduleMaintenance(25);

  let kept = yield manager.getCrashes();
  Assert.equal(kept.length, 1);
  Assert.equal(kept[0].id, "id1");
});

View File

@ -26,5 +26,6 @@ add_task(function* test_instantiation() {
Assert.ok(bsp.gCrashManager, "Profile creation makes it available.");
Assert.ok(Services.crashmanager, "CrashManager available via Services.");
Assert.equal(bsp.gCrashManager, Services.crashmanager);
Assert.strictEqual(bsp.gCrashManager, Services.crashmanager,
"The objects are the same.");
});

View File

@ -0,0 +1,104 @@
/* Any copyright is dedicated to the Public Domain.
* http://creativecommons.org/publicdomain/zero/1.0/ */
/*
* This file tests the CrashStore type in CrashManager.jsm.
*/
"use strict";
const {classes: Cc, interfaces: Ci, utils: Cu} = Components;
let bsp = Cu.import("resource://gre/modules/CrashManager.jsm", this);
Cu.import("resource://gre/modules/osfile.jsm", this);
Cu.import("resource://gre/modules/Task.jsm", this);
const CrashStore = bsp.CrashStore;
let STORE_DIR_COUNT = 0;
/**
 * Build a CrashStore on top of a freshly created, uniquely numbered
 * directory ("store-<n>") under the test temp directory, then load it.
 *
 * The instance is given a test-only _addCrash(id, date) helper that writes
 * an {id, crashDate} entry straight into the store's in-memory crashes
 * collection.
 *
 * @return The value of Task.spawn() for the setup task; the task itself
 *         produces the loaded CrashStore.
 */
function getStore() {
  return Task.spawn(function* () {
    let tempPath = do_get_tempdir().path;
    let storeDir = OS.Path.join(tempPath, "store-" + STORE_DIR_COUNT++);
    yield OS.File.makeDir(storeDir, {unixMode: OS.Constants.libc.S_IRWXU});

    let store = new CrashStore(storeDir);
    // Attached before load() runs, exactly as callers have come to expect.
    store._addCrash = function (id, date) {
      store._data.crashes.set(id, {id: id, crashDate: date});
    };

    yield store.load();
    return store;
  });
}
// Test entry point (presumably invoked by the xpcshell harness): hands
// control to run_next_test() so the tasks registered with add_task() run.
function run_test() {
run_next_test();
}
// Constructing a CrashStore from a path yields a CrashStore instance.
add_task(function* test_constructor() {
  Assert.ok(new CrashStore("/some/path") instanceof CrashStore);
});
// Crashes added to a store are reflected by crashesCount and crashes.
//
// This must be a generator function (function*): the body uses yield, which
// is not valid inside a plain function in strict-mode code.
add_task(function* test_add_crash() {
  let s = yield getStore();

  Assert.equal(s.crashesCount, 0);
  let d = new Date(Date.now() - 5000);
  // TODO use official APIs once they are implemented.
  s._addCrash("id1", d);

  Assert.equal(s.crashesCount, 1);

  let crashes = s.crashes;
  Assert.equal(crashes.length, 1);
  let c = crashes[0];

  Assert.equal(c.id, "id1", "ID set properly.");
  Assert.equal(c.crashDate.getTime(), d.getTime(), "Date set.");

  // A second, distinct ID adds another entry.
  s._addCrash("id2", new Date());
  Assert.equal(s.crashesCount, 2);
});
// Crashes written by save() are available again after load().
//
// This must be a generator function (function*): the body uses yield, which
// is not valid inside a plain function in strict-mode code.
add_task(function* test_save_load() {
  let s = yield getStore();

  // Saving a store that has no crashes yet must work too.
  yield s.save();

  let d1 = new Date();
  let d2 = new Date(d1.getTime() - 10000);
  s._addCrash("id1", d1);
  s._addCrash("id2", d2);

  yield s.save();

  yield s.load();
  Assert.ok(!s.corruptDate);
  let crashes = s.crashes;

  Assert.equal(crashes.length, 2);
  let c = s.getCrash("id1");
  Assert.equal(c.crashDate.getTime(), d1.getTime());
});
// Loading a store file that holds invalid JSON sets corruptDate, and that
// date is still the same after a save() followed by a fresh load().
//
// This must be a generator function (function*): the body uses yield, which
// is not valid inside a plain function in strict-mode code.
add_task(function* test_corrupt_json() {
  let s = yield getStore();

  // Replace the store file with something that is not valid JSON.
  let buffer = new TextEncoder().encode("{bad: json-file");
  yield OS.File.writeAtomic(s._storePath, buffer, {compression: "lz4"});

  yield s.load();
  Assert.ok(s.corruptDate, "Corrupt date is defined.");

  let date = s.corruptDate;
  yield s.save();
  // Drop the in-memory data so the next load() has to rebuild it.
  s._data = null;
  yield s.load();
  Assert.ok(s.corruptDate);
  Assert.equal(date.getTime(), s.corruptDate.getTime());
});

View File

@ -4,3 +4,4 @@ tail =
[test_crash_manager.js]
[test_crash_service.js]
[test_crash_store.js]

View File

@ -4336,6 +4336,13 @@
"n_buckets": 15,
"description": "Time (ms) for a WAL checkpoint after collecting all measurements."
},
"CRASH_STORE_COMPRESSED_BYTES": {
"expires_in_version": "never",
"kind": "exponential",
"high": "1000000",
"n_buckets": 202,
"description": "Size (in bytes) of the compressed crash store JSON file."
},
"PDF_VIEWER_USED": {
"expires_in_version": "never",
"kind": "boolean",