Bug 1610134: revert late writes from nsTerminator. r=dthayer

After investigating the potential to reduce the nsTerminator's crash timeout from
1 min, to 20s, and then finally 40s, we have decided to this does not provide
significant gains to justify increasing the amount of shutdown hang crashes
and potential to lose data. We should maintain the crash timeout at 1 min.

Differential Revision: https://phabricator.services.mozilla.com/D77939
This commit is contained in:
Emma Malysz 2020-06-04 19:18:21 +00:00
parent 1b8348b800
commit c2ded60e72
12 changed files with 12 additions and 133 deletions

View File

@ -810,7 +810,6 @@ pref("toolkit.telemetry.debugSlowSql", false);
pref("toolkit.telemetry.unified", true);
// AsyncShutdown delay before crashing in case of shutdown freeze
#if !defined(MOZ_ASAN) && !defined(MOZ_TSAN)
pref("toolkit.asyncshutdown.report_writes_after", 40000); // 40 seconds
pref("toolkit.asyncshutdown.crash_timeout", 60000); // 1 minute
#else
// ASan and TSan builds can be considerably slower. Extend the grace period

View File

@ -2643,23 +2643,6 @@ telemetry:
- 'main'
- 'content'
failed_late_write:
bug_numbers:
- 1610134
description: >
The number of times a late write stack failed to be parsed.
expires: never
kind: uint
notification_emails:
- emalysz@mozilla.com
release_channel_collection: opt-out
products:
- 'firefox'
- 'fennec'
- 'geckoview'
record_in_processes:
- 'main'
telemetry.discarded:
accumulations:
bug_numbers:

View File

@ -97,9 +97,6 @@
#include "TelemetryHistogram.h"
#include "TelemetryOrigin.h"
#include "TelemetryScalar.h"
#ifndef ANDROID
# include "nsTerminator.h"
#endif
namespace {
@ -952,8 +949,8 @@ static bool IsValidBreakpadId(const std::string& breakpadId) {
// Read a stack from the given file name. In case of any error, aStack is
// unchanged.
static void ReadLateWriteStack(PathCharPtr aFileName,
Telemetry::ProcessedStack& aStack) {
static void ReadStack(PathCharPtr aFileName,
Telemetry::ProcessedStack& aStack) {
IFStream file(aFileName);
size_t numModules;
@ -1017,14 +1014,6 @@ static void ReadLateWriteStack(PathCharPtr aFileName,
stack.AddFrame(frame);
}
bool isFromTerminatorWatchdog;
file >> isFromTerminatorWatchdog;
if (file.fail()) {
ScalarAdd(mozilla::Telemetry::ScalarID::TELEMETRY_FAILED_LATE_WRITE, 1);
return;
}
stack.SetIsFromTerminatorWatchdog(isFromTerminatorWatchdog);
aStack = stack;
}
@ -1044,12 +1033,9 @@ void TelemetryImpl::ReadLateWritesStacks(nsIFile* aProfileDir) {
}
Telemetry::ProcessedStack stack;
ReadLateWriteStack(file->NativePath().get(), stack);
ReadStack(file->NativePath().get(), stack);
if (stack.GetStackSize() != 0) {
mLateWritesStacks.AddStack(stack);
if (stack.GetIsFromTerminatorWatchdog()) {
mLateWritesStacks.SetIsFromTerminatorWatchdog(true);
}
}
// Delete the file so that we don't report it again on the next run.
file->Remove(false);
@ -1070,23 +1056,16 @@ TelemetryImpl::GetLateWrites(JSContext* cx, JS::MutableHandle<JS::Value> ret) {
// CreateJSStackObject, but we would still need to figure out where to call
// JS_RemoveObjectRoot. Would it be ok to never call JS_RemoveObjectRoot
// and just set the pointer to nullptr is the telemetry destructor?
JS::Rooted<JSObject*> temp(cx, JS_NewPlainObject(cx));
JSObject* report;
if (!mCachedTelemetryData) {
CombinedStacks empty;
temp = CreateJSStackObject(cx, empty);
report = CreateJSStackObject(cx, empty);
} else {
temp = CreateJSStackObject(cx, mLateWritesStacks);
report = CreateJSStackObject(cx, mLateWritesStacks);
}
// Add this information afer we CreateJSStackObject because
// the function is used elsewhere, not just for latewrites.
JS::Rooted<JSObject*> report(cx, temp);
bool isFromTerminatorWatchdog =
mLateWritesStacks.GetIsFromTerminatorWatchdog();
bool ok = JS_DefineProperty(cx, report, "isFromTerminatorWatchdog",
isFromTerminatorWatchdog, JSPROP_ENUMERATE);
if (temp == nullptr || !ok) {
if (report == nullptr) {
return NS_ERROR_FAILURE;
}

View File

@ -536,7 +536,7 @@ Structure:
lateWrites
----------
This sections reports writes to the file system that happen during shutdown. The reported data contains the stack, the file names of the loaded libraries at the time the writes happened, and indicates if the late writes originated from the terminator watchdog.
This sections reports writes to the file system that happen during shutdown. The reported data contains the stack and the file names of the loaded libraries at the time the writes happened.
The file names of the loaded libraries can contain unicode characters.
@ -560,7 +560,6 @@ Structure:
],
... other stacks ...
],
"isFromTerminatorWatchdog": <boolean>
},
addonDetails

View File

@ -18,9 +18,7 @@ const size_t kMaxChromeStacksKept = 50;
CombinedStacks::CombinedStacks() : CombinedStacks(kMaxChromeStacksKept) {}
CombinedStacks::CombinedStacks(size_t aMaxStacksCount)
: mNextIndex(0),
mMaxStacksCount(aMaxStacksCount),
mIsFromTerminatorWatchdog(false) {}
: mNextIndex(0), mMaxStacksCount(aMaxStacksCount) {}
size_t CombinedStacks::GetModuleCount() const { return mModules.size(); }
@ -106,15 +104,6 @@ void CombinedStacks::RemoveStack(unsigned aIndex) {
}
}
bool CombinedStacks::GetIsFromTerminatorWatchdog() {
return mIsFromTerminatorWatchdog;
}
void CombinedStacks::SetIsFromTerminatorWatchdog(
bool aIsFromTerminatorWatchdog) {
mIsFromTerminatorWatchdog = aIsFromTerminatorWatchdog;
}
void CombinedStacks::Swap(CombinedStacks& aOther) {
mModules.swap(aOther.mModules);
mStacks.swap(aOther.mStacks);

View File

@ -39,8 +39,6 @@ class CombinedStacks {
size_t GetStackCount() const;
size_t SizeOfExcludingThis() const;
void RemoveStack(unsigned aIndex);
bool GetIsFromTerminatorWatchdog();
void SetIsFromTerminatorWatchdog(bool aIsFromTerminatorWatchdog);
#if defined(MOZ_GECKO_PROFILER)
/** Clears the contents of vectors and resets the index. */
@ -55,8 +53,6 @@ class CombinedStacks {
size_t mNextIndex;
// The maximum number of stacks to keep in the CombinedStacks object.
size_t mMaxStacksCount;
// Indicates if this stack came from a late write in nsTerminator.
bool mIsFromTerminatorWatchdog;
friend struct ::IPC::ParamTraits<CombinedStacks>;
};

View File

@ -30,7 +30,7 @@ namespace mozilla::Telemetry {
const size_t kMaxChromeStackDepth = 50;
ProcessedStack::ProcessedStack() : mIsFromTerminatorWatchdog(false){};
ProcessedStack::ProcessedStack() = default;
size_t ProcessedStack::GetStackSize() const { return mStack.size(); }
@ -56,15 +56,6 @@ void ProcessedStack::AddModule(const Module& aModule) {
mModules.push_back(aModule);
}
const bool& ProcessedStack::GetIsFromTerminatorWatchdog() const {
return mIsFromTerminatorWatchdog;
}
void ProcessedStack::SetIsFromTerminatorWatchdog(
const bool aIsFromTerminatorWatchdog) {
mIsFromTerminatorWatchdog = aIsFromTerminatorWatchdog;
}
void ProcessedStack::Clear() {
mModules.clear();
mStack.clear();

View File

@ -48,15 +48,12 @@ class ProcessedStack {
void AddFrame(const Frame& aFrame);
const Module& GetModule(unsigned aIndex) const;
void AddModule(const Module& aFrame);
const bool& GetIsFromTerminatorWatchdog() const;
void SetIsFromTerminatorWatchdog(const bool aIsFromTerminatorWatchdog);
void Clear();
private:
std::vector<Module> mModules;
std::vector<Frame> mStack;
bool mIsFromTerminatorWatchdog;
};
// Get the current list of loaded modules, filter and pair it to the provided

View File

@ -38,7 +38,6 @@ const STACK2 = [
[2, 10],
[3, 15],
];
const isFromTerminatorWatchdog = false;
// XXX The only error checking is for a zero-sized stack.
const STACK_BOGUS = [];
@ -83,8 +82,6 @@ function write_late_writes_file(stack, suffix) {
contents += element[0] + " " + element[1].toString(16) + "\n";
}
contents += isFromTerminatorWatchdog ? 1 : 0 + "\n";
write_string_to_file(file, contents);
}
@ -100,8 +97,6 @@ function run_test() {
Assert.equal(lateWrites.memoryMap.length, 0);
Assert.ok("stacks" in lateWrites);
Assert.equal(lateWrites.stacks.length, 0);
Assert.ok("isFromTerminatorWatchdog" in lateWrites);
Assert.equal(lateWrites.isFromTerminatorWatchdog, false);
do_test_pending();
Telemetry.asyncFetchTelemetryData(function() {
@ -143,9 +138,6 @@ function actual_test() {
return unevalCanonicalStack == obj;
};
}
Assert.ok("isFromTerminatorWatchdog" in lateWrites);
Assert.equal(lateWrites.isFromTerminatorWatchdog, false);
Assert.equal(uneval_STACKS.filter(stackChecker(first_stack)).length, 1);
Assert.equal(uneval_STACKS.filter(stackChecker(second_stack)).length, 1);

View File

@ -50,7 +50,6 @@
#include "mozilla/UniquePtr.h"
#include "mozilla/Unused.h"
#include "mozilla/Telemetry.h"
#include "mozilla/LateWriteChecks.h"
#include "mozilla/dom/workerinternals/RuntimeService.h"
@ -92,7 +91,6 @@ static ShutdownStep sShutdownSteps[] = {
};
Atomic<bool> sShutdownNotified;
Atomic<bool> sHasTerminatorLateWrite;
// Utility function: create a thread that is non-joinable,
// does not prevent the process from terminating, is never
@ -144,12 +142,6 @@ struct Options {
* How many ticks before we should crash the process.
*/
uint32_t crashAfterTicks;
/**
* A reduced number of ticks to crash earlier
* in the event we're hung indefinitely.
*/
uint32_t reportWritesAfterTicks;
};
/**
@ -162,7 +154,6 @@ void RunWatchdog(void* arg) {
// about.
UniquePtr<Options> options((Options*)arg);
uint32_t crashAfterTicks = options->crashAfterTicks;
uint32_t reportWritesAfterTicks = options->reportWritesAfterTicks;
options = nullptr;
const uint32_t timeToLive = crashAfterTicks;
@ -182,17 +173,8 @@ void RunWatchdog(void* arg) {
#else
usleep(1000000 /* usec */);
#endif
if (gHeartbeat++ < timeToLive) {
#if !defined(MOZ_VALGRIND) || !defined(MOZ_CODE_COVERAGE)
// We should not report if we are running on Valgrind because
// it is known to be much slower.
// We only also want to BeginLateWriteCheck once for the first
// time we exceed the reduced number of ticks.
if (gHeartbeat >= reportWritesAfterTicks && !sHasTerminatorLateWrite) {
sHasTerminatorLateWrite = true;
BeginLateWriteChecks();
}
#endif
continue;
}
@ -400,7 +382,6 @@ void nsTerminator::Start() {
#endif // !defined(NS_FREE_PERMANENT_DATA)
mInitialized = true;
sShutdownNotified = false;
sHasTerminatorLateWrite = false;
}
// Prepare, allocate and start the watchdog thread.
@ -409,10 +390,6 @@ void nsTerminator::StartWatchdog() {
int32_t crashAfterMS =
Preferences::GetInt("toolkit.asyncshutdown.crash_timeout",
FALLBACK_ASYNCSHUTDOWN_CRASH_AFTER_MS);
int32_t reducedCrashTimeoutMS =
Preferences::GetInt("toolkit.asyncshutdown.report_writes_after",
FALLBACK_ASYNCSHUTDOWN_CRASH_AFTER_MS);
// Ignore negative values
if (crashAfterMS <= 0) {
crashAfterMS = FALLBACK_ASYNCSHUTDOWN_CRASH_AFTER_MS;
@ -425,7 +402,6 @@ void nsTerminator::StartWatchdog() {
crashAfterMS = INT32_MAX;
} else {
crashAfterMS += ADDITIONAL_WAIT_BEFORE_CRASH_MS;
reducedCrashTimeoutMS += ADDITIONAL_WAIT_BEFORE_CRASH_MS;
}
#ifdef MOZ_VALGRIND
@ -450,11 +426,9 @@ void nsTerminator::StartWatchdog() {
UniquePtr<Options> options(new Options());
const PRIntervalTime ticksDuration = PR_MillisecondsToInterval(1000);
options->crashAfterTicks = crashAfterMS / ticksDuration;
options->reportWritesAfterTicks = reducedCrashTimeoutMS / ticksDuration;
// Handle systems where ticksDuration is greater than crashAfterMS.
if (options->crashAfterTicks == 0) {
options->crashAfterTicks = crashAfterMS / 1000;
options->reportWritesAfterTicks = reducedCrashTimeoutMS / 1000;
}
DebugOnly<PRThread*> watchdogThread =
@ -606,8 +580,6 @@ void nsTerminator::UpdateCrashReport(const char* aTopic) {
CrashReporter::Annotation::ShutdownProgress, report);
}
bool nsTerminator::IsCheckingLateWrites() { return sHasTerminatorLateWrite; }
void XPCOMShutdownNotified() {
MOZ_DIAGNOSTIC_ASSERT(sShutdownNotified == false);
sShutdownNotified = true;

View File

@ -18,7 +18,6 @@ class nsTerminator final : public nsIObserver {
NS_DECL_NSIOBSERVER
nsTerminator();
static bool IsCheckingLateWrites();
private:
nsresult SelfInit();

View File

@ -14,14 +14,10 @@
#include "mozilla/StaticPtr.h"
#include "mozilla/Telemetry.h"
#include "mozilla/Unused.h"
#include "mozilla/Mutex.h"
#include "nsAppDirectoryServiceDefs.h"
#include "nsDirectoryServiceUtils.h"
#include "nsLocalFile.h"
#include "nsPrintfCString.h"
#ifndef ANDROID
# include "nsTerminator.h"
#endif
#include "mozilla/StackWalk.h"
#include "plstr.h"
#include "prio.h"
@ -199,11 +195,6 @@ void LateWriteObserver::Observe(
sha1Stream.Printf("%d %x\n", frame.mModIndex, (unsigned)frame.mOffset);
}
#ifndef ANDROID
sha1Stream.Printf("%d\n", mozilla::nsTerminator::IsCheckingLateWrites());
#else
sha1Stream.Printf("%d\n", false);
#endif
mozilla::SHA1Sum::Hash sha1;
sha1Stream.Finish(sha1);
@ -225,7 +216,6 @@ void LateWriteObserver::Observe(
/******************************* Setup/Teardown *******************************/
static mozilla::StaticAutoPtr<LateWriteObserver> sLateWriteObserver;
mozilla::Mutex* mMutex = nullptr;
namespace mozilla {
@ -238,20 +228,13 @@ void InitLateWriteChecks() {
sLateWriteObserver = new LateWriteObserver(nativePath.get());
}
}
mMutex = new Mutex("LateWriteCheck::mMutex");
}
void BeginLateWriteChecks() {
if (mMutex) {
MutexAutoLock lock(*mMutex);
}
if (sLateWriteObserver) {
IOInterposer::Register(IOInterposeObserver::OpWriteFSync,
sLateWriteObserver);
}
delete mMutex;
mMutex = nullptr;
}
void StopLateWriteChecks() {