Bug 1442178 - Repair broken socket polling wakeup mechanism after a network change to prevent long load hangs, r=dragana

This commit is contained in:
Honza Bambas 2018-05-30 17:31:09 +03:00
parent fbc0928677
commit dab22ca3b9
5 changed files with 158 additions and 19 deletions

View File

@ -1770,6 +1770,7 @@ pref("network.http.focused_window_transaction_ratio", "0.9");
// Whether or not we give more priority to active tab.
// Note that this requires restart for changes to take effect.
pref("network.http.active_tab_priority", true);
// </http>
// default values for FTP
// in a DSCP environment this should be 40 (0x28, or AF11), per RFC-4594,
@ -1785,7 +1786,12 @@ pref("network.sts.max_time_for_events_between_two_polls", 100);
// During shutdown we limit PR_Close calls. If time exceeds this pref (in ms)
// let sockets just leak.
pref("network.sts.max_time_for_pr_close_during_shutdown", 5000);
// </http>
// When the polling socket pair we use to wake poll() up on demand doesn't
// get signalled (is not readable) within this timeout, we try to repair it.
// This timeout can be disabled by setting this pref to 0.
// The value is expected in seconds.
pref("network.sts.pollable_event_timeout", 6);
// 2147483647 == PR_INT32_MAX == ~2 GB
pref("network.websocket.max-message-size", 2147483647);

View File

@ -140,6 +140,8 @@ PollableEvent::PollableEvent()
: mWriteFD(nullptr)
, mReadFD(nullptr)
, mSignaled(false)
, mWriteFailed(false)
, mSignalTimestampAdjusted(false)
{
MOZ_COUNT_CTOR(PollableEvent);
MOZ_ASSERT(OnSocketThread(), "not on socket thread");
@ -218,6 +220,7 @@ PollableEvent::PollableEvent()
// prime the system to deal with races invovled in [dc]tor cycle
SOCKET_LOG(("PollableEvent() ctor ok\n"));
mSignaled = true;
MarkFirstSignalTimestamp();
PR_Write(mWriteFD, "I", 1);
}
}
@ -274,13 +277,20 @@ PollableEvent::Signal()
}
#endif
mSignaled = true;
if (!mSignaled) {
mSignaled = true;
MarkFirstSignalTimestamp();
}
int32_t status = PR_Write(mWriteFD, "M", 1);
SOCKET_LOG(("PollableEvent::Signal PR_Write %d\n", status));
if (status != 1) {
NS_WARNING("PollableEvent::Signal Failed\n");
SOCKET_LOG(("PollableEvent::Signal Failed\n"));
mSignaled = false;
mWriteFailed = true;
} else {
mWriteFailed = false;
}
return (status == 1);
}
@ -292,11 +302,21 @@ PollableEvent::Clear()
MOZ_ASSERT(OnSocketThread(), "not on socket thread");
SOCKET_LOG(("PollableEvent::Clear\n"));
if (!mFirstSignalAfterClear.IsNull()) {
SOCKET_LOG(("PollableEvent::Clear time to signal %ums",
(uint32_t)(TimeStamp::NowLoRes() - mFirstSignalAfterClear).ToMilliseconds()));
}
mFirstSignalAfterClear = TimeStamp();
mSignalTimestampAdjusted = false;
mSignaled = false;
if (!mReadFD) {
SOCKET_LOG(("PollableEvent::Clear mReadFD is null\n"));
return false;
}
char buf[2048];
int32_t status;
#ifdef XP_WIN
@ -304,7 +324,7 @@ PollableEvent::Clear()
// do not have any deadlock read from the socket as much as we can.
while (true) {
status = PR_Read(mReadFD, buf, 2048);
SOCKET_LOG(("PollableEvent::Signal PR_Read %d\n", status));
SOCKET_LOG(("PollableEvent::Clear PR_Read %d\n", status));
if (status == 0) {
SOCKET_LOG(("PollableEvent::Clear EOF!\n"));
return false;
@ -321,7 +341,7 @@ PollableEvent::Clear()
}
#else
status = PR_Read(mReadFD, buf, 2048);
SOCKET_LOG(("PollableEvent::Signal PR_Read %d\n", status));
SOCKET_LOG(("PollableEvent::Clear PR_Read %d\n", status));
if (status == 1) {
return true;
@ -345,5 +365,47 @@ PollableEvent::Clear()
#endif //XP_WIN
}
void
PollableEvent::MarkFirstSignalTimestamp()
{
if (mFirstSignalAfterClear.IsNull()) {
SOCKET_LOG(("PollableEvent::MarkFirstSignalTimestamp"));
mFirstSignalAfterClear = TimeStamp::NowLoRes();
}
}
void
PollableEvent::AdjustFirstSignalTimestamp()
{
if (!mSignalTimestampAdjusted && !mFirstSignalAfterClear.IsNull()) {
SOCKET_LOG(("PollableEvent::AdjustFirstSignalTimestamp"));
mFirstSignalAfterClear = TimeStamp::NowLoRes();
mSignalTimestampAdjusted = true;
}
}
bool
PollableEvent::IsSignallingAlive(TimeDuration const& timeout)
{
if (mWriteFailed) {
return false;
}
#ifdef DEBUG
// The timeout would be just a disturbance in a debug build.
return true;
#else
if (!mSignaled || mFirstSignalAfterClear.IsNull() || timeout == TimeDuration()) {
return true;
}
TimeDuration delay = (TimeStamp::NowLoRes() - mFirstSignalAfterClear);
bool timedOut = delay > timeout;
return !timedOut;
#endif // DEBUG
}
} // namespace net
} // namespace mozilla

View File

@ -8,6 +8,7 @@
#define PollableEvent_h__
#include "mozilla/Mutex.h"
#include "mozilla/TimeStamp.h"
namespace mozilla {
namespace net {
@ -21,15 +22,42 @@ public:
// Signal/Clear return false only if they fail
bool Signal();
// This is called only when we get non-null out_flags for the socket pair
bool Clear();
bool Valid() { return mWriteFD && mReadFD; }
// We want to detect if writing to one of the socket pair sockets takes
// too long to be received by the other socket from the pair.
// Hence, we remember the timestamp of the earliest write by a call to
// MarkFirstSignalTimestamp() from Signal(). After waking up from poll()
// we check how long it took get the 'readable' signal on the socket pair.
void MarkFirstSignalTimestamp();
// Called right before we enter poll() to exclude any possible delay between
// the earlist call to Signal() and entering poll() caused by processing
// of events dispatched to the socket transport thread.
void AdjustFirstSignalTimestamp();
// This returns false on following conditions:
// - PR_Write has failed
// - no out_flags were signalled on the socket pair for too long after
// the earliest Signal()
bool IsSignallingAlive(TimeDuration const& timeout);
PRFileDesc *PollableFD() { return mReadFD; }
private:
PRFileDesc *mWriteFD;
PRFileDesc *mReadFD;
bool mSignaled;
// true when PR_Write to the socket pair has failed (status < 1)
bool mWriteFailed;
// Set true after AdjustFirstSignalTimestamp() was called
// Set false after Clear() was called
// Ensures shifting the timestamp before entering poll() only once
// between Clear()'ings.
bool mSignalTimestampAdjusted;
// Timestamp of the first call to Signal() (or time we enter poll())
// that happened after the last Clear() call
TimeStamp mFirstSignalAfterClear;
};
} // namespace net

View File

@ -53,6 +53,7 @@ static Atomic<PRThread*, Relaxed> gSocketThread;
#define MAX_TIME_BETWEEN_TWO_POLLS "network.sts.max_time_for_events_between_two_polls"
#define TELEMETRY_PREF "toolkit.telemetry.enabled"
#define MAX_TIME_FOR_PR_CLOSE_DURING_SHUTDOWN "network.sts.max_time_for_pr_close_during_shutdown"
#define POLLABLE_EVENT_TIMEOUT "network.sts.pollable_event_timeout"
#define REPAIR_POLLABLE_EVENT_TIME 10
@ -135,6 +136,7 @@ nsSocketTransportService::nsSocketTransportService()
, mKeepaliveRetryIntervalS(1)
, mKeepaliveProbeCount(kDefaultTCPKeepCount)
, mKeepaliveEnabledPref(false)
, mPollableEventTimeout(TimeDuration::FromSeconds(6))
, mServingPendingQueue(false)
, mMaxTimePerPollIter(100)
, mTelemetryEnabledPref(false)
@ -606,6 +608,7 @@ nsSocketTransportService::Init()
tmpPrefService->AddObserver(MAX_TIME_BETWEEN_TWO_POLLS, this, false);
tmpPrefService->AddObserver(TELEMETRY_PREF, this, false);
tmpPrefService->AddObserver(MAX_TIME_FOR_PR_CLOSE_DURING_SHUTDOWN, this, false);
tmpPrefService->AddObserver(POLLABLE_EVENT_TIMEOUT, this, false);
}
UpdatePrefs();
@ -1159,6 +1162,20 @@ nsSocketTransportService::DoPollIteration(TimeDuration *pollDuration)
MoveToPollList(&mIdleList[i]);
}
{
MutexAutoLock lock(mLock);
if (mPollableEvent) {
// we want to make sure the timeout is measured from the time
// we enter poll(). This method resets the timestamp to 'now',
// if we were first signalled between leaving poll() and here.
// If we didn't do this and processing events took longer than
// the allowed signal timeout, we would detect it as a
// false-positive. AdjustFirstSignalTimestamp is then a no-op
// until mPollableEvent->Clear() is called.
mPollableEvent->AdjustFirstSignalTimestamp();
}
}
SOCKET_LOG((" calling PR_Poll [active=%u idle=%u]\n", mActiveCount, mIdleCount));
#if defined(XP_WIN)
@ -1230,29 +1247,26 @@ nsSocketTransportService::DoPollIteration(TimeDuration *pollDuration)
DetachSocket(mActiveList, &mActiveList[i]);
}
if (n != 0 && (mPollList[0].out_flags & (PR_POLL_READ | PR_POLL_EXCEPT))) {
{
MutexAutoLock lock(mLock);
// acknowledge pollable event (should not block)
if (mPollableEvent &&
((mPollList[0].out_flags & PR_POLL_EXCEPT) ||
!mPollableEvent->Clear())) {
if (n != 0 &&
(mPollList[0].out_flags & (PR_POLL_READ | PR_POLL_EXCEPT)) &&
mPollableEvent &&
((mPollList[0].out_flags & PR_POLL_EXCEPT) || !mPollableEvent->Clear())) {
// On Windows, the TCP loopback connection in the
// pollable event may become broken when a laptop
// switches between wired and wireless networks or
// wakes up from hibernation. We try to create a
// new pollable event. If that fails, we fall back
// on "busy wait".
NS_WARNING("Trying to repair mPollableEvent");
mPollableEvent.reset(new PollableEvent());
if (!mPollableEvent->Valid()) {
mPollableEvent = nullptr;
}
SOCKET_LOG(("running socket transport thread without "
"a pollable event now valid=%d", !!mPollableEvent));
mPollList[0].fd = mPollableEvent ? mPollableEvent->PollableFD() : nullptr;
mPollList[0].in_flags = PR_POLL_READ | PR_POLL_EXCEPT;
mPollList[0].out_flags = 0;
TryRepairPollableEvent();
}
if (mPollableEvent &&
!mPollableEvent->IsSignallingAlive(mPollableEventTimeout)) {
SOCKET_LOG(("Pollable event signalling failed/timed out"));
TryRepairPollableEvent();
}
}
}
@ -1335,6 +1349,14 @@ nsSocketTransportService::UpdatePrefs()
if (NS_SUCCEEDED(rv) && maxTimeForPrClosePref >=0) {
mMaxTimeForPrClosePref = PR_MillisecondsToInterval(maxTimeForPrClosePref);
}
int32_t pollableEventTimeout;
rv = tmpPrefService->GetIntPref(POLLABLE_EVENT_TIMEOUT,
&pollableEventTimeout);
if (NS_SUCCEEDED(rv) && pollableEventTimeout >= 0) {
MutexAutoLock lock(mLock);
mPollableEventTimeout = TimeDuration::FromSeconds(pollableEventTimeout);
}
}
return NS_OK;
@ -1695,7 +1717,24 @@ nsSocketTransportService::EndPolling()
mPollRepairTimer->Cancel();
}
}
#endif
void nsSocketTransportService::TryRepairPollableEvent()
{
mLock.AssertCurrentThreadOwns();
NS_WARNING("Trying to repair mPollableEvent");
mPollableEvent.reset(new PollableEvent());
if (!mPollableEvent->Valid()) {
mPollableEvent = nullptr;
}
SOCKET_LOG(("running socket transport thread without "
"a pollable event now valid=%d", !!mPollableEvent));
mPollList[0].fd = mPollableEvent ? mPollableEvent->PollableFD() : nullptr;
mPollList[0].in_flags = PR_POLL_READ | PR_POLL_EXCEPT;
mPollList[0].out_flags = 0;
}
} // namespace net
} // namespace mozilla

View File

@ -247,6 +247,8 @@ private:
int32_t mKeepaliveProbeCount;
// True if TCP keepalive is enabled globally.
bool mKeepaliveEnabledPref;
// Timeout of pollable event signalling.
TimeDuration mPollableEventTimeout;
Atomic<bool> mServingPendingQueue;
Atomic<int32_t, Relaxed> mMaxTimePerPollIter;
@ -285,6 +287,8 @@ private:
void StartPolling();
void EndPolling();
#endif
void TryRepairPollableEvent();
};
extern nsSocketTransportService *gSocketTransportService;