Bug 1442178 - Repair broken socket polling wakeup mechanism after a network change to prevent long load hangs, r=dragana

This commit is contained in:
Honza Bambas 2018-05-30 17:31:09 +03:00
parent fbc0928677
commit dab22ca3b9
5 changed files with 158 additions and 19 deletions

View File

@ -1770,6 +1770,7 @@ pref("network.http.focused_window_transaction_ratio", "0.9");
// Whether or not we give more priority to active tab. // Whether or not we give more priority to active tab.
// Note that this requires restart for changes to take effect. // Note that this requires restart for changes to take effect.
pref("network.http.active_tab_priority", true); pref("network.http.active_tab_priority", true);
// </http>
// default values for FTP // default values for FTP
// in a DSCP environment this should be 40 (0x28, or AF11), per RFC-4594, // in a DSCP environment this should be 40 (0x28, or AF11), per RFC-4594,
@ -1785,7 +1786,12 @@ pref("network.sts.max_time_for_events_between_two_polls", 100);
// During shutdown we limit PR_Close calls. If time exceeds this pref (in ms) // During shutdown we limit PR_Close calls. If time exceeds this pref (in ms)
// let sockets just leak. // let sockets just leak.
pref("network.sts.max_time_for_pr_close_during_shutdown", 5000); pref("network.sts.max_time_for_pr_close_during_shutdown", 5000);
// </http>
// When the polling socket pair we use to wake poll() up on demand doesn't
// get signalled (is not readable) within this timeout, we try to repair it.
// This timeout can be disabled by setting this pref to 0.
// The value is expected in seconds.
pref("network.sts.pollable_event_timeout", 6);
// 2147483647 == PR_INT32_MAX == ~2 GB // 2147483647 == PR_INT32_MAX == ~2 GB
pref("network.websocket.max-message-size", 2147483647); pref("network.websocket.max-message-size", 2147483647);

View File

@ -140,6 +140,8 @@ PollableEvent::PollableEvent()
: mWriteFD(nullptr) : mWriteFD(nullptr)
, mReadFD(nullptr) , mReadFD(nullptr)
, mSignaled(false) , mSignaled(false)
, mWriteFailed(false)
, mSignalTimestampAdjusted(false)
{ {
MOZ_COUNT_CTOR(PollableEvent); MOZ_COUNT_CTOR(PollableEvent);
MOZ_ASSERT(OnSocketThread(), "not on socket thread"); MOZ_ASSERT(OnSocketThread(), "not on socket thread");
@ -218,6 +220,7 @@ PollableEvent::PollableEvent()
// prime the system to deal with races invovled in [dc]tor cycle // prime the system to deal with races invovled in [dc]tor cycle
SOCKET_LOG(("PollableEvent() ctor ok\n")); SOCKET_LOG(("PollableEvent() ctor ok\n"));
mSignaled = true; mSignaled = true;
MarkFirstSignalTimestamp();
PR_Write(mWriteFD, "I", 1); PR_Write(mWriteFD, "I", 1);
} }
} }
@ -274,13 +277,20 @@ PollableEvent::Signal()
} }
#endif #endif
mSignaled = true; if (!mSignaled) {
mSignaled = true;
MarkFirstSignalTimestamp();
}
int32_t status = PR_Write(mWriteFD, "M", 1); int32_t status = PR_Write(mWriteFD, "M", 1);
SOCKET_LOG(("PollableEvent::Signal PR_Write %d\n", status)); SOCKET_LOG(("PollableEvent::Signal PR_Write %d\n", status));
if (status != 1) { if (status != 1) {
NS_WARNING("PollableEvent::Signal Failed\n"); NS_WARNING("PollableEvent::Signal Failed\n");
SOCKET_LOG(("PollableEvent::Signal Failed\n")); SOCKET_LOG(("PollableEvent::Signal Failed\n"));
mSignaled = false; mSignaled = false;
mWriteFailed = true;
} else {
mWriteFailed = false;
} }
return (status == 1); return (status == 1);
} }
@ -292,11 +302,21 @@ PollableEvent::Clear()
MOZ_ASSERT(OnSocketThread(), "not on socket thread"); MOZ_ASSERT(OnSocketThread(), "not on socket thread");
SOCKET_LOG(("PollableEvent::Clear\n")); SOCKET_LOG(("PollableEvent::Clear\n"));
if (!mFirstSignalAfterClear.IsNull()) {
SOCKET_LOG(("PollableEvent::Clear time to signal %ums",
(uint32_t)(TimeStamp::NowLoRes() - mFirstSignalAfterClear).ToMilliseconds()));
}
mFirstSignalAfterClear = TimeStamp();
mSignalTimestampAdjusted = false;
mSignaled = false; mSignaled = false;
if (!mReadFD) { if (!mReadFD) {
SOCKET_LOG(("PollableEvent::Clear mReadFD is null\n")); SOCKET_LOG(("PollableEvent::Clear mReadFD is null\n"));
return false; return false;
} }
char buf[2048]; char buf[2048];
int32_t status; int32_t status;
#ifdef XP_WIN #ifdef XP_WIN
@ -304,7 +324,7 @@ PollableEvent::Clear()
// do not have any deadlock read from the socket as much as we can. // do not have any deadlock read from the socket as much as we can.
while (true) { while (true) {
status = PR_Read(mReadFD, buf, 2048); status = PR_Read(mReadFD, buf, 2048);
SOCKET_LOG(("PollableEvent::Signal PR_Read %d\n", status)); SOCKET_LOG(("PollableEvent::Clear PR_Read %d\n", status));
if (status == 0) { if (status == 0) {
SOCKET_LOG(("PollableEvent::Clear EOF!\n")); SOCKET_LOG(("PollableEvent::Clear EOF!\n"));
return false; return false;
@ -321,7 +341,7 @@ PollableEvent::Clear()
} }
#else #else
status = PR_Read(mReadFD, buf, 2048); status = PR_Read(mReadFD, buf, 2048);
SOCKET_LOG(("PollableEvent::Signal PR_Read %d\n", status)); SOCKET_LOG(("PollableEvent::Clear PR_Read %d\n", status));
if (status == 1) { if (status == 1) {
return true; return true;
@ -345,5 +365,47 @@ PollableEvent::Clear()
#endif //XP_WIN #endif //XP_WIN
} }
void
PollableEvent::MarkFirstSignalTimestamp()
{
if (mFirstSignalAfterClear.IsNull()) {
SOCKET_LOG(("PollableEvent::MarkFirstSignalTimestamp"));
mFirstSignalAfterClear = TimeStamp::NowLoRes();
}
}
void
PollableEvent::AdjustFirstSignalTimestamp()
{
if (!mSignalTimestampAdjusted && !mFirstSignalAfterClear.IsNull()) {
SOCKET_LOG(("PollableEvent::AdjustFirstSignalTimestamp"));
mFirstSignalAfterClear = TimeStamp::NowLoRes();
mSignalTimestampAdjusted = true;
}
}
bool
PollableEvent::IsSignallingAlive(TimeDuration const& timeout)
{
if (mWriteFailed) {
return false;
}
#ifdef DEBUG
// The timeout would be just a disturbance in a debug build.
return true;
#else
if (!mSignaled || mFirstSignalAfterClear.IsNull() || timeout == TimeDuration()) {
return true;
}
TimeDuration delay = (TimeStamp::NowLoRes() - mFirstSignalAfterClear);
bool timedOut = delay > timeout;
return !timedOut;
#endif // DEBUG
}
} // namespace net } // namespace net
} // namespace mozilla } // namespace mozilla

View File

@ -8,6 +8,7 @@
#define PollableEvent_h__ #define PollableEvent_h__
#include "mozilla/Mutex.h" #include "mozilla/Mutex.h"
#include "mozilla/TimeStamp.h"
namespace mozilla { namespace mozilla {
namespace net { namespace net {
@ -21,15 +22,42 @@ public:
// Signal/Clear return false only if they fail // Signal/Clear return false only if they fail
bool Signal(); bool Signal();
// This is called only when we get non-null out_flags for the socket pair
bool Clear(); bool Clear();
bool Valid() { return mWriteFD && mReadFD; } bool Valid() { return mWriteFD && mReadFD; }
// We want to detect if writing to one of the socket pair sockets takes
// too long to be received by the other socket from the pair.
// Hence, we remember the timestamp of the earliest write by a call to
// MarkFirstSignalTimestamp() from Signal(). After waking up from poll()
// we check how long it took get the 'readable' signal on the socket pair.
void MarkFirstSignalTimestamp();
// Called right before we enter poll() to exclude any possible delay between
// the earlist call to Signal() and entering poll() caused by processing
// of events dispatched to the socket transport thread.
void AdjustFirstSignalTimestamp();
// This returns false on following conditions:
// - PR_Write has failed
// - no out_flags were signalled on the socket pair for too long after
// the earliest Signal()
bool IsSignallingAlive(TimeDuration const& timeout);
PRFileDesc *PollableFD() { return mReadFD; } PRFileDesc *PollableFD() { return mReadFD; }
private: private:
PRFileDesc *mWriteFD; PRFileDesc *mWriteFD;
PRFileDesc *mReadFD; PRFileDesc *mReadFD;
bool mSignaled; bool mSignaled;
// true when PR_Write to the socket pair has failed (status < 1)
bool mWriteFailed;
// Set true after AdjustFirstSignalTimestamp() was called
// Set false after Clear() was called
// Ensures shifting the timestamp before entering poll() only once
// between Clear()'ings.
bool mSignalTimestampAdjusted;
// Timestamp of the first call to Signal() (or time we enter poll())
// that happened after the last Clear() call
TimeStamp mFirstSignalAfterClear;
}; };
} // namespace net } // namespace net

View File

@ -53,6 +53,7 @@ static Atomic<PRThread*, Relaxed> gSocketThread;
#define MAX_TIME_BETWEEN_TWO_POLLS "network.sts.max_time_for_events_between_two_polls" #define MAX_TIME_BETWEEN_TWO_POLLS "network.sts.max_time_for_events_between_two_polls"
#define TELEMETRY_PREF "toolkit.telemetry.enabled" #define TELEMETRY_PREF "toolkit.telemetry.enabled"
#define MAX_TIME_FOR_PR_CLOSE_DURING_SHUTDOWN "network.sts.max_time_for_pr_close_during_shutdown" #define MAX_TIME_FOR_PR_CLOSE_DURING_SHUTDOWN "network.sts.max_time_for_pr_close_during_shutdown"
#define POLLABLE_EVENT_TIMEOUT "network.sts.pollable_event_timeout"
#define REPAIR_POLLABLE_EVENT_TIME 10 #define REPAIR_POLLABLE_EVENT_TIME 10
@ -135,6 +136,7 @@ nsSocketTransportService::nsSocketTransportService()
, mKeepaliveRetryIntervalS(1) , mKeepaliveRetryIntervalS(1)
, mKeepaliveProbeCount(kDefaultTCPKeepCount) , mKeepaliveProbeCount(kDefaultTCPKeepCount)
, mKeepaliveEnabledPref(false) , mKeepaliveEnabledPref(false)
, mPollableEventTimeout(TimeDuration::FromSeconds(6))
, mServingPendingQueue(false) , mServingPendingQueue(false)
, mMaxTimePerPollIter(100) , mMaxTimePerPollIter(100)
, mTelemetryEnabledPref(false) , mTelemetryEnabledPref(false)
@ -606,6 +608,7 @@ nsSocketTransportService::Init()
tmpPrefService->AddObserver(MAX_TIME_BETWEEN_TWO_POLLS, this, false); tmpPrefService->AddObserver(MAX_TIME_BETWEEN_TWO_POLLS, this, false);
tmpPrefService->AddObserver(TELEMETRY_PREF, this, false); tmpPrefService->AddObserver(TELEMETRY_PREF, this, false);
tmpPrefService->AddObserver(MAX_TIME_FOR_PR_CLOSE_DURING_SHUTDOWN, this, false); tmpPrefService->AddObserver(MAX_TIME_FOR_PR_CLOSE_DURING_SHUTDOWN, this, false);
tmpPrefService->AddObserver(POLLABLE_EVENT_TIMEOUT, this, false);
} }
UpdatePrefs(); UpdatePrefs();
@ -1159,6 +1162,20 @@ nsSocketTransportService::DoPollIteration(TimeDuration *pollDuration)
MoveToPollList(&mIdleList[i]); MoveToPollList(&mIdleList[i]);
} }
{
MutexAutoLock lock(mLock);
if (mPollableEvent) {
// we want to make sure the timeout is measured from the time
// we enter poll(). This method resets the timestamp to 'now',
// if we were first signalled between leaving poll() and here.
// If we didn't do this and processing events took longer than
// the allowed signal timeout, we would detect it as a
// false-positive. AdjustFirstSignalTimestamp is then a no-op
// until mPollableEvent->Clear() is called.
mPollableEvent->AdjustFirstSignalTimestamp();
}
}
SOCKET_LOG((" calling PR_Poll [active=%u idle=%u]\n", mActiveCount, mIdleCount)); SOCKET_LOG((" calling PR_Poll [active=%u idle=%u]\n", mActiveCount, mIdleCount));
#if defined(XP_WIN) #if defined(XP_WIN)
@ -1230,29 +1247,26 @@ nsSocketTransportService::DoPollIteration(TimeDuration *pollDuration)
DetachSocket(mActiveList, &mActiveList[i]); DetachSocket(mActiveList, &mActiveList[i]);
} }
if (n != 0 && (mPollList[0].out_flags & (PR_POLL_READ | PR_POLL_EXCEPT))) { {
MutexAutoLock lock(mLock); MutexAutoLock lock(mLock);
// acknowledge pollable event (should not block) // acknowledge pollable event (should not block)
if (mPollableEvent && if (n != 0 &&
((mPollList[0].out_flags & PR_POLL_EXCEPT) || (mPollList[0].out_flags & (PR_POLL_READ | PR_POLL_EXCEPT)) &&
!mPollableEvent->Clear())) { mPollableEvent &&
((mPollList[0].out_flags & PR_POLL_EXCEPT) || !mPollableEvent->Clear())) {
// On Windows, the TCP loopback connection in the // On Windows, the TCP loopback connection in the
// pollable event may become broken when a laptop // pollable event may become broken when a laptop
// switches between wired and wireless networks or // switches between wired and wireless networks or
// wakes up from hibernation. We try to create a // wakes up from hibernation. We try to create a
// new pollable event. If that fails, we fall back // new pollable event. If that fails, we fall back
// on "busy wait". // on "busy wait".
NS_WARNING("Trying to repair mPollableEvent"); TryRepairPollableEvent();
mPollableEvent.reset(new PollableEvent()); }
if (!mPollableEvent->Valid()) {
mPollableEvent = nullptr; if (mPollableEvent &&
} !mPollableEvent->IsSignallingAlive(mPollableEventTimeout)) {
SOCKET_LOG(("running socket transport thread without " SOCKET_LOG(("Pollable event signalling failed/timed out"));
"a pollable event now valid=%d", !!mPollableEvent)); TryRepairPollableEvent();
mPollList[0].fd = mPollableEvent ? mPollableEvent->PollableFD() : nullptr;
mPollList[0].in_flags = PR_POLL_READ | PR_POLL_EXCEPT;
mPollList[0].out_flags = 0;
} }
} }
} }
@ -1335,6 +1349,14 @@ nsSocketTransportService::UpdatePrefs()
if (NS_SUCCEEDED(rv) && maxTimeForPrClosePref >=0) { if (NS_SUCCEEDED(rv) && maxTimeForPrClosePref >=0) {
mMaxTimeForPrClosePref = PR_MillisecondsToInterval(maxTimeForPrClosePref); mMaxTimeForPrClosePref = PR_MillisecondsToInterval(maxTimeForPrClosePref);
} }
int32_t pollableEventTimeout;
rv = tmpPrefService->GetIntPref(POLLABLE_EVENT_TIMEOUT,
&pollableEventTimeout);
if (NS_SUCCEEDED(rv) && pollableEventTimeout >= 0) {
MutexAutoLock lock(mLock);
mPollableEventTimeout = TimeDuration::FromSeconds(pollableEventTimeout);
}
} }
return NS_OK; return NS_OK;
@ -1695,7 +1717,24 @@ nsSocketTransportService::EndPolling()
mPollRepairTimer->Cancel(); mPollRepairTimer->Cancel();
} }
} }
#endif #endif
void nsSocketTransportService::TryRepairPollableEvent()
{
mLock.AssertCurrentThreadOwns();
NS_WARNING("Trying to repair mPollableEvent");
mPollableEvent.reset(new PollableEvent());
if (!mPollableEvent->Valid()) {
mPollableEvent = nullptr;
}
SOCKET_LOG(("running socket transport thread without "
"a pollable event now valid=%d", !!mPollableEvent));
mPollList[0].fd = mPollableEvent ? mPollableEvent->PollableFD() : nullptr;
mPollList[0].in_flags = PR_POLL_READ | PR_POLL_EXCEPT;
mPollList[0].out_flags = 0;
}
} // namespace net } // namespace net
} // namespace mozilla } // namespace mozilla

View File

@ -247,6 +247,8 @@ private:
int32_t mKeepaliveProbeCount; int32_t mKeepaliveProbeCount;
// True if TCP keepalive is enabled globally. // True if TCP keepalive is enabled globally.
bool mKeepaliveEnabledPref; bool mKeepaliveEnabledPref;
// Timeout of pollable event signalling.
TimeDuration mPollableEventTimeout;
Atomic<bool> mServingPendingQueue; Atomic<bool> mServingPendingQueue;
Atomic<int32_t, Relaxed> mMaxTimePerPollIter; Atomic<int32_t, Relaxed> mMaxTimePerPollIter;
@ -285,6 +287,8 @@ private:
void StartPolling(); void StartPolling();
void EndPolling(); void EndPolling();
#endif #endif
void TryRepairPollableEvent();
}; };
extern nsSocketTransportService *gSocketTransportService; extern nsSocketTransportService *gSocketTransportService;