Bug 489820 part 2 - Grow buffers to the worst-case size before tokenizing; fix comments. r=wchen.

This commit is contained in:
Henri Sivonen 2015-08-25 18:05:45 +03:00
parent 9cb0eb6ae0
commit 02d517f7ac
16 changed files with 231 additions and 52 deletions

View File

@ -53,6 +53,12 @@ struct jArray {
jArray<T,L> newArray = { new T[len], len };
return newArray;
}
static jArray<T,L> newFallibleJArray(L const len) {
MOZ_ASSERT(len >= 0, "Negative length.");
T* a = new (mozilla::fallible) T[len];
jArray<T,L> newArray = { a, a ? len : 0 };
return newArray;
}
operator T*() { return arr; }
T& operator[] (L const index) {
MOZ_ASSERT(index >= 0, "Array access with negative index.");

View File

@ -53,7 +53,7 @@ import org.xml.sax.SAXParseException;
/**
* An implementation of
* http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html
* https://html.spec.whatwg.org/multipage/syntax.html#tokenization
*
* This class implements the <code>Locator</code> interface. This is not an
* incidental implementation detail: Users of this class are encouraged to make
@ -262,11 +262,6 @@ public class Tokenizer implements Locator {
*/
private static final @NoLength char[] LF = { '\n' };
/**
* Buffer growth parameter.
*/
private static final int BUFFER_GROW_BY = 1024;
/**
* "CDATA[" as <code>char[]</code>
*/
@ -442,7 +437,7 @@ public class Tokenizer implements Locator {
protected boolean html4;
/**
* Whether the stream is past the first 512 bytes.
* Whether the stream is past the first 1024 bytes.
*/
private boolean metaBoundaryPassed;
@ -842,37 +837,32 @@ public class Tokenizer implements Locator {
}
/**
* Appends to the smaller buffer.
* Appends to the buffer.
*
* @param c
* the UTF-16 code unit to append
*/
private void appendStrBuf(char c) {
if (strBufLen == strBuf.length) {
char[] newBuf = new char[strBuf.length + Tokenizer.BUFFER_GROW_BY];
System.arraycopy(strBuf, 0, newBuf, 0, strBuf.length);
strBuf = newBuf;
}
strBuf[strBufLen++] = c;
}
/**
* The smaller buffer as a String. Currently only used for error reporting.
* The buffer as a String. Currently only used for error reporting.
*
* <p>
* C++ memory note: The return value must be released.
*
* @return the smaller buffer as a string
* @return the buffer as a string
*/
protected String strBufToString() {
return Portability.newStringFromBuffer(strBuf, 0, strBufLen);
}
/**
* Returns the short buffer as a local name. The return value is released in
* Returns the buffer as a local name. The return value is released in
* emitDoctypeToken().
*
* @return the smaller buffer as local name
* @return the buffer as local name
*/
private void strBufToDoctypeName() {
doctypeName = Portability.newLocalNameFromBuffer(strBuf, 0, strBufLen,
@ -880,7 +870,7 @@ public class Tokenizer implements Locator {
}
/**
* Emits the smaller buffer as character tokens.
* Emits the buffer as character tokens.
*
* @throws SAXException
* if the token handler threw
@ -962,7 +952,7 @@ public class Tokenizer implements Locator {
}
/**
* Append the contents of the smaller buffer to the larger one.
* Append the contents of the char reference buffer to the main one.
*/
@Inline private void appendCharRefBufToStrBuf() {
appendStrBuf(charRefBuf, 0, charRefBufLen);
@ -1260,6 +1250,13 @@ public class Tokenizer implements Locator {
lastCR = false;
int start = buffer.getStart();
int end = buffer.getEnd();
// In C++, the caller of tokenizeBuffer needs to do this explicitly.
// [NOCPP[
ensureBufferSpace(end - start);
// ]NOCPP]
/**
* The index of the last <code>char</code> read from <code>buf</code>.
*/
@ -1310,9 +1307,9 @@ public class Tokenizer implements Locator {
// CPPONLY: }
// [NOCPP[
pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState,
buffer.getEnd());
end);
// ]NOCPP]
if (pos == buffer.getEnd()) {
if (pos == end) {
// exiting due to end of buffer
buffer.setStart(pos);
} else {
@ -1321,6 +1318,32 @@ public class Tokenizer implements Locator {
return lastCR;
}
// [NOCPP[
private void ensureBufferSpace(int inputLength) throws SAXException {
// Add 2 to account for emissions of LT_GT, LT_SOLIDUS and RSQB_RSQB.
// Adding to the general worst case instead of only the
// TreeBuilder-exposed worst case to avoid re-introducing a bug when
// unifying the tokenizer and tree builder buffers in the future.
int worstCase = strBufLen + inputLength + charRefBufLen + 2;
tokenHandler.ensureBufferSpace(worstCase);
if (strBuf == null) {
// Add an arbitrary small value to avoid immediate reallocation
// once there are a few characters in the buffer.
strBuf = new char[worstCase + 128];
} else if (worstCase > strBuf.length) {
// HotSpot reportedly allocates memory with 8-byte accuracy, so
// there's no point in trying to do math here to avoid slop.
// Maybe we should add some small constant to worstCase here
// but not doing that without profiling. In C++ with jemalloc,
// the corresponding method should do math to round up here
// to avoid slop.
char[] newBuf = new char[worstCase];
System.arraycopy(strBuf, 0, newBuf, 0, strBufLen);
strBuf = newBuf;
}
}
// ]NOCPP]
@SuppressWarnings("unused") private int stateLoop(int state, char c,
int pos, @NoLength char[] buf, boolean reconsume, int returnState,
int endPos) throws SAXException {
@ -6710,7 +6733,7 @@ public class Tokenizer implements Locator {
public void initializeWithoutStarting() throws SAXException {
confident = false;
strBuf = new char[1024];
strBuf = null;
line = 1;
// [NOCPP[
html4 = false;

View File

@ -602,7 +602,7 @@ public abstract class TreeBuilder<T> implements TokenHandler,
// ]NOCPP]
start(fragment);
charBufferLen = 0;
charBuffer = new char[1024];
charBuffer = null;
framesetOk = true;
if (fragment) {
T elt;
@ -5594,14 +5594,30 @@ public abstract class TreeBuilder<T> implements TokenHandler,
private final void accumulateCharactersForced(@Const @NoLength char[] buf,
int start, int length) throws SAXException {
int newLen = charBufferLen + length;
if (newLen > charBuffer.length) {
char[] newBuf = new char[newLen];
System.arraycopy(buf, start, charBuffer, charBufferLen, length);
charBufferLen += length;
}
@Override public void ensureBufferSpace(int inputLength)
throws SAXException {
// TODO: Unify Tokenizer.strBuf and TreeBuilder.charBuffer so that
// this method becomes unnecessary.
int worstCase = charBufferLen + inputLength;
if (charBuffer == null) {
// Add an arbitrary small value to avoid immediate reallocation
// once there are a few characters in the buffer.
charBuffer = new char[worstCase + 128];
} else if (worstCase > charBuffer.length) {
// HotSpot reportedly allocates memory with 8-byte accuracy, so
// there's no point in trying to do math here to avoid slop.
// Maybe we should add some small constant to worstCase here
// but not doing that without profiling. In C++ with jemalloc,
// the corresponding method should do math to round up here
// to avoid slop.
char[] newBuf = new char[worstCase];
System.arraycopy(charBuffer, 0, newBuf, 0, charBufferLen);
charBuffer = newBuf;
}
System.arraycopy(buf, start, charBuffer, charBufferLen, length);
charBufferLen = newLen;
}
// ]NOCPP]

View File

@ -117,6 +117,15 @@ public final class UTF16Buffer {
return start < end;
}
/**
* Returns <code>end - start</code>.
*
* @return <code>end - start</code>
*/
public int getLength() {
return end - start;
}
/**
* Adjusts the start index to skip over the first character if it is a line
* feed and the previous character was a carriage return.

View File

@ -370,6 +370,9 @@ nsHtml5Parser::Parse(const nsAString& aSourceBuffer,
lineNumberSave = mTokenizer->getLineNumber();
}
if (!mTokenizer->EnsureBufferSpace(stackBuffer.getLength())) {
return mExecutor->MarkAsBroken(NS_ERROR_OUT_OF_MEMORY);
}
mLastWasCR = mTokenizer->tokenizeBuffer(&stackBuffer);
if (inRootContext) {
@ -476,6 +479,10 @@ nsHtml5Parser::Parse(const nsAString& aSourceBuffer,
while (stackBuffer.hasMore()) {
stackBuffer.adjust(mDocWriteSpeculativeLastWasCR);
if (stackBuffer.hasMore()) {
if (!mDocWriteSpeculativeTokenizer->EnsureBufferSpace(
stackBuffer.getLength())) {
return mExecutor->MarkAsBroken(NS_ERROR_OUT_OF_MEMORY);
}
mDocWriteSpeculativeLastWasCR =
mDocWriteSpeculativeTokenizer->tokenizeBuffer(&stackBuffer);
}
@ -613,16 +620,19 @@ nsHtml5Parser::ParseUntilBlocked()
return NS_OK;
}
if (mDocumentClosed) {
nsresult rv;
NS_ASSERTION(!GetStreamParser(),
"This should only happen with script-created parser.");
mTokenizer->eof();
mTreeBuilder->StreamEnded();
if (NS_SUCCEEDED((rv = mExecutor->IsBroken()))) {
mTokenizer->eof();
mTreeBuilder->StreamEnded();
}
mTreeBuilder->Flush();
mExecutor->FlushDocumentWrite();
// The below call does memory cleanup, so call it even if the
// parser has been marked as broken.
mTokenizer->end();
return NS_OK;
return rv;
}
// never release the last buffer.
NS_ASSERTION(!mLastBuffer->getStart() && !mLastBuffer->getEnd(),
@ -662,6 +672,9 @@ nsHtml5Parser::ParseUntilBlocked()
if (inRootContext) {
mTokenizer->setLineNumber(mRootContextLineNumber);
}
if (!mTokenizer->EnsureBufferSpace(mFirstBuffer->getLength())) {
return mExecutor->MarkAsBroken(NS_ERROR_OUT_OF_MEMORY);
}
mLastWasCR = mTokenizer->tokenizeBuffer(mFirstBuffer);
if (inRootContext) {
mRootContextLineNumber = mTokenizer->getLineNumber();

View File

@ -890,7 +890,10 @@ nsHtml5StreamParser::OnStartRequest(nsIRequest* aRequest, nsISupports* aContext)
mTreeBuilder->StartPlainText();
mTokenizer->StartPlainText();
} else if (mMode == VIEW_SOURCE_PLAIN) {
mTreeBuilder->StartPlainTextViewSource(NS_ConvertUTF8toUTF16(mViewSourceTitle));
nsAutoString viewSourceTitle;
CopyUTF8toUTF16(mViewSourceTitle, viewSourceTitle);
mTreeBuilder->EnsureBufferSpace(viewSourceTitle.Length());
mTreeBuilder->StartPlainTextViewSource(viewSourceTitle);
mTokenizer->StartPlainText();
}
@ -1364,10 +1367,12 @@ nsHtml5StreamParser::ParseAvailableData()
0);
}
}
mTokenizer->eof();
mTreeBuilder->StreamEnded();
if (mMode == VIEW_SOURCE_HTML || mMode == VIEW_SOURCE_XML) {
mTokenizer->EndViewSource();
if (NS_SUCCEEDED(mTreeBuilder->IsBroken())) {
mTokenizer->eof();
mTreeBuilder->StreamEnded();
if (mMode == VIEW_SOURCE_HTML || mMode == VIEW_SOURCE_XML) {
mTokenizer->EndViewSource();
}
}
FlushTreeOpsAndDisarmTimer();
return; // no more data and not expecting more
@ -1384,6 +1389,10 @@ nsHtml5StreamParser::ParseAvailableData()
mFirstBuffer->adjust(mLastWasCR);
mLastWasCR = false;
if (mFirstBuffer->hasMore()) {
if (!mTokenizer->EnsureBufferSpace(mFirstBuffer->getLength())) {
MarkAsBroken(NS_ERROR_OUT_OF_MEMORY);
return;
}
mLastWasCR = mTokenizer->tokenizeBuffer(mFirstBuffer);
// At this point, internalEncodingDeclaration() may have called
// Terminate, but that never happens together with script.

View File

@ -109,6 +109,10 @@ nsHtml5StringParser::Tokenize(const nsAString& aSourceBuffer,
buffer.adjust(lastWasCR);
lastWasCR = false;
if (buffer.hasMore()) {
if (!mTokenizer->EnsureBufferSpace(buffer.getLength())) {
rv = mBuilder->MarkAsBroken(NS_ERROR_OUT_OF_MEMORY);
break;
}
lastWasCR = mTokenizer->tokenizeBuffer(&buffer);
if (NS_FAILED(rv = mBuilder->IsBroken())) {
break;
@ -116,7 +120,9 @@ nsHtml5StringParser::Tokenize(const nsAString& aSourceBuffer,
}
}
}
mTokenizer->eof();
if (NS_SUCCEEDED(rv)) {
mTokenizer->eof();
}
mTokenizer->end();
mBuilder->Finish();
mAtomTable.Clear();

View File

@ -223,11 +223,6 @@ nsHtml5Tokenizer::emitOrAppendCharRefBuf(int32_t returnState)
void
nsHtml5Tokenizer::appendStrBuf(char16_t c)
{
if (strBufLen == strBuf.length) {
jArray<char16_t,int32_t> newBuf = jArray<char16_t,int32_t>::newJArray(strBuf.length + NS_HTML5TOKENIZER_BUFFER_GROW_BY);
nsHtml5ArrayCopy::arraycopy(strBuf, newBuf, strBuf.length);
strBuf = newBuf;
}
strBuf[strBufLen++] = c;
}
@ -374,6 +369,7 @@ nsHtml5Tokenizer::tokenizeBuffer(nsHtml5UTF16Buffer* buffer)
shouldSuspend = false;
lastCR = false;
int32_t start = buffer->getStart();
int32_t end = buffer->getEnd();
int32_t pos = start - 1;
switch(state) {
case NS_HTML5TOKENIZER_DATA:
@ -408,7 +404,7 @@ nsHtml5Tokenizer::tokenizeBuffer(nsHtml5UTF16Buffer* buffer)
} else {
pos = stateLoop<nsHtml5SilentPolicy>(state, c, pos, buffer->getBuffer(), false, returnState, buffer->getEnd());
}
if (pos == buffer->getEnd()) {
if (pos == end) {
buffer->setStart(pos);
} else {
buffer->setStart(pos + 1);
@ -4045,7 +4041,7 @@ void
nsHtml5Tokenizer::initializeWithoutStarting()
{
confident = false;
strBuf = jArray<char16_t,int32_t>::newJArray(1024);
strBuf = nullptr;
line = 1;
resetToDataState();
}

View File

@ -367,7 +367,6 @@ class nsHtml5Tokenizer
#define NS_HTML5TOKENIZER_PROCESSING_INSTRUCTION 73
#define NS_HTML5TOKENIZER_PROCESSING_INSTRUCTION_QUESTION_MARK 74
#define NS_HTML5TOKENIZER_LEAD_OFFSET (0xD800 - (0x10000 >> 10))
#define NS_HTML5TOKENIZER_BUFFER_GROW_BY 1024
#endif

View File

@ -4,6 +4,44 @@
#include "mozilla/Likely.h"
bool
nsHtml5Tokenizer::EnsureBufferSpace(int32_t aLength)
{
MOZ_ASSERT(aLength >= 0, "Negative length.");
// Add 2 to account for emissions of LT_GT, LT_SOLIDUS and RSQB_RSQB.
// Adding to the general worst case instead of only the
// TreeBuilder-exposed worst case to avoid re-introducing a bug when
// unifying the tokenizer and tree builder buffers in the future.
size_t worstCase = size_t(strBufLen) +
size_t(aLength) +
size_t(charRefBufLen) +
size_t(2);
if (worstCase > INT32_MAX) {
// Since we index into the buffer using int32_t due to the Java heritage
// of the code, let's treat this as OOM.
return false;
}
// TODO: Unify nsHtml5Tokenizer::strBuf and nsHtml5TreeBuilder::charBuffer
// so that the call below becomes unnecessary.
tokenHandler->EnsureBufferSpace(worstCase);
if (!strBuf) {
// Add one to round to the next power of two to avoid immediate
// reallocation once there are a few characters in the buffer.
strBuf = jArray<char16_t,int32_t>::newFallibleJArray(mozilla::RoundUpPow2(worstCase + 1));
if (!strBuf) {
return false;
}
} else if (worstCase > size_t(strBuf.length)) {
jArray<char16_t,int32_t> newBuf = jArray<char16_t,int32_t>::newFallibleJArray(mozilla::RoundUpPow2(worstCase));
if (!newBuf) {
return false;
}
memcpy(newBuf,strBuf, sizeof(char16_t) * strBufLen);
strBuf = newBuf;
}
return true;
}
void
nsHtml5Tokenizer::StartPlainText()
{

View File

@ -7,6 +7,16 @@ inline nsHtml5HtmlAttributes* GetAttributes()
return attributes;
}
/**
* Makes sure the buffers are large enough to be able to tokenize aLength
* UTF-16 code units before having to make the buffers larger.
*
* @param aLength the number of UTF-16 code units to be tokenized before the
* next call to this method.
* @return true if successful; false if out of memory
*/
bool EnsureBufferSpace(int32_t aLength);
nsAutoPtr<nsHtml5Highlighter> mViewSource;
/**

View File

@ -88,7 +88,7 @@ nsHtml5TreeBuilder::startTokenization(nsHtml5Tokenizer* self)
deepTreeSurrogateParent = nullptr;
start(fragment);
charBufferLen = 0;
charBuffer = jArray<char16_t,int32_t>::newJArray(1024);
charBuffer = nullptr;
framesetOk = true;
if (fragment) {
nsIContentHandle* elt;

View File

@ -24,6 +24,7 @@ nsHtml5TreeBuilder::nsHtml5TreeBuilder(nsHtml5OplessBuilder* aBuilder)
, mHandles(nullptr)
, mHandlesUsed(0)
, mSpeculativeLoadStage(nullptr)
, mBroken(NS_OK)
, mCurrentHtmlScriptIsAsyncOrDefer(false)
, mPreventScriptExecution(false)
#ifdef DEBUG
@ -48,6 +49,7 @@ nsHtml5TreeBuilder::nsHtml5TreeBuilder(nsAHtml5TreeOpSink* aOpSink,
, mHandles(new nsIContent*[NS_HTML5_TREE_BUILDER_HANDLE_ARRAY_LENGTH])
, mHandlesUsed(0)
, mSpeculativeLoadStage(aStage)
, mBroken(NS_OK)
, mCurrentHtmlScriptIsAsyncOrDefer(false)
, mPreventScriptExecution(false)
#ifdef DEBUG
@ -912,15 +914,39 @@ nsHtml5TreeBuilder::elementPopped(int32_t aNamespace, nsIAtom* aName, nsIContent
void
nsHtml5TreeBuilder::accumulateCharacters(const char16_t* aBuf, int32_t aStart, int32_t aLength)
{
int32_t newFillLen = charBufferLen + aLength;
if (newFillLen > charBuffer.length) {
int32_t newAllocLength = newFillLen + (newFillLen >> 1);
jArray<char16_t,int32_t> newBuf = jArray<char16_t,int32_t>::newJArray(newAllocLength);
MOZ_ASSERT(charBufferLen + aLength <= charBuffer.length,
"About to memcpy past the end of the buffer!");
memcpy(charBuffer + charBufferLen, aBuf + aStart, sizeof(char16_t) * aLength);
charBufferLen += aLength;
}
bool
nsHtml5TreeBuilder::EnsureBufferSpace(size_t aLength)
{
// TODO: Unify nsHtml5Tokenizer::strBuf and nsHtml5TreeBuilder::charBuffer
// so that this method becomes unnecessary.
size_t worstCase = size_t(charBufferLen) + aLength;
if (worstCase > INT32_MAX) {
// Since we index into the buffer using int32_t due to the Java heritage
// of the code, let's treat this as OOM.
return false;
}
if (!charBuffer) {
// Add one to round to the next power of two to avoid immediate
// reallocation once there are a few characters in the buffer.
charBuffer = jArray<char16_t,int32_t>::newFallibleJArray(mozilla::RoundUpPow2(worstCase + 1));
if (!charBuffer) {
return false;
}
} else if (worstCase > size_t(charBuffer.length)) {
jArray<char16_t,int32_t> newBuf = jArray<char16_t,int32_t>::newFallibleJArray(mozilla::RoundUpPow2(worstCase));
if (!newBuf) {
return false;
}
memcpy(newBuf, charBuffer, sizeof(char16_t) * charBufferLen);
charBuffer = newBuf;
}
memcpy(charBuffer + charBufferLen, aBuf + aStart, sizeof(char16_t) * aLength);
charBufferLen = newFillLen;
return true;
}
nsIContentHandle*
@ -1074,6 +1100,7 @@ nsHtml5TreeBuilder::MarkAsBroken(nsresult aRv)
MOZ_ASSERT_UNREACHABLE("Must not call this with builder.");
return;
}
mBroken = aRv;
mOpQueue.Clear(); // Previous ops don't matter anymore
mOpQueue.AppendElement()->Init(aRv);
}

View File

@ -18,6 +18,7 @@
int32_t mHandlesUsed;
nsTArray<nsAutoArrayPtr<nsIContent*> > mOldHandles;
nsHtml5TreeOpStage* mSpeculativeLoadStage;
nsresult mBroken;
bool mCurrentHtmlScriptIsAsyncOrDefer;
bool mPreventScriptExecution;
#ifdef DEBUG
@ -127,6 +128,16 @@
return mBuilder;
}
/**
* Makes sure the buffers are large enough to be able to tokenize aLength
* UTF-16 code units before having to make the buffers larger.
*
* @param aLength the number of UTF-16 code units to be tokenized before the
* next call to this method.
* @return true if successful; false if out of memory
*/
bool EnsureBufferSpace(size_t aLength);
void EnableViewSource(nsHtml5Highlighter* aHighlighter);
void errStrayStartTag(nsIAtom* aName);
@ -224,3 +235,12 @@
void errEndWithUnclosedElements(nsIAtom* aName);
void MarkAsBroken(nsresult aRv);
/**
* Checks if this parser is broken. Returns a non-NS_OK (i.e. non-0)
* value if broken.
*/
nsresult IsBroken()
{
return mBroken;
}

View File

@ -84,6 +84,12 @@ nsHtml5UTF16Buffer::hasMore()
return start < end;
}
int32_t
nsHtml5UTF16Buffer::getLength()
{
return end - start;
}
void
nsHtml5UTF16Buffer::adjust(bool lastWasCR)
{

View File

@ -67,6 +67,7 @@ class nsHtml5UTF16Buffer
char16_t* getBuffer();
int32_t getEnd();
bool hasMore();
int32_t getLength();
void adjust(bool lastWasCR);
void setEnd(int32_t end);
static void initializeStatics();