[YAMLIO] Support non-null-terminated inputs

In some places the parser guards against dereferencing `End`, while in
others it relies on the presence of a trailing `'\0'` to elide checks.

Add the remaining guards needed to ensure the parser never attempts to
dereference `End`, making it safe to not require a null-terminated input
buffer.

Update the parser fuzzer harness so that it tests with buffers that are
guaranteed to be non-null-terminated, null-terminated, and 1-terminated,
additionally ensuring the result of the parse is the same in each case.

Some of the regression tests were written by inspection, and some are
cases caught by the fuzzer which required additional fixes in the
parser.

Differential Revision: https://reviews.llvm.org/D84050
This commit is contained in:
Scott Linder 2020-11-16 19:05:06 +00:00
parent 544cb649d7
commit 2980933d85
3 changed files with 127 additions and 19 deletions

View File

@ -200,13 +200,12 @@ static UTF8Decoded decodeUTF8(StringRef Range) {
StringRef::iterator End = Range.end();
// 1 byte: [0x00, 0x7f]
// Bit pattern: 0xxxxxxx
if ((*Position & 0x80) == 0) {
return std::make_pair(*Position, 1);
if (Position < End && (*Position & 0x80) == 0) {
return std::make_pair(*Position, 1);
}
// 2 bytes: [0x80, 0x7ff]
// Bit pattern: 110xxxxx 10xxxxxx
if (Position + 1 != End &&
((*Position & 0xE0) == 0xC0) &&
if (Position + 1 < End && ((*Position & 0xE0) == 0xC0) &&
((*(Position + 1) & 0xC0) == 0x80)) {
uint32_t codepoint = ((*Position & 0x1F) << 6) |
(*(Position + 1) & 0x3F);
@ -215,8 +214,7 @@ static UTF8Decoded decodeUTF8(StringRef Range) {
}
// 3 bytes: [0x8000, 0xffff]
// Bit pattern: 1110xxxx 10xxxxxx 10xxxxxx
if (Position + 2 != End &&
((*Position & 0xF0) == 0xE0) &&
if (Position + 2 < End && ((*Position & 0xF0) == 0xE0) &&
((*(Position + 1) & 0xC0) == 0x80) &&
((*(Position + 2) & 0xC0) == 0x80)) {
uint32_t codepoint = ((*Position & 0x0F) << 12) |
@ -230,8 +228,7 @@ static UTF8Decoded decodeUTF8(StringRef Range) {
}
// 4 bytes: [0x10000, 0x10FFFF]
// Bit pattern: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
if (Position + 3 != End &&
((*Position & 0xF8) == 0xF0) &&
if (Position + 3 < End && ((*Position & 0xF8) == 0xF0) &&
((*(Position + 1) & 0xC0) == 0x80) &&
((*(Position + 2) & 0xC0) == 0x80) &&
((*(Position + 3) & 0xC0) == 0x80)) {
@ -773,7 +770,7 @@ void Scanner::init(MemoryBufferRef Buffer) {
IsSimpleKeyAllowed = true;
Failed = false;
std::unique_ptr<MemoryBuffer> InputBufferOwner =
MemoryBuffer::getMemBuffer(Buffer);
MemoryBuffer::getMemBuffer(Buffer, /*RequiresNullTerminator=*/false);
SM.AddNewSourceBuffer(std::move(InputBufferOwner), SMLoc());
}
@ -1036,7 +1033,7 @@ bool Scanner::rollIndent( int ToColumn
}
void Scanner::skipComment() {
if (*Current != '#')
if (Current == End || *Current != '#')
return;
while (true) {
// This may skip more than one byte, thus Column is only incremented
@ -1051,7 +1048,7 @@ void Scanner::skipComment() {
void Scanner::scanToNextToken() {
while (true) {
while (*Current == ' ' || *Current == '\t') {
while (Current != End && (*Current == ' ' || *Current == '\t')) {
skip(1);
}
@ -1286,7 +1283,7 @@ bool Scanner::scanFlowScalar(bool IsDoubleQuoted) {
&& wasEscaped(Start + 1, Current));
} else {
skip(1);
while (true) {
while (Current != End) {
// Skip a ' followed by another '.
if (Current + 1 < End && *Current == '\'' && *(Current + 1) == '\'') {
skip(2);
@ -1334,13 +1331,14 @@ bool Scanner::scanPlainScalar() {
unsigned LeadingBlanks = 0;
assert(Indent >= -1 && "Indent must be >= -1 !");
unsigned indent = static_cast<unsigned>(Indent + 1);
while (true) {
while (Current != End) {
if (*Current == '#')
break;
while (!isBlankOrBreak(Current)) {
if ( FlowLevel && *Current == ':'
&& !(isBlankOrBreak(Current + 1) || *(Current + 1) == ',')) {
while (Current != End && !isBlankOrBreak(Current)) {
if (FlowLevel && *Current == ':' &&
(Current + 1 == End ||
!(isBlankOrBreak(Current + 1) || *(Current + 1) == ','))) {
setError("Found unexpected ':' while scanning a plain scalar", Current);
return false;
}
@ -1410,7 +1408,7 @@ bool Scanner::scanAliasOrAnchor(bool IsAlias) {
StringRef::iterator Start = Current;
unsigned ColStart = Column;
skip(1);
while(true) {
while (Current != End) {
if ( *Current == '[' || *Current == ']'
|| *Current == '{' || *Current == '}'
|| *Current == ','

View File

@ -21,12 +21,32 @@ static bool isValidYaml(const uint8_t *Data, size_t Size) {
extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
std::vector<uint8_t> Input(Data, Data + Size);
// Ensure we don't crash on any arbitrary byte string.
isValidYaml(Input.data(), Input.size());
// Ensure we don't crash on byte strings with no null characters.
Input.erase(std::remove(Input.begin(), Input.end(), 0), Input.end());
Input.shrink_to_fit();
bool IsValidWithout0s = isValidYaml(Input.data(), Input.size());
// Ensure we don't crash on byte strings where the only null character is
// one-past-the-end of the actual input to the parser.
Input.erase(std::remove(Input.begin(), Input.end(), 0), Input.end());
Input.push_back(0);
Input.shrink_to_fit();
isValidYaml(Input.data(), Input.size() - 1);
bool IsValidWhen0Terminated = isValidYaml(Input.data(), Input.size() - 1);
// Ensure we don't crash on byte strings with no null characters, but with
// an invalid character one-past-the-end of the actual input to the parser.
Input.back() = 1;
bool IsValidWhen1Terminated = isValidYaml(Input.data(), Input.size() - 1);
// The parser should either accept all of these inputs, or reject all of
// them, because the parser sees an identical byte string in each case. This
// should hopefully catch some cases where the parser is sensitive to what is
// present one-past-the-end of the actual input.
if (IsValidWithout0s != IsValidWhen0Terminated ||
IsValidWhen0Terminated != IsValidWhen1Terminated)
LLVM_BUILTIN_TRAP;
return 0;
}

View File

@ -3111,5 +3111,95 @@ TEST(YAMLIO, TestEmptyAlias) {
TEST(YAMLIO, TestEmptyAnchor) {
Input yin("*");
EXPECT_FALSE(yin.setCurrentDocument());
}
TEST(YAMLIO, TestScannerNoNullEmpty) {
std::vector<char> str{};
Input yin(llvm::StringRef(str.data(), str.size()));
yin.setCurrentDocument();
EXPECT_FALSE(yin.error());
}
TEST(YAMLIO, TestScannerNoNullSequenceOfNull) {
std::vector<char> str{'-'};
Input yin(llvm::StringRef(str.data(), str.size()));
yin.setCurrentDocument();
EXPECT_FALSE(yin.error());
}
TEST(YAMLIO, TestScannerNoNullSimpleSequence) {
std::vector<char> str{'-', ' ', 'a'};
Input yin(llvm::StringRef(str.data(), str.size()));
yin.setCurrentDocument();
EXPECT_FALSE(yin.error());
}
TEST(YAMLIO, TestScannerNoNullUnbalancedMap) {
std::vector<char> str{'{'};
Input yin(llvm::StringRef(str.data(), str.size()));
yin.setCurrentDocument();
EXPECT_TRUE(yin.error());
}
TEST(YAMLIO, TestScannerNoNullEmptyMap) {
std::vector<char> str{'{', '}'};
Input yin(llvm::StringRef(str.data(), str.size()));
yin.setCurrentDocument();
EXPECT_FALSE(yin.error());
}
TEST(YAMLIO, TestScannerNoNullUnbalancedSequence) {
std::vector<char> str{'['};
Input yin(llvm::StringRef(str.data(), str.size()));
yin.setCurrentDocument();
EXPECT_TRUE(yin.error());
}
TEST(YAMLIO, TestScannerNoNullEmptySequence) {
std::vector<char> str{'[', ']'};
Input yin(llvm::StringRef(str.data(), str.size()));
yin.setCurrentDocument();
EXPECT_FALSE(yin.error());
}
TEST(YAMLIO, TestScannerNoNullScalarUnbalancedDoubleQuote) {
std::vector<char> str{'"'};
Input yin(llvm::StringRef(str.data(), str.size()));
yin.setCurrentDocument();
EXPECT_TRUE(yin.error());
}
TEST(YAMLIO, TestScannerNoNullScalarUnbalancedSingleQuote) {
std::vector<char> str{'\''};
Input yin(llvm::StringRef(str.data(), str.size()));
yin.setCurrentDocument();
EXPECT_TRUE(yin.error());
}
TEST(YAMLIO, TestScannerNoNullEmptyAlias) {
std::vector<char> str{'&'};
Input yin(llvm::StringRef(str.data(), str.size()));
yin.setCurrentDocument();
EXPECT_TRUE(yin.error());
}
TEST(YAMLIO, TestScannerNoNullEmptyAnchor) {
std::vector<char> str{'*'};
Input yin(llvm::StringRef(str.data(), str.size()));
yin.setCurrentDocument();
EXPECT_TRUE(yin.error());
}
TEST(YAMLIO, TestScannerNoNullDecodeInvalidUTF8) {
std::vector<char> str{'\xef'};
Input yin(llvm::StringRef(str.data(), str.size()));
yin.setCurrentDocument();
EXPECT_TRUE(yin.error());
}
TEST(YAMLIO, TestScannerNoNullScanPlainScalarInFlow) {
std::vector<char> str{'{', 'a', ':'};
Input yin(llvm::StringRef(str.data(), str.size()));
yin.setCurrentDocument();
EXPECT_TRUE(yin.error());
}