mirror of
https://github.com/capstone-engine/llvm-capstone.git
synced 2024-11-26 23:21:11 +00:00
586ecdf205
This patch replaces uses of StringRef::{starts,ends}with with StringRef::{starts,ends}_with for consistency with std::{string,string_view}::{starts,ends}_with in C++20. I'm planning to deprecate and eventually remove StringRef::{starts,ends}with.
368 lines
10 KiB
C++
368 lines
10 KiB
C++
//===-- ResourceScriptToken.cpp ---------------------------------*- C++-*-===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===---------------------------------------------------------------------===//
|
|
//
|
|
// This file implements an interface defined in ResourceScriptToken.h.
|
|
// In particular, it defines an .rc script tokenizer.
|
|
//
|
|
//===---------------------------------------------------------------------===//
|
|
|
|
#include "ResourceScriptToken.h"
|
|
#include "llvm/ADT/StringExtras.h"
|
|
#include "llvm/Support/raw_ostream.h"
|
|
|
|
#include <algorithm>
|
|
#include <cassert>
|
|
#include <cctype>
|
|
#include <cstdlib>
|
|
#include <utility>
|
|
|
|
using namespace llvm;
|
|
|
|
using Kind = RCToken::Kind;
|
|
|
|
// Checks if Representation is a correct description of an RC integer.
|
|
// It should be a 32-bit unsigned integer, either decimal, octal (0[0-7]+),
|
|
// or hexadecimal (0x[0-9a-f]+). It might be followed by a single 'L'
|
|
// character (that is the difference between our representation and
|
|
// StringRef's one). If Representation is correct, 'true' is returned and
|
|
// the return value is put back in Num.
|
|
static bool rcGetAsInteger(StringRef Representation, uint32_t &Num) {
|
|
size_t Length = Representation.size();
|
|
if (Length == 0)
|
|
return false;
|
|
// Strip the last 'L' if unnecessary.
|
|
if (std::toupper(Representation.back()) == 'L')
|
|
Representation = Representation.drop_back(1);
|
|
|
|
return !Representation.getAsInteger<uint32_t>(0, Num);
|
|
}
|
|
|
|
RCToken::RCToken(RCToken::Kind RCTokenKind, StringRef Value)
|
|
: TokenKind(RCTokenKind), TokenValue(Value) {}
|
|
|
|
uint32_t RCToken::intValue() const {
|
|
assert(TokenKind == Kind::Int);
|
|
// We assume that the token already is a correct integer (checked by
|
|
// rcGetAsInteger).
|
|
uint32_t Result;
|
|
bool IsSuccess = rcGetAsInteger(TokenValue, Result);
|
|
assert(IsSuccess);
|
|
(void)IsSuccess; // Silence the compiler warning when -DNDEBUG flag is on.
|
|
return Result;
|
|
}
|
|
|
|
bool RCToken::isLongInt() const {
|
|
return TokenKind == Kind::Int && std::toupper(TokenValue.back()) == 'L';
|
|
}
|
|
|
|
// Returns the text of this token.
StringRef RCToken::value() const {
  return TokenValue;
}
|
|
|
|
// Returns the kind of this token.
Kind RCToken::kind() const {
  return TokenKind;
}
|
|
|
|
bool RCToken::isBinaryOp() const {
|
|
switch (TokenKind) {
|
|
case Kind::Plus:
|
|
case Kind::Minus:
|
|
case Kind::Pipe:
|
|
case Kind::Amp:
|
|
return true;
|
|
default:
|
|
return false;
|
|
}
|
|
}
|
|
|
|
static Error getStringError(const Twine &message) {
|
|
return make_error<StringError>("Error parsing file: " + message,
|
|
inconvertibleErrorCode());
|
|
}
|
|
|
|
namespace {
|
|
|
|
class Tokenizer {
|
|
public:
|
|
Tokenizer(StringRef Input) : Data(Input), DataLength(Input.size()), Pos(0) {}
|
|
|
|
Expected<std::vector<RCToken>> run();
|
|
|
|
private:
|
|
// All 'advancing' methods return boolean values; if they're equal to false,
|
|
// the stream has ended or failed.
|
|
bool advance(size_t Amount = 1);
|
|
bool skipWhitespaces();
|
|
|
|
// Consumes a token. If any problem occurred, a non-empty Error is returned.
|
|
Error consumeToken(const Kind TokenKind);
|
|
|
|
// Check if tokenizer is about to read FollowingChars.
|
|
bool willNowRead(StringRef FollowingChars) const;
|
|
|
|
// Check if tokenizer can start reading an identifier at current position.
|
|
// The original tool did non specify the rules to determine what is a correct
|
|
// identifier. We assume they should follow the C convention:
|
|
// [a-zA-Z_][a-zA-Z0-9_]*.
|
|
bool canStartIdentifier() const;
|
|
// Check if tokenizer can continue reading an identifier.
|
|
bool canContinueIdentifier() const;
|
|
|
|
// Check if tokenizer can start reading an integer.
|
|
// A correct integer always starts with a 0-9 digit,
|
|
// can contain characters 0-9A-Fa-f (digits),
|
|
// Ll (marking the integer is 32-bit), Xx (marking the representation
|
|
// is hexadecimal). As some kind of separator should come after the
|
|
// integer, we can consume the integer until a non-alphanumeric
|
|
// character.
|
|
bool canStartInt() const;
|
|
bool canContinueInt() const;
|
|
|
|
bool canStartString() const;
|
|
|
|
// Check if tokenizer can start reading a single line comment (e.g. a comment
|
|
// that begins with '//')
|
|
bool canStartLineComment() const;
|
|
|
|
// Check if tokenizer can start or finish reading a block comment (e.g. a
|
|
// comment that begins with '/*' and ends with '*/')
|
|
bool canStartBlockComment() const;
|
|
|
|
// Throw away all remaining characters on the current line.
|
|
void skipCurrentLine();
|
|
|
|
bool streamEof() const;
|
|
|
|
// Classify the token that is about to be read from the current position.
|
|
Kind classifyCurrentToken() const;
|
|
|
|
// Process the Kind::Identifier token - check if it is
|
|
// an identifier describing a block start or end.
|
|
void processIdentifier(RCToken &token) const;
|
|
|
|
StringRef Data;
|
|
size_t DataLength, Pos;
|
|
};
|
|
|
|
void Tokenizer::skipCurrentLine() {
|
|
Pos = Data.find_first_of("\r\n", Pos);
|
|
Pos = Data.find_first_not_of("\r\n", Pos);
|
|
|
|
if (Pos == StringRef::npos)
|
|
Pos = DataLength;
|
|
}
|
|
|
|
// Tokenizes the entire input, restarting from offset 0.
//
// Returns the tokens in source order with comments removed, or the first
// error hit: a character that starts no known token, a token that could not
// be consumed, or an integer that rcGetAsInteger rejects.
Expected<std::vector<RCToken>> Tokenizer::run() {
  Pos = 0;
  std::vector<RCToken> Tokens;

  // A UTF-8 Byte Order Mark may precede the script; step over it if present.
  if (willNowRead("\xef\xbb\xbf"))
    advance(3);

  // skipWhitespaces() returns false once the end of the input is reached.
  while (!streamEof() && skipWhitespaces()) {
    const Kind CurKind = classifyCurrentToken();
    if (CurKind == Kind::Invalid)
      return getStringError("Invalid token found at position " + Twine(Pos));

    const size_t Begin = Pos;
    if (Error ConsumeError = consumeToken(CurKind))
      return std::move(ConsumeError);

    // Comments are discarded rather than recorded as tokens.
    const bool IsComment =
        CurKind == Kind::LineComment || CurKind == Kind::StartComment;
    if (IsComment)
      continue;

    // The token text is the half-open range [Begin, Pos) of the input.
    RCToken Token(CurKind, Data.slice(Begin, Pos));
    switch (CurKind) {
    case Kind::Identifier:
      // May turn the identifier into a block-begin or block-end token.
      processIdentifier(Token);
      break;
    case Kind::Int: {
      // Reject integers with an incorrect format or that cannot be
      // represented in a 32-bit integer. The parsed value itself is unused.
      uint32_t Parsed;
      if (!rcGetAsInteger(Token.value(), Parsed))
        return getStringError("Integer invalid or too large: " +
                              Token.value().str());
      break;
    }
    default:
      break;
    }

    Tokens.push_back(Token);
  }

  return Tokens;
}
|
|
|
|
bool Tokenizer::advance(size_t Amount) {
|
|
Pos += Amount;
|
|
return !streamEof();
|
|
}
|
|
|
|
bool Tokenizer::skipWhitespaces() {
|
|
while (!streamEof() && isSpace(Data[Pos]))
|
|
advance();
|
|
return !streamEof();
|
|
}
|
|
|
|
// Moves the cursor past one token of kind TokenKind that begins at the
// current position. Returns Error::success() once the token is consumed, or
// an error for an unclosed block comment or an unterminated string literal.
Error Tokenizer::consumeToken(const Kind TokenKind) {
  switch (TokenKind) {
  // One-character tokens: TOKEN expands to nothing and SHORT_TOKEN to a
  // 'case' label, so the labels produced by the included list all share the
  // handler below.
#define TOKEN(Name)
#define SHORT_TOKEN(Name, Ch) case Kind::Name:
#include "ResourceScriptTokenList.def"
    advance();
    return Error::success();

  case Kind::LineComment:
    // Step over the two-character comment marker, then drop the rest of the
    // line.
    advance(2);
    skipCurrentLine();
    return Error::success();

  case Kind::StartComment: {
    // Step over the two-character opener and look for the closing "*/".
    advance(2);
    const size_t CloserPos = Data.find("*/", Pos);
    if (CloserPos == StringRef::npos)
      return getStringError(
          "Unclosed multi-line comment beginning at position " + Twine(Pos));
    // Jump to the closer and then past its two characters.
    advance(CloserPos - Pos + 2);
    return Error::success();
  }

  case Kind::Identifier:
    while (!streamEof()) {
      if (!canContinueIdentifier())
        break;
      advance();
    }
    return Error::success();

  case Kind::Int:
    while (!streamEof()) {
      if (!canContinueInt())
        break;
      advance();
    }
    return Error::success();

  case Kind::String:
    // Consume the preceding 'L', if there is any.
    if (std::toupper(Data[Pos]) == 'L')
      advance();
    // Consume the opening double-quote.
    advance();

    // Walk the characters until the end of the file, line or string. Each
    // iteration that does not return moves on to the next character.
    for (;; advance()) {
      if (streamEof())
        return getStringError("Unterminated string literal.");

      const char Cur = Data[Pos];
      if (Cur == '\n')
        return getStringError("String literal not terminated in the line.");
      if (Cur != '"')
        continue;

      // Consume the double-quote. If another '"' follows it, the string
      // didn't end: the pair stands for a '"' inside the string, and the
      // loop step consumes the second quote.
      advance();
      if (!willNowRead("\""))
        return Error::success();
    }

  case Kind::Invalid:
    assert(false && "Cannot consume an invalid token.");
    break;
  }

  llvm_unreachable("Unknown RCToken::Kind");
}
|
|
|
|
// Returns true if the unread part of the input begins with FollowingChars.
bool Tokenizer::willNowRead(StringRef FollowingChars) const {
  const StringRef Remaining = Data.drop_front(Pos);
  return Remaining.starts_with(FollowingChars);
}
|
|
|
|
bool Tokenizer::canStartIdentifier() const {
|
|
assert(!streamEof());
|
|
|
|
const char CurChar = Data[Pos];
|
|
return std::isalpha(CurChar) || CurChar == '_' || CurChar == '.';
|
|
}
|
|
|
|
bool Tokenizer::canContinueIdentifier() const {
|
|
assert(!streamEof());
|
|
const char CurChar = Data[Pos];
|
|
return std::isalnum(CurChar) || CurChar == '_' || CurChar == '.' ||
|
|
CurChar == '/' || CurChar == '\\' || CurChar == '-';
|
|
}
|
|
|
|
bool Tokenizer::canStartInt() const {
|
|
assert(!streamEof());
|
|
return std::isdigit(Data[Pos]);
|
|
}
|
|
|
|
bool Tokenizer::canStartBlockComment() const {
|
|
assert(!streamEof());
|
|
return Data.drop_front(Pos).starts_with("/*");
|
|
}
|
|
|
|
bool Tokenizer::canStartLineComment() const {
|
|
assert(!streamEof());
|
|
return Data.drop_front(Pos).starts_with("//");
|
|
}
|
|
|
|
bool Tokenizer::canContinueInt() const {
|
|
assert(!streamEof());
|
|
return std::isalnum(Data[Pos]);
|
|
}
|
|
|
|
bool Tokenizer::canStartString() const {
|
|
return willNowRead("\"") || willNowRead("L\"") || willNowRead("l\"");
|
|
}
|
|
|
|
// Returns true when the cursor sits exactly at the end of the input.
bool Tokenizer::streamEof() const {
  return DataLength == Pos;
}
|
|
|
|
// Determines the kind of the token that begins at the current position,
// without moving the cursor. Returns Kind::Invalid if no token can start
// here.
Kind Tokenizer::classifyCurrentToken() const {
  // Comments are tested before the one-character tokens.
  if (canStartBlockComment())
    return Kind::StartComment;
  if (canStartLineComment())
    return Kind::LineComment;

  if (canStartInt())
    return Kind::Int;
  // Strings are tested before identifiers: a string may begin with an 'L'
  // or 'l' prefix, which could also start an identifier.
  if (canStartString())
    return Kind::String;
  // BEGIN and END are at this point of lexing recognized as identifiers.
  if (canStartIdentifier())
    return Kind::Identifier;

  // What remains is looked up among the one-character tokens: SHORT_TOKEN
  // expands to a 'case' returning the matching kind, TOKEN to nothing.
  switch (Data[Pos]) {
#define TOKEN(Name)
#define SHORT_TOKEN(Name, Ch)                                                  \
  case Ch:                                                                     \
    return Kind::Name;
#include "ResourceScriptTokenList.def"

  default:
    return Kind::Invalid;
  }
}
|
|
|
|
// Re-tags an identifier token whose text is "begin" or "end" (compared
// case-insensitively) as a BlockBegin or BlockEnd token respectively. Any
// other identifier is left untouched.
void Tokenizer::processIdentifier(RCToken &Token) const {
  assert(Token.kind() == Kind::Identifier);
  const StringRef Text = Token.value();

  if (Text.equals_insensitive("begin")) {
    Token = RCToken(Kind::BlockBegin, Text);
    return;
  }

  if (Text.equals_insensitive("end"))
    Token = RCToken(Kind::BlockEnd, Text);
}
|
|
|
|
} // anonymous namespace
|
|
|
|
namespace llvm {
|
|
|
|
// Tokenizes the .rc script text in Input. Returns the resulting tokens, or
// the error reported by the tokenizer.
Expected<std::vector<RCToken>> tokenizeRC(StringRef Input) {
  Tokenizer Lexer(Input);
  return Lexer.run();
}
|
|
|
|
} // namespace llvm
|