Bug 1746374 - Use the number of UTF-16 code units as column number. r=iain

Differential Revision: https://phabricator.services.mozilla.com/D185372
This commit is contained in:
Tooru Fujisawa 2023-08-11 08:11:48 +00:00
parent 148500b129
commit 42011c65e9
28 changed files with 124 additions and 104 deletions

View File

@ -50,7 +50,7 @@ struct ProcessorErrorDetails {
ProcessorErrorDetails() : mLineno(0), mColno(0) {}
// Line number (1-origin).
unsigned mLineno;
// Column number (1-origin).
// Column number in UTF-16 code units (1-origin).
unsigned mColno;
nsString mFilename;
nsString mMessage;

View File

@ -22,8 +22,10 @@ class WorkerErrorBase {
public:
nsString mMessage;
nsString mFilename;
uint32_t mLineNumber; // 1-origin.
uint32_t mColumnNumber; // 1-origin
// Line number (1-origin).
uint32_t mLineNumber;
// Column number in UTF-16 code units (1-origin).
uint32_t mColumnNumber;
uint32_t mErrorNumber;
WorkerErrorBase() : mLineNumber(0), mColumnNumber(0), mErrorNumber(0) {}

View File

@ -455,7 +455,7 @@ class JS_PUBLIC_API ReadOnlyCompileOptions : public TransitiveCompileOptions {
// Line number of the first character (1-origin).
unsigned lineno = 1;
// Column number of the first character (0-origin).
// Column number of the first character in UTF-16 code units (0-origin).
unsigned column = 0;
// The offset within the ScriptSource's full uncompressed text of the first

View File

@ -120,7 +120,7 @@ class JSErrorBase {
// Source line number (1-origin).
unsigned lineno;
// Column number in line (1-origin).
// Column number in line in UTF-16 code units (1-origin).
unsigned column;
// the error number, e.g. see js/public/friend/ErrorNumbers.msg.

View File

@ -39,7 +39,7 @@ struct JitCodeSourceInfo {
// Line number (1-origin).
uint32_t lineno = 0;
// Column number (0-origin).
// Column number in UTF-16 code units (0-origin).
uint32_t colno = 0;
};

View File

@ -248,7 +248,7 @@ class BaseStackFrame {
// Get this frame's line number (1-origin).
virtual uint32_t line() const = 0;
// Get this frame's column number (1-origin).
// Get this frame's column number in UTF-16 code units (1-origin).
virtual uint32_t column() const = 0;
// Get this frame's source name. Never null.

View File

@ -69,8 +69,12 @@ class ImportEntry {
const HeapPtr<ModuleRequestObject*> moduleRequest_;
const HeapPtr<JSAtom*> importName_;
const HeapPtr<JSAtom*> localName_;
const uint32_t lineNumber_; // 1-origin
const uint32_t columnNumber_; // 0-origin
// Line number (1-origin).
const uint32_t lineNumber_;
// Column number in UTF-16 code units (0-origin).
const uint32_t columnNumber_;
public:
ImportEntry(Handle<ModuleRequestObject*> moduleRequest,
@ -93,8 +97,12 @@ class ExportEntry {
const HeapPtr<ModuleRequestObject*> moduleRequest_;
const HeapPtr<JSAtom*> importName_;
const HeapPtr<JSAtom*> localName_;
const uint32_t lineNumber_; // 1-origin.
const uint32_t columnNumber_; // 0-origin
// Line number (1-origin).
const uint32_t lineNumber_;
// Column number in UTF-16 code units (0-origin).
const uint32_t columnNumber_;
public:
ExportEntry(Handle<JSAtom*> maybeExportName,
@ -115,8 +123,12 @@ using ExportEntryVector = GCVector<ExportEntry, 0, SystemAllocPolicy>;
class RequestedModule {
const HeapPtr<ModuleRequestObject*> moduleRequest_;
const uint32_t lineNumber_; // 1-origin
const uint32_t columnNumber_; // 0-origin
// Line number (1-origin).
const uint32_t lineNumber_;
// Column number in UTF-16 code units (0-origin).
const uint32_t columnNumber_;
public:
RequestedModule(Handle<ModuleRequestObject*> moduleRequest,

View File

@ -1136,7 +1136,7 @@ class FlowGraphSummary {
// Line number (1-origin).
size_t lineno_;
// Column number (0-origin).
// Column number in UTF-16 code units (0-origin).
size_t column_;
};

View File

@ -166,8 +166,8 @@ If the referent is an Error object, this is the 1-origin line number at which
the referent was created; `undefined` otherwise.
### `errorColumnNumber`
If the referent is an Error object, this is the 1-origin column number at which
the referent was created; `undefined` otherwise.
If the referent is an Error object, this is the 1-origin column number in
UTF-16 code units at which the referent was created; `undefined` otherwise.
### `isBoundFunction`
If the referent is a debuggee function, returns `true` if the referent is a

View File

@ -111,9 +111,10 @@ source within the file or URL it was loaded from. This is normally `1`, but
may have another value if the source is part of an HTML document.
### `startColumn`
**If the instance refers to JavaScript source**, the 0-origin start column of
the source within the file or URL it was loaded from. This is normally `0`, but
may have another value if the source is part of an HTML document.
**If the instance refers to JavaScript source**, the 0-origin start column in
UTF-16 code units of the source within the file or URL it was loaded from. This
is normally `0`, but may have another value if the source is part of an HTML
document.
### `id`
**If the instance refers to JavaScript source**, an int32 counter that identifies

View File

@ -340,7 +340,7 @@ class BytecodeSection {
// we can get undefined behavior.
uint32_t currentLine_;
// 0-origin column index on currentLine_ of last
// 0-origin column index in UTF-16 code units on currentLine_ of last
// SrcNoteType::ColSpan-annotated opcode.
//
// WARNING: If this becomes out of sync with already-emitted srcnotes,

View File

@ -611,7 +611,7 @@ class StencilModuleEntry {
// Line number (1-origin).
uint32_t lineno = 0;
// Column number (0-origin).
// Column number in UTF-16 code units (0-origin).
uint32_t column = 0;
private:

View File

@ -597,8 +597,23 @@ uint32_t TokenStreamAnyChars::computePartialColumn(
const SourceUnits<Unit>& sourceUnits) const {
lineToken.assertConsistentOffset(offset);
const uint32_t line = lineNumber(lineToken);
const uint32_t start = srcCoords.lineStart(lineToken);
const uint32_t offsetInLine = offset - start;
if constexpr (std::is_same_v<Unit, char16_t>) {
// Column number is in UTF-16 code units.
return offsetInLine;
}
return computePartialColumnForUTF8(lineToken, offset, start, offsetInLine,
sourceUnits);
}
template <typename Unit>
uint32_t TokenStreamAnyChars::computePartialColumnForUTF8(
const LineToken lineToken, const uint32_t offset, const uint32_t start,
const uint32_t offsetInLine, const SourceUnits<Unit>& sourceUnits) const {
const uint32_t line = lineNumber(lineToken);
// Reset the previous offset/column cache for this line, if the previous
// lookup wasn't on this line.
@ -630,13 +645,13 @@ uint32_t TokenStreamAnyChars::computePartialColumn(
partialOffset += offsetDelta;
if (unitsType == UnitsType::GuaranteedSingleUnit) {
MOZ_ASSERT(unicode::CountCodePoints(begin, end) == offsetDelta,
MOZ_ASSERT(unicode::CountUTF16CodeUnits(begin, end) == offsetDelta,
"guaranteed-single-units also guarantee pointer distance "
"equals code point count");
"equals UTF-16 code unit count");
partialCols += offsetDelta;
} else {
partialCols +=
AssertedCast<uint32_t>(unicode::CountCodePoints(begin, end));
AssertedCast<uint32_t>(unicode::CountUTF16CodeUnits(begin, end));
}
this->lastOffsetOfComputedColumn_ = partialOffset;
@ -644,8 +659,6 @@ uint32_t TokenStreamAnyChars::computePartialColumn(
return partialCols;
};
const uint32_t offsetInLine = offset - start;
// We won't add an entry to |longLineColumnInfo_| for lines where the maximum
// column has offset less than this value. The most common (non-minified)
// long line length is likely 80ch, maybe 100ch, so we use that, rounded up to
@ -786,16 +799,17 @@ uint32_t TokenStreamAnyChars::computePartialColumn(
MOZ_ASSERT(chunkLimit <= limit);
size_t numUnits = PointerRangeSize(begin, chunkLimit);
size_t numCodePoints = unicode::CountCodePoints(begin, chunkLimit);
size_t numUTF16CodeUnits =
unicode::CountUTF16CodeUnits(begin, chunkLimit);
// If this chunk (which will become non-final at the end of the loop) is
// all single-unit code points, annotate the chunk accordingly.
if (numUnits == numCodePoints) {
if (numUnits == numUTF16CodeUnits) {
lastChunkVectorForLine_->back().guaranteeSingleUnits();
}
partialOffset += numUnits;
partialColumn += numCodePoints;
partialColumn += numUTF16CodeUnits;
lastChunkVectorForLine_->infallibleEmplaceBack(
partialColumn, UnitsType::PossiblyMultiUnit);

View File

@ -363,12 +363,12 @@ class SourceUnits;
*
* for either |Unit = Utf8Unit| or |Unit = char16_t|.
*
* Note that the latter quantity is *not* the same as a column number, which is
* a count of code *points*. Computing a column number requires the offset
* within the line and the source units of that line (including what type |Unit|
* is, to know how to decode them). If you need a column number, functions in
* |GeneralTokenStreamChars<Unit>| will consult this and source units to compute
* it.
* Note that, if |Unit = Utf8Unit|, the latter quantity is *not* the same as a
* column number, which is a count of UTF-16 code units. Computing a column
* number requires the offset within the line and the source units of that line
* (including what type |Unit| is, to know how to decode them). If you need a
* column number, functions in |GeneralTokenStreamChars<Unit>| will consult
* this and source units to compute it.
*/
class SourceCoords {
// For a given buffer holding source code, |lineStartOffsets_| has one
@ -515,7 +515,7 @@ enum class UnitsType : unsigned char {
class ChunkInfo {
private:
// Column number (0-origin).
// Column number in UTF-16 code units (0-origin).
// Store everything in |unsigned char|s so everything packs.
unsigned char column_[sizeof(uint32_t)];
unsigned char unitsType_;
@ -580,6 +580,7 @@ class TokenStreamAnyChars : public TokenStreamShared {
JS::ConstUTF8CharsZ filename_;
// Column number computation fields.
// Used only for the UTF-8 case.
/**
* A map of (line number => sequence of the column numbers at
@ -911,7 +912,7 @@ class TokenStreamAnyChars : public TokenStreamShared {
private:
/**
* Compute the "partial" column number in Unicode code points of the absolute
* Compute the "partial" column number in UTF-16 code units of the absolute
* |offset| within source text on the line of |lineToken| (which must have
* been computed from |offset|).
*
@ -944,13 +945,13 @@ class TokenStreamAnyChars : public TokenStreamShared {
* the browser before SpiderMonkey would see it. So the partial column of the
* "4" in the inequality would be 16, not 19.
*
* Code points are not all equal length, so counting requires *some* kind of
* linear-time counting from the start of the line. This function attempts
* various tricks to reduce this cost. If these optimizations succeed,
* repeated calls to this function on a line will pay a one-time cost linear
* in the length of the line, then each call pays a separate constant-time
* cost. If the optimizations do not succeed, this function works in time
* linear in the length of the line.
* In UTF-8 source, a code point occupies a varying number of source units and
* may count as either one or two UTF-16 code units, so counting requires
* *some* kind of linear-time counting from the start of the line.
* This function attempts various tricks to reduce this cost. If these
* optimizations succeed, repeated calls to this function on a line will pay
* a one-time cost linear in the length of the line, then each call pays a
* separate constant-time cost. If the optimizations do not succeed, this
* function works in time linear in the length of the line.
*
* It's unusual for a function in *this* class to be |Unit|-templated, but
* while this operation manages |Unit|-agnostic fields in this class and in
@ -962,6 +963,11 @@ class TokenStreamAnyChars : public TokenStreamShared {
const uint32_t offset,
const SourceUnits<Unit>& sourceUnits) const;
template <typename Unit>
uint32_t computePartialColumnForUTF8(
const LineToken lineToken, const uint32_t offset, const uint32_t start,
const uint32_t offsetInLine, const SourceUnits<Unit>& sourceUnits) const;
/**
* Update line/column information for the start of a new line at
* |lineStartOffset|.

View File

@ -179,7 +179,7 @@ static size_t ComputeColumn(const Latin1Char* begin, const Latin1Char* end) {
}
static size_t ComputeColumn(const char16_t* begin, const char16_t* end) {
return unicode::CountCodePoints(begin, end);
return unicode::CountUTF16CodeUnits(begin, end);
}
// This function is varargs purely so it can call ReportCompileErrorLatin1.

View File

@ -642,7 +642,7 @@ class MBasicBlock : public TempObject, public InlineListNode<MBasicBlock> {
// Line number (1-origin).
unsigned lineno_;
// Column number (0-origin).
// Column number in UTF-16 code units (0-origin).
unsigned columnIndex_;
public:

View File

@ -96,13 +96,13 @@ bool cls_testPrintError_PrintWarning::warningSuccess = false;
#define BURRITO "\xF0\x9F\x8C\xAF"
BEGIN_TEST(testPrintError_UTF16CodePoints) {
BEGIN_TEST(testPrintError_UTF16CodeUnits) {
AutoStreamBuffer buf;
static const char utf8code[] =
"function f() {\n var x = `\n" BURRITO "`; " BURRITO "; } f();";
CHECK(!execDontReport(utf8code, "testPrintError_UTF16CodePoints.js", 1));
CHECK(!execDontReport(utf8code, "testPrintError_UTF16CodeUnits.js", 1));
JS::ExceptionStack exnStack(cx);
CHECK(JS::StealPendingExceptionStack(cx, &exnStack));
@ -112,14 +112,14 @@ BEGIN_TEST(testPrintError_UTF16CodePoints) {
JS::PrintError(buf.stream(), builder, false);
CHECK(
buf.contains("testPrintError_UTF16CodePoints.js:3:5 SyntaxError: illegal "
buf.contains("testPrintError_UTF16CodeUnits.js:3:6 SyntaxError: illegal "
"character U+1F32F:\n"
"testPrintError_UTF16CodePoints.js:3:5 " BURRITO
"`; " BURRITO "; } f();\n"
"testPrintError_UTF16CodePoints.js:3:5 .....^\n"));
"testPrintError_UTF16CodeUnits.js:3:6 " BURRITO "`; " BURRITO
"; } f();\n"
"testPrintError_UTF16CodeUnits.js:3:6 .....^\n"));
return true;
}
END_TEST(testPrintError_UTF16CodePoints)
END_TEST(testPrintError_UTF16CodeUnits)
#undef BURRITO

View File

@ -416,7 +416,7 @@ template size_t js::PutEscapedString(char* buffer, size_t bufferSize,
const char16_t* chars, size_t length,
uint32_t quote);
size_t js::unicode::CountCodePoints(const Utf8Unit* begin,
size_t js::unicode::CountUTF16CodeUnits(const Utf8Unit* begin,
const Utf8Unit* end) {
MOZ_ASSERT(begin <= end);
@ -430,36 +430,14 @@ size_t js::unicode::CountCodePoints(const Utf8Unit* begin,
continue;
}
#ifdef DEBUG
Maybe<char32_t> cp =
#endif
DecodeOneUtf8CodePoint(lead, &ptr, end);
Maybe<char32_t> cp = DecodeOneUtf8CodePoint(lead, &ptr, end);
MOZ_ASSERT(cp.isSome());
if (*cp > unicode::UTF16Max) {
// This code point is encoded as a surrogate pair in UTF-16, so it
// contributes one additional code unit to the count.
count++;
}
}
MOZ_ASSERT(ptr == end, "bad code unit count in line?");
return count;
}
size_t js::unicode::CountCodePoints(const char16_t* begin,
const char16_t* end) {
MOZ_ASSERT(begin <= end);
size_t count = 0;
const char16_t* ptr = begin;
while (ptr < end) {
count++;
if (!IsLeadSurrogate(*ptr++)) {
continue;
}
if (ptr < end && IsTrailSurrogate(*ptr)) {
ptr++;
}
}
MOZ_ASSERT(ptr == end, "should have consumed the full range");
return count;
}

View File

@ -355,18 +355,20 @@ bool ContainsFlag(const char* str, const char* flag);
namespace unicode {
/** Compute the number of code points in the valid UTF-8 range [begin, end). */
extern size_t CountCodePoints(const mozilla::Utf8Unit* begin,
/**
* Compute the number of UTF-16 code units in the valid UTF-8 range
* [begin, end).
*/
extern size_t CountUTF16CodeUnits(const mozilla::Utf8Unit* begin,
const mozilla::Utf8Unit* end);
/**
* Count the number of code points in [begin, end).
*
* Unlike the UTF-8 case above, consistent with legacy ECMAScript practice,
* every sequence of 16-bit units is considered valid. Lone surrogates are
* treated as if they represented a code point of the same value.
* Count the number of UTF-16 code units in [begin, end).
*/
extern size_t CountCodePoints(const char16_t* begin, const char16_t* end);
inline size_t CountUTF16CodeUnits(const char16_t* begin, const char16_t* end) {
  // Each element of a char16_t range is itself one UTF-16 code unit, so the
  // count is simply the pointer distance between the two ends.
  MOZ_ASSERT(begin <= end);
  const auto unitCount = end - begin;
  return static_cast<size_t>(unitCount);
}
} // namespace unicode

View File

@ -231,7 +231,7 @@ class BytecodeRangeWithPosition : private BytecodeRange {
// Line number (1-origin).
size_t lineno;
// Column number (0-origin).
// Column number in UTF-16 code units (0-origin).
size_t column;
const SrcNote* sn;

View File

@ -111,7 +111,7 @@ class ErrorObject : public NativeObject {
// Line number (1-origin).
inline uint32_t lineNumber() const;
// Column number (1-origin).
// Column number in UTF-16 code units (1-origin).
inline uint32_t columnNumber() const;
inline JSObject* stack() const;

View File

@ -41,7 +41,7 @@ struct ErrorMetadata {
// Line number (1-origin).
uint32_t lineNumber;
// Column number (0-origin).
// Column number in UTF-16 code units (0-origin).
uint32_t columnNumber;
// If the error occurs at a particular location, context surrounding the

View File

@ -620,7 +620,8 @@ class ScriptSource {
// Line number within the file where this source starts (1-origin).
uint32_t startLine_ = 0;
// Column number within the file where this source starts (0-origin).
// Column number within the file where this source starts,
// in UTF-16 code units (0-origin).
uint32_t startColumn_ = 0;
// See: CompileOptions::mutedErrors.
@ -1542,7 +1543,7 @@ class BaseScript : public gc::TenuredCellWithNonGCPointer<uint8_t> {
// Line number (1-origin)
uint32_t lineno() const { return extent_.lineno; }
// Column number in Unicode Code Points (0-origin)
// Column number in UTF-16 code units (0-origin)
uint32_t column() const { return extent_.column; }
JS::DelazificationOption delazificationMode() const {

View File

@ -50,7 +50,7 @@ class SavedFrame : public NativeObject {
uint32_t getSourceId();
// Line number (1-origin).
uint32_t getLine();
// Column number (1-origin).
// Column number in UTF-16 code units (1-origin).
uint32_t getColumn();
JSAtom* getFunctionDisplayName();
JSAtom* getAsyncCause();

View File

@ -218,7 +218,7 @@ struct MOZ_STACK_CLASS SavedFrame::Lookup {
// Line number (1-origin).
uint32_t line;
// Columm number (1-origin).
// Column number in UTF-16 code units (1-origin).
uint32_t column;
JSAtom* functionDisplayName;

View File

@ -269,7 +269,7 @@ class SavedStacks {
// Line number (1-origin).
size_t line;
// Column number (1-origin).
// Column number in UTF-16 code units (1-origin).
uint32_t column;
};

View File

@ -219,8 +219,10 @@ struct SourceExtent {
uint32_t toStringEnd = 0;
// Line and column of |sourceStart_| position.
uint32_t lineno = 1; // Line number (1-origin)
uint32_t column = 0; // Column number in Unicode Code Points (0-origin)
// Line number (1-origin).
uint32_t lineno = 1;
// Column number in UTF-16 code units (0-origin).
uint32_t column = 0;
FunctionKey toFunctionKey() const {
// In eval("x=>1"), the arrow function will have a sourceStart of 0 which

View File

@ -602,8 +602,10 @@ class ErrorBase {
nsString mErrorMsg;
nsString mFileName;
uint32_t mSourceId;
uint32_t mLineNumber; // 1-origin.
uint32_t mColumn; // 1-origin.
// Line number (1-origin).
uint32_t mLineNumber;
// Column number in UTF-16 code units (1-origin).
uint32_t mColumn;
ErrorBase() : mSourceId(0), mLineNumber(0), mColumn(0) {}