gecko-dev/dom/base/nsTextFragment.cpp
Henri Sivonen 3edc601325 Bug 1402247 - Use encoding_rs for XPCOM string encoding conversions. r=Nika,erahm,froydnj.
Correctness improvements:

 * UTF errors are handled safely per spec instead of dangerously truncating
   strings.

 * There are fewer converter implementations.

Performance improvements:

 * The old code did exact buffer length math, which meant doing UTF math twice
   on each input string (once for length calculation and another time for
   conversion). Exact length math is more complicated when handling errors
   properly, which the old code didn't do. The new code does UTF math on the
   string content only once (when converting) but risks allocating more than
   once. There are heuristics in place to lower the probability of
   reallocation in cases where the double math avoidance isn't enough of a
   saving to absorb an allocation and memcpy.

 * Previously, in UTF-16 <-> UTF-8 conversions, an ASCII prefix was optimized
   but a single non-ASCII code point pessimized the rest of the string. The
   new code tries to get back on the fast ASCII path.

 * UTF-16 to Latin1 conversion guarantees less about handling of out-of-range
   input to eliminate an operation from the inner loop on x86/x86_64.

 * When assigning to a pre-existing string, the new code tries to reuse the
   old buffer instead of first releasing the old buffer and then allocating a
   new one.

 * When reallocating from the new code, the memcpy covers only the data that
   is part of the logical length of the old string instead of memcpying the
   whole capacity. (For old callers old excess memcpy behavior is preserved
   due to bogus callers. See bug 1472113.)

 * UTF-8 strings in XPConnect that are in the Latin1 range are passed to
   SpiderMonkey as Latin1.

New features:

 * Conversion between UTF-8 and Latin1 is added in order to enable faster
   future interop between Rust code (or otherwise UTF-8-using code) and text
   node and SpiderMonkey code that uses Latin1.

MozReview-Commit-ID: JaJuExfILM9
2018-08-14 14:43:42 +03:00

517 lines
14 KiB
C++

/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
/*
* A class which represents a fragment of text (eg inside a text
* node); if only codepoints below 256 are used, the text is stored as
* a char*; otherwise the text is stored as a char16_t*
*/
#include "nsTextFragment.h"
#include "nsCRT.h"
#include "nsReadableUtils.h"
#include "nsMemory.h"
#include "nsBidiUtils.h"
#include "nsUnicharUtils.h"
#include "mozilla/CheckedInt.h"
#include "mozilla/MemoryReporting.h"
#include "mozilla/SSE.h"
#include "nsTextFragmentImpl.h"
#include <algorithm>
// Number of spaces or tabs that follow the newlines in each shared
// whitespace string.
#define TEXTFRAG_WHITE_AFTER_NEWLINE 50
// Maximum number of newlines in a shared whitespace string.
#define TEXTFRAG_MAX_NEWLINES 7
// Static buffers used for common fragments. Entry i of the two whitespace
// tables is filled in by nsTextFragment::Init() with one ' ', then i '\n'
// characters, then TEXTFRAG_WHITE_AFTER_NEWLINE spaces (sSpaceSharedString)
// or tabs (sTabSharedString). The buffers are not nul-terminated.
static char* sSpaceSharedString[TEXTFRAG_MAX_NEWLINES + 1];
static char* sTabSharedString[TEXTFRAG_MAX_NEWLINES + 1];
// After nsTextFragment::Init(), sSingleCharSharedString[c] holds the single
// character value c for every c in [0, 255].
static char sSingleCharSharedString[256];
using namespace mozilla;
// static
nsresult
nsTextFragment::Init()
{
  // Build the shared whitespace buffers. Buffer number `newlines` is laid out
  // as: one ' ', then `newlines` '\n' characters, then
  // TEXTFRAG_WHITE_AFTER_NEWLINE spaces (space table) or tabs (tab table).
  for (uint32_t newlines = 0; newlines <= TEXTFRAG_MAX_NEWLINES; ++newlines) {
    const uint32_t total = 1 + newlines + TEXTFRAG_WHITE_AFTER_NEWLINE;
    const uint32_t newlineEnd = 1 + newlines;

    char* spaces = new char[total];
    char* tabs = new char[total];
    sSpaceSharedString[newlines] = spaces;
    sTabSharedString[newlines] = tabs;

    // Both tables start with a single space.
    spaces[0] = ' ';
    tabs[0] = ' ';

    // The run of newlines.
    std::fill(spaces + 1, spaces + newlineEnd, '\n');
    std::fill(tabs + 1, tabs + newlineEnd, '\n');

    // The trailing run of spaces or tabs.
    std::fill(spaces + newlineEnd, spaces + total, ' ');
    std::fill(tabs + newlineEnd, tabs + total, '\t');
  }

  // Build the single-character table: slot c holds the character value c.
  for (uint32_t ch = 0; ch < 256; ++ch) {
    sSingleCharSharedString[ch] = ch;
  }

  return NS_OK;
}
// static
void
nsTextFragment::Shutdown()
{
  // Free every shared whitespace buffer and clear its table slot so no
  // stale pointer is left behind.
  for (uint32_t idx = 0; idx <= TEXTFRAG_MAX_NEWLINES; ++idx) {
    char*& spaces = sSpaceSharedString[idx];
    char*& tabs = sTabSharedString[idx];

    delete [] spaces;
    delete [] tabs;

    spaces = nullptr;
    tabs = nullptr;
  }
}
// Releases whatever text storage this fragment holds (see ReleaseText()) and
// then records the destruction with MOZ_COUNT_DTOR.
nsTextFragment::~nsTextFragment()
{
ReleaseText();
MOZ_COUNT_DTOR(nsTextFragment);
}
void
nsTextFragment::ReleaseText()
{
  // Drop whatever storage we currently reference: a reference on the 2-byte
  // string buffer, or a malloc'ed 1-byte buffer. 1-byte data that is not in
  // the heap is not freed.
  if (mState.mIs2b) {
    NS_RELEASE(m2b);
  } else {
    const bool ownsHeapBuffer = mState.mLength && m1b && mState.mInHeap;
    if (ownsHeapBuffer) {
      free(const_cast<char*>(m1b));
    }
  }

  m1b = nullptr;
  mState.mIsBidi = false;

  // Clearing mAllBits resets mState.mIs2b, mState.mInHeap and
  // mState.mLength to 0 in one store.
  mAllBits = 0;
}
/**
 * Copy-assigns the contents of aOther to this fragment.
 *
 * - Text that is not in the heap is referenced directly (pointer copy).
 * - 2-byte text shares aOther's nsStringBuffer and adds a reference to it.
 * - Heap-allocated 1-byte text is duplicated with malloc/memcpy. If that
 *   allocation fails, this fragment becomes a single U+FFFD REPLACEMENT
 *   CHARACTER stored as 2-byte text; if even that allocation fails the
 *   process crashes via MOZ_CRASH.
 *
 * Assigning a fragment to itself is a no-op.
 *
 * @param aOther the fragment to copy from.
 * @return *this.
 */
nsTextFragment&
nsTextFragment::operator=(const nsTextFragment& aOther)
{
  // Self-assignment must not touch our state: ReleaseText() below would
  // otherwise drop the very buffer we are about to copy from and leave the
  // fragment empty.
  if (this == &aOther) {
    return *this;
  }

  ReleaseText();

  if (aOther.mState.mLength) {
    if (!aOther.mState.mInHeap) {
      // Non-heap text is only ever 1-byte; just point at the same data.
      MOZ_ASSERT(!aOther.mState.mIs2b);
      m1b = aOther.m1b;
    } else if (aOther.mState.mIs2b) {
      // Share the string buffer.
      m2b = aOther.m2b;
      NS_ADDREF(m2b);
    } else {
      // Heap-allocated 1-byte text: make our own copy.
      m1b = static_cast<char*>(malloc(aOther.mState.mLength));
      if (m1b) {
        memcpy(const_cast<char*>(m1b), aOther.m1b, aOther.mState.mLength);
      } else {
        // allocate a buffer for a single REPLACEMENT CHARACTER
        m2b = nsStringBuffer::Alloc(sizeof(char16_t) * 2).take();
        if (!m2b) {
          MOZ_CRASH("OOM!");
        }
        char16_t* data = static_cast<char16_t*>(m2b->Data());
        data[0] = 0xFFFD; // REPLACEMENT CHARACTER
        data[1] = char16_t(0);
        mState.mIs2b = true;
        mState.mInHeap = true;
        mState.mLength = 1;
        return *this;
      }
    }

    // Copy all state bits from aOther in one go.
    mAllBits = aOther.mAllBits;
  }

  return *this;
}
// Scalar scan of [str, end) for a UTF-16 code unit greater than 255.
//
// Returns -1 if no such code unit is found. Otherwise returns an index that
// is less than or equal to the index of the first such code unit: the
// per-character loops return the exact index, while the word-at-a-time loop
// returns the index of the first code unit of the word in which the mask
// test fired.
static inline int32_t
FirstNon8BitUnvectorized(const char16_t *str, const char16_t *end)
{
// Word-size dependent constants come from Non8BitParameters, which is not
// defined in this file. NOTE(review): mask presumably selects the high byte
// of every char16_t packed in a size_t -- confirm in nsTextFragmentImpl.h.
typedef Non8BitParameters<sizeof(size_t)> p;
const size_t mask = p::mask();
const uint32_t alignMask = p::alignMask();
const uint32_t numUnicharsPerWord = p::numUnicharsPerWord();
const int32_t len = end - str;
int32_t i = 0;
// Align ourselves to a word boundary.
// alignLen is the number of code units to check one by one before the
// word-at-a-time loop starts, capped at len.
int32_t alignLen =
std::min(len, int32_t(((-NS_PTR_TO_INT32(str)) & alignMask) / sizeof(char16_t)));
for (; i < alignLen; i++) {
if (str[i] > 255)
return i;
}
// Check one word at a time.
// wordWalkEnd is the remaining length rounded down to a whole number of
// words.
const int32_t wordWalkEnd = ((len - i) / numUnicharsPerWord) * numUnicharsPerWord;
for (; i < wordWalkEnd; i += numUnicharsPerWord) {
const size_t word = *reinterpret_cast<const size_t*>(str + i);
// A hit here only pins the result down to this word, so the start of the
// word is returned rather than the exact index.
if (word & mask)
return i;
}
// Take care of the remainder one character at a time.
for (; i < len; i++) {
if (str[i] > 255)
return i;
}
return -1;
}
// Forward declaration of the SSE2 variant of the non-8-bit scan. Only a
// declaration is visible here; it is compiled in only when
// MOZILLA_MAY_SUPPORT_SSE2 is defined.
#ifdef MOZILLA_MAY_SUPPORT_SSE2
namespace mozilla {
namespace SSE2 {
int32_t FirstNon8Bit(const char16_t *str, const char16_t *end);
} // namespace SSE2
} // namespace mozilla
#endif
/*
* This function returns -1 if all characters in str are 8-bit characters.
* Otherwise, it returns a value less than or equal to the index of the first
* non-8-bit character in str. For example, if the first non-8-bit character is
* at position 25, it may return 25, or 24, or 16. It guarantees, however, that
* there is no non-8-bit character before the returned index.
*/
static inline int32_t
FirstNon8Bit(const char16_t *str, const char16_t *end)
{
  // Prefer the SSE2 implementation when it is compiled in and the CPU
  // reports support for it at runtime; otherwise use the scalar scan.
#ifdef MOZILLA_MAY_SUPPORT_SSE2
  const bool useSSE2 = mozilla::supports_sse2();
  if (useSSE2) {
    return mozilla::SSE2::FirstNon8Bit(str, end);
  }
#endif

  return FirstNon8BitUnvectorized(str, end);
}
/**
 * Replaces the contents of this fragment with the aLength UTF-16 code units
 * starting at aBuffer.
 *
 * Storage is chosen in this order:
 *  1. If aForce2b is set and we already own a writable 2-byte buffer of a
 *     suitable size, that buffer is reused in place.
 *  2. Unless aForce2b is set, a single character below 256 or a short
 *     "[space] newlines spaces-or-tabs" run points at a shared static string.
 *  3. If aForce2b is set or the text contains a code unit above 255, a new
 *     nul-terminated 2-byte nsStringBuffer is allocated.
 *  4. Otherwise the text is narrowed into a malloc'ed 1-byte buffer.
 *
 * @param aBuffer     source UTF-16 text; read whenever aLength is non-zero.
 * @param aLength     number of code units at aBuffer.
 * @param aUpdateBidi if true, update the bidi flag from the new text when it
 *                    ends up in 2-byte storage.
 * @param aForce2b    if true, always use 2-byte storage.
 * @return true on success; false if an allocation fails or the required
 *         2-byte buffer size cannot be represented. On failure the previous
 *         text has already been released.
 */
bool
nsTextFragment::SetTo(const char16_t* aBuffer, int32_t aLength,
                      bool aUpdateBidi, bool aForce2b)
{
  if (aForce2b && mState.mIs2b && !m2b->IsReadonly()) {
    uint32_t storageSize = m2b->StorageSize();
    uint32_t neededSize = aLength * sizeof(char16_t);
    if (!neededSize) {
      if (storageSize < AutoStringDefaultStorageSize) {
        // If we're storing small enough nsStringBuffer, let's preserve it.
        static_cast<char16_t*>(m2b->Data())[0] = char16_t(0);
        mState.mLength = 0;
        mState.mIsBidi = false;
        return true;
      }
    } else if ((neededSize < storageSize) &&
               ((storageSize / 2) <
                (neededSize + AutoStringDefaultStorageSize))) {
      // Reuse the existing nsStringBuffer only when the new text fits and
      // would not leave lots of unused space behind.
      memcpy(m2b->Data(), aBuffer, neededSize);
      static_cast<char16_t*>(m2b->Data())[aLength] = char16_t(0);
      mState.mLength = aLength;
      mState.mIsBidi = false;
      if (aUpdateBidi) {
        UpdateBidiFlag(aBuffer, aLength);
      }
      return true;
    }
  }

  ReleaseText();

  if (aLength == 0) {
    return true;
  }

  char16_t firstChar = *aBuffer;
  if (!aForce2b && aLength == 1 && firstChar < 256) {
    // Single 8-bit character: point into the shared single-char table.
    m1b = sSingleCharSharedString + firstChar;
    mState.mInHeap = false;
    mState.mIs2b = false;
    mState.mLength = 1;
    return true;
  }

  const char16_t *ucp = aBuffer;
  const char16_t *uend = aBuffer + aLength;

  // Check if we can use a shared string
  if (!aForce2b &&
      aLength <= 1 + TEXTFRAG_WHITE_AFTER_NEWLINE + TEXTFRAG_MAX_NEWLINES &&
      (firstChar == ' ' || firstChar == '\n' || firstChar == '\t')) {
    if (firstChar == ' ') {
      ++ucp;
    }

    const char16_t* start = ucp;
    while (ucp < uend && *ucp == '\n') {
      ++ucp;
    }
    const char16_t* endNewLine = ucp;

    // The run after the newlines must consist of one kind of character only:
    // tabs if the first one is a tab, spaces otherwise.
    char16_t space = ucp < uend && *ucp == '\t' ? '\t' : ' ';
    while (ucp < uend && *ucp == space) {
      ++ucp;
    }

    if (ucp == uend &&
        endNewLine - start <= TEXTFRAG_MAX_NEWLINES &&
        ucp - endNewLine <= TEXTFRAG_WHITE_AFTER_NEWLINE) {
      char** strings = space == ' ' ? sSpaceSharedString : sTabSharedString;
      m1b = strings[endNewLine - start];

      // If we didn't find a space in the beginning, skip it now.
      if (firstChar != ' ') {
        ++m1b;
      }

      mState.mInHeap = false;
      mState.mIs2b = false;
      mState.mLength = aLength;

      return true;
    }
  }

  // See if we need to store the data in ucs2 or not.
  // Note that first16bit is an offset from ucp, which the shared-string probe
  // above may have advanced past leading whitespace. Because ucp >= aBuffer,
  // aBuffer + first16bit is still at or before the first non-8bit character,
  // so the bidi scan below cannot miss anything.
  int32_t first16bit = aForce2b ? 0 : FirstNon8Bit(ucp, uend);

  if (first16bit != -1) { // 2b storage is forced or aBuffer has a non-8bit character
    // Use ucs2 storage because we have to.
    // Do the whole size computation in checked arithmetic; adding 1 to
    // aLength as a plain int32_t first could overflow before the check.
    CheckedUint32 m2bSize(aLength);
    m2bSize += 1;
    m2bSize *= sizeof(char16_t);
    if (!m2bSize.isValid()) {
      return false;
    }

    m2b = nsStringBuffer::Alloc(m2bSize.value()).take();
    if (!m2b) {
      return false;
    }
    memcpy(m2b->Data(), aBuffer, aLength * sizeof(char16_t));
    static_cast<char16_t*>(m2b->Data())[aLength] = char16_t(0);

    mState.mIs2b = true;
    if (aUpdateBidi) {
      UpdateBidiFlag(aBuffer + first16bit, aLength - first16bit);
    }
  } else {
    // Use 1 byte storage because we can
    char* buff = static_cast<char*>(malloc(aLength));
    if (!buff) {
      return false;
    }

    // Copy data
    LossyConvertUTF16toLatin1(MakeSpan(aBuffer, aLength),
                              MakeSpan(buff, aLength));
    m1b = buff;
    mState.mIs2b = false;
  }

  // Setup our fields
  mState.mInHeap = true;
  mState.mLength = aLength;

  return true;
}
void
nsTextFragment::CopyTo(char16_t *aDest, int32_t aOffset, int32_t aCount)
{
NS_ASSERTION(aOffset >= 0, "Bad offset passed to nsTextFragment::CopyTo()!");
NS_ASSERTION(aCount >= 0, "Bad count passed to nsTextFragment::CopyTo()!");
if (aOffset < 0) {
aOffset = 0;
}
if (uint32_t(aOffset + aCount) > GetLength()) {
aCount = mState.mLength - aOffset;
}
if (aCount != 0) {
if (mState.mIs2b) {
memcpy(aDest, Get2b() + aOffset, sizeof(char16_t) * aCount);
} else {
const char *cp = m1b + aOffset;
ConvertLatin1toUTF16(MakeSpan(cp, aCount), MakeSpan(aDest, aCount));
}
}
}
// Appends the aLength UTF-16 code units at aBuffer to this fragment.
//
// - If the fragment is empty, this is equivalent to SetTo().
// - If the fragment is already 2-byte, the string buffer is grown (or
//   replaced by a new one when the current buffer is read-only).
// - If the fragment is 1-byte and the new data needs 2-byte storage (or
//   aForce2b is set), the old text is widened into a new 2-byte buffer.
// - Otherwise the new data is narrowed and appended to the 1-byte buffer.
//
// aUpdateBidi: if true, update the bidi flag from the appended text when the
//              result is 2-byte.
// aForce2b:    if true, a 1-byte fragment is converted to 2-byte storage.
//
// Returns true on success. Returns false, leaving the existing text in
// place, if the resulting length would exceed NS_MAX_TEXT_FRAGMENT_LENGTH,
// the byte size would overflow, or an allocation fails.
bool
nsTextFragment::Append(const char16_t* aBuffer, uint32_t aLength,
bool aUpdateBidi, bool aForce2b)
{
if (!aLength) {
return true;
}
// This is a common case because some callsites create a textnode
// with a value by creating the node and then calling AppendData.
if (mState.mLength == 0) {
return SetTo(aBuffer, aLength, aUpdateBidi, aForce2b);
}
// Should we optimize for aData.Length() == 0?
// FYI: Don't use CheckedInt in this method since here is very hot path
// in some performance tests.
if (NS_MAX_TEXT_FRAGMENT_LENGTH - mState.mLength < aLength) {
return false; // Would be overflown if we'd keep handling.
}
if (mState.mIs2b) {
// +1 for the trailing nul code unit.
size_t size = mState.mLength + aLength + 1;
if (SIZE_MAX / sizeof(char16_t) < size) {
return false; // Would be overflown if we'd keep handling.
}
size *= sizeof(char16_t);
// Already a 2-byte string so the result will be too
nsStringBuffer* buff = nullptr;
nsStringBuffer* bufferToRelease = nullptr;
if (m2b->IsReadonly()) {
// The current buffer is read-only, so copy the old text into a fresh
// buffer and release our reference to the old one once we are done.
buff = nsStringBuffer::Alloc(size).take();
if (!buff) {
return false;
}
bufferToRelease = m2b;
memcpy(static_cast<char16_t*>(buff->Data()), m2b->Data(),
mState.mLength * sizeof(char16_t));
} else {
buff = nsStringBuffer::Realloc(m2b, size);
if (!buff) {
return false;
}
}
char16_t* data = static_cast<char16_t*>(buff->Data());
memcpy(data + mState.mLength, aBuffer,
aLength * sizeof(char16_t));
mState.mLength += aLength;
m2b = buff;
data[mState.mLength] = char16_t(0);
NS_IF_RELEASE(bufferToRelease);
if (aUpdateBidi) {
UpdateBidiFlag(aBuffer, aLength);
}
return true;
}
// Current string is a 1-byte string, check if the new data fits in one byte too.
int32_t first16bit = aForce2b ? 0 : FirstNon8Bit(aBuffer, aBuffer + aLength);
if (first16bit != -1) { // 2b storage is forced or aBuffer has a non-8bit character
// +1 for the trailing nul code unit.
size_t size = mState.mLength + aLength + 1;
if (SIZE_MAX / sizeof(char16_t) < size) {
return false; // Would be overflown if we'd keep handling.
}
size *= sizeof(char16_t);
// The old data was 1-byte, but the new is not so we have to expand it
// all to 2-byte
nsStringBuffer* buff = nsStringBuffer::Alloc(size).take();
if (!buff) {
return false;
}
// Copy data into buff
char16_t* data = static_cast<char16_t*>(buff->Data());
ConvertLatin1toUTF16(MakeSpan(m1b, mState.mLength),
MakeSpan(data, mState.mLength));
memcpy(data + mState.mLength, aBuffer, aLength * sizeof(char16_t));
mState.mLength += aLength;
mState.mIs2b = true;
// Free the old 1-byte buffer before m2b is assigned below; only a
// heap-allocated buffer is freed.
if (mState.mInHeap) {
free(const_cast<char*>(m1b));
}
data[mState.mLength] = char16_t(0);
m2b = buff;
mState.mInHeap = true;
if (aUpdateBidi) {
UpdateBidiFlag(aBuffer + first16bit, aLength - first16bit);
}
return true;
}
// The new and the old data is all 1-byte
size_t size = mState.mLength + aLength;
MOZ_ASSERT(sizeof(char) == 1);
char* buff;
if (mState.mInHeap) {
buff = static_cast<char*>(realloc(const_cast<char*>(m1b), size));
if (!buff) {
return false;
}
}
else {
// The old text is not in the heap, so it must be copied into a newly
// malloc'ed buffer rather than reallocated.
buff = static_cast<char*>(malloc(size));
if (!buff) {
return false;
}
memcpy(buff, m1b, mState.mLength);
mState.mInHeap = true;
}
// Copy aBuffer into buff.
LossyConvertUTF16toLatin1(MakeSpan(aBuffer, aLength),
MakeSpan(buff + mState.mLength, aLength));
m1b = buff;
mState.mLength += aLength;
return true;
}
/* virtual */ size_t
nsTextFragment::SizeOfExcludingThis(mozilla::MallocSizeOf aMallocSizeOf) const
{
  // 1-byte text: only a heap-allocated buffer is measured; anything else
  // contributes nothing.
  if (!Is2b()) {
    return mState.mInHeap ? aMallocSizeOf(m1b) : 0;
  }

  // 2-byte text: let the string buffer report its own size, which it does
  // only if it is unshared.
  return m2b->SizeOfIncludingThisIfUnshared(aMallocSizeOf);
}
// The bidi scan is only run on request (when a caller really wants to know),
// not as part of every allocation.
void
nsTextFragment::UpdateBidiFlag(const char16_t* aBuffer, uint32_t aLength)
{
  // Nothing to do for 1-byte text, or when the flag is already set.
  if (!mState.mIs2b || mState.mIsBidi) {
    return;
  }

  const bool hasRTL = HasRTLChars(MakeSpan(aBuffer, aLength));
  if (hasRTL) {
    mState.mIsBidi = true;
  }
}