gecko-dev/xpcom/string/nsUTF8Utils.h
Henri Sivonen 3edc601325 Bug 1402247 - Use encoding_rs for XPCOM string encoding conversions. r=Nika,erahm,froydnj.
Correctness improvements:

 * UTF errors are handled safely per spec instead of dangerously truncating
   strings.

 * There are fewer converter implementations.

Performance improvements:

 * The old code did exact buffer length math, which meant doing UTF math twice
   on each input string (once for length calculation and another time for
   conversion). Exact length math is more complicated when handling errors
   properly, which the old code didn't do. The new code does UTF math on the
   string content only once (when converting) but risks allocating more than
   once. There are heuristics in place to lower the probability of
   reallocation in cases where the double math avoidance isn't enough of a
   saving to absorb an allocation and memcpy.

 * Previously, in UTF-16 <-> UTF-8 conversions, an ASCII prefix was optimized
   but a single non-ASCII code point pessimized the rest of the string. The
   new code tries to get back on the fast ASCII path.

 * UTF-16 to Latin1 conversion guarantees less about handling of out-of-range
   input to eliminate an operation from the inner loop on x86/x86_64.

 * When assigning to a pre-existing string, the new code tries to reuse the
   old buffer instead of first releasing the old buffer and then allocating a
   new one.

 * When reallocating from the new code, the memcpy covers only the data that
   is part of the logical length of the old string instead of memcpying the
   whole capacity. (For old callers old excess memcpy behavior is preserved
   due to bogus callers. See bug 1472113.)

 * UTF-8 strings in XPConnect that are in the Latin1 range are passed to
   SpiderMonkey as Latin1.

New features:

 * Conversion between UTF-8 and Latin1 is added in order to enable faster
   future interop between Rust code (or otherwise UTF-8-using code) and text
   node and SpiderMonkey code that uses Latin1.

MozReview-Commit-ID: JaJuExfILM9
2018-08-14 14:43:42 +03:00

270 lines
7.5 KiB
C++

/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#ifndef nsUTF8Utils_h_
#define nsUTF8Utils_h_
// This file may be used in two ways: if MOZILLA_INTERNAL_API is defined, this
// file will provide signatures for the Mozilla abstract string types and will
// use XPCOM assertion/debugging macros (UTF8UTILS_WARNING maps to NS_WARNING).
// Otherwise, UTF8UTILS_WARNING expands to nothing.
#include "nscore.h"
#include "mozilla/Assertions.h"
#include "mozilla/EndianUtils.h"
#include "mozilla/TypeTraits.h"
#include "nsCharTraits.h"
#ifdef MOZILLA_INTERNAL_API
#define UTF8UTILS_WARNING(msg) NS_WARNING(msg)
#else
#define UTF8UTILS_WARNING(msg)
#endif
/**
 * Predicates that classify a single UTF-8 code unit purely by its leading
 * bits, plus a helper that maps a lead byte to the length of its sequence.
 */
class UTF8traits
{
public:
  // Leading bit is 0: a one-byte (ASCII) unit.
  static bool isASCII(char aChar)
  {
    return LeadingBits(aChar, 1) == 0x0U;
  }

  // Leading bits are 10: a continuation unit inside a multi-byte sequence.
  static bool isInSeq(char aChar)
  {
    return LeadingBits(aChar, 2) == 0x2U;
  }

  // Leading bits are 110: lead unit of a two-byte sequence.
  static bool is2byte(char aChar)
  {
    return LeadingBits(aChar, 3) == 0x6U;
  }

  // Leading bits are 1110: lead unit of a three-byte sequence.
  static bool is3byte(char aChar)
  {
    return LeadingBits(aChar, 4) == 0xEU;
  }

  // Leading bits are 11110: lead unit of a four-byte sequence.
  static bool is4byte(char aChar)
  {
    return LeadingBits(aChar, 5) == 0x1EU;
  }

  // Leading bits are 111110: lead unit of a five-byte form.
  static bool is5byte(char aChar)
  {
    return LeadingBits(aChar, 6) == 0x3EU;
  }

  // Leading bits are 1111110: lead unit of a six-byte form.
  static bool is6byte(char aChar)
  {
    return LeadingBits(aChar, 7) == 0x7EU;
  }

  // Return the number of bytes in a sequence beginning with aChar. Only the
  // one- to four-byte lead patterns are recognized; anything else trips the
  // unreachable assertion and is reported as length 1.
  static int bytes(char aChar)
  {
    int length = 1;
    if (isASCII(aChar)) {
      // Single unit; keep the default.
    } else if (is2byte(aChar)) {
      length = 2;
    } else if (is3byte(aChar)) {
      length = 3;
    } else if (is4byte(aChar)) {
      length = 4;
    } else {
      MOZ_ASSERT_UNREACHABLE("should not be used for in-sequence characters");
    }
    return length;
  }

private:
  // Returns the aCount most significant bits of aChar, right-aligned. The
  // cast to unsigned char keeps the result independent of char's signedness.
  static unsigned LeadingBits(char aChar, unsigned aCount)
  {
    return static_cast<unsigned>(static_cast<unsigned char>(aChar)) >>
           (8U - aCount);
  }
};
/**
 * Extract the next Unicode scalar value from the buffer and return it. The
 * pointer passed in is advanced to the start of the next character in the
 * buffer. Upon error, the return value is 0xFFFD, *aBuffer is advanced
 * over the maximal valid prefix and *aErr is set to true (if aErr is not
 * null).
 *
 * A multi-byte sequence that is cut short by aEnd is treated as an error:
 * no byte at or beyond aEnd is ever read, and *aBuffer never ends up past
 * aEnd.
 *
 * Note: This method never sets *aErr to false to allow error accumulation
 * across multiple calls.
 *
 * Precondition: *aBuffer < aEnd
 */
class UTF8CharEnumerator
{
public:
  /**
   * @param aBuffer in/out cursor into the UTF-8 data; must be non-null and
   *                point to a non-null pointer strictly below aEnd.
   * @param aEnd    one-past-the-end pointer of the UTF-8 data.
   * @param aErr    optional error flag; set to true on malformed input and
   *                otherwise left untouched.
   * @return the decoded scalar value, or 0xFFFD on malformed input.
   */
  static inline char32_t NextChar(const char** aBuffer,
                                  const char* aEnd,
                                  bool* aErr = nullptr)
  {
    MOZ_ASSERT(aBuffer, "null buffer pointer pointer");
    MOZ_ASSERT(aEnd, "null end pointer");

    const unsigned char* p = reinterpret_cast<const unsigned char*>(*aBuffer);
    const unsigned char* end = reinterpret_cast<const unsigned char*>(aEnd);

    MOZ_ASSERT(p, "null buffer");
    MOZ_ASSERT(p < end, "Bogus range");

    unsigned char first = *p++;

    if (MOZ_LIKELY(first < 0x80U)) {
      *aBuffer = reinterpret_cast<const char*>(p);
      return first;
    }

    // Reject a lead byte with nothing after it, and any byte outside
    // 0xC2..0xF4 (continuation bytes, overlong two-byte leads, leads past
    // U+10FFFF).
    // Unsigned underflow is defined behavior
    if (MOZ_UNLIKELY((p == end) || ((first - 0xC2U) >= (0xF5U - 0xC2U)))) {
      return Malformed(aBuffer, p, aErr);
    }

    // p < end here, so reading the second byte is in bounds.
    unsigned char second = *p;

    if (first < 0xE0U) {
      // Two-byte
      if (MOZ_LIKELY((second & 0xC0U) == 0x80U)) {
        *aBuffer = reinterpret_cast<const char*>(++p);
        return ((uint32_t(first) & 0x1FU) << 6) | (uint32_t(second) & 0x3FU);
      }
      return Malformed(aBuffer, p, aErr);
    }

    if (MOZ_LIKELY(first < 0xF0U)) {
      // Three-byte
      // The valid range of the second byte is narrowed for 0xE0 (excludes
      // overlong forms) and 0xED (excludes surrogates).
      unsigned char lower = 0x80U;
      unsigned char upper = 0xBFU;
      if (first == 0xE0U) {
        lower = 0xA0U;
      } else if (first == 0xEDU) {
        upper = 0x9FU;
      }
      if (MOZ_LIKELY(second >= lower && second <= upper)) {
        // Step past the validated second byte *before* the bounds check so
        // that the third byte is only read when it lies inside the buffer.
        ++p;
        if (MOZ_LIKELY(p != end)) {
          unsigned char third = *p;
          if (MOZ_LIKELY((third & 0xC0U) == 0x80U)) {
            *aBuffer = reinterpret_cast<const char*>(++p);
            return ((uint32_t(first) & 0xFU) << 12) |
                   ((uint32_t(second) & 0x3FU) << 6) |
                   (uint32_t(third) & 0x3FU);
          }
        }
      }
      return Malformed(aBuffer, p, aErr);
    }

    // Four-byte
    // The valid range of the second byte is narrowed for 0xF0 (excludes
    // overlong forms) and 0xF4 (excludes values above U+10FFFF).
    unsigned char lower = 0x80U;
    unsigned char upper = 0xBFU;
    if (first == 0xF0U) {
      lower = 0x90U;
    } else if (first == 0xF4U) {
      upper = 0x8FU;
    }
    if (MOZ_LIKELY(second >= lower && second <= upper)) {
      // As above: advance first, then bounds-check, then dereference.
      ++p;
      if (MOZ_LIKELY(p != end)) {
        unsigned char third = *p;
        if (MOZ_LIKELY((third & 0xC0U) == 0x80U)) {
          ++p;
          if (MOZ_LIKELY(p != end)) {
            unsigned char fourth = *p;
            if (MOZ_LIKELY((fourth & 0xC0U) == 0x80U)) {
              *aBuffer = reinterpret_cast<const char*>(++p);
              return ((uint32_t(first) & 0x7U) << 18) |
                     ((uint32_t(second) & 0x3FU) << 12) |
                     ((uint32_t(third) & 0x3FU) << 6) |
                     (uint32_t(fourth) & 0x3FU);
            }
          }
        }
      }
    }
    return Malformed(aBuffer, p, aErr);
  }

private:
  // Shared error tail: resume decoding at aResume (the first byte that is not
  // part of the valid prefix), flag the error if requested, and yield U+FFFD.
  static inline char32_t Malformed(const char** aBuffer,
                                   const unsigned char* aResume,
                                   bool* aErr)
  {
    *aBuffer = reinterpret_cast<const char*>(aResume);
    if (aErr) {
      *aErr = true;
    }
    return 0xFFFDU;
  }
};
/**
 * Decode one Unicode scalar value from a UTF-16 buffer and return it,
 * moving *aBuffer to the start of the following character. If the code unit
 * at *aBuffer is an unpaired surrogate, 0xFFFD is returned, *aBuffer is moved
 * past that single code unit and *aErr is set to true (when aErr is not
 * null).
 *
 * Note: *aErr is never reset to false here, so errors accumulate across
 * repeated calls.
 *
 * Precondition: *aBuffer < aEnd
 */
class UTF16CharEnumerator
{
public:
  static inline char32_t NextChar(const char16_t** aBuffer,
                                  const char16_t* aEnd,
                                  bool* aErr = nullptr)
  {
    MOZ_ASSERT(aBuffer, "null buffer pointer pointer");
    MOZ_ASSERT(aEnd, "null end pointer");

    const char16_t* cursor = *aBuffer;

    MOZ_ASSERT(cursor, "null buffer");
    MOZ_ASSERT(cursor < aEnd, "Bogus range");

    const char16_t unit = *cursor;
    ++cursor;

    // Distance of the unit from the start of the surrogate block. The
    // subtraction wraps (unsigned, then truncated to 16 bits), so every
    // non-surrogate lands above the block size.
    const char16_t surrogateOffset = unit - 0xD800U;

    if (MOZ_LIKELY(surrogateOffset > (0xDFFFU - 0xD800U))) {
      // Not a surrogate: the code unit is the scalar value.
      *aBuffer = cursor;
      return unit;
    }

    // A high (lead) surrogate needs a following low (trail) surrogate.
    if (MOZ_LIKELY(surrogateOffset <= (0xDBFFU - 0xD800U)) &&
        MOZ_LIKELY(cursor != aEnd)) {
      const char16_t trail = *cursor;
      // Same wrapping range-check trick, this time for 0xDC00..0xDFFF.
      if (MOZ_LIKELY((trail - 0xDC00U) <= (0xDFFFU - 0xDC00U))) {
        *aBuffer = cursor + 1;
        // Equivalent to ((unit - 0xD800) << 10) + (trail - 0xDC00) + 0x10000
        // with the constants folded together.
        return (uint32_t(unit) << 10) + uint32_t(trail) -
               (((0xD800U << 10) - 0x10000U) + 0xDC00U);
      }
    }

    // Lone low surrogate, or high surrogate without a valid partner: consume
    // only the first code unit.
    *aBuffer = cursor;
    if (aErr) {
      *aErr = true;
    }
    return 0xFFFDU;
  }
};
/**
 * Starting at `index`, step backwards while the unit at the current position
 * is a UTF-8 continuation byte (10xxxxxx) and return the resulting index.
 * The walk stops at index 0 without inspecting what is stored there. The
 * unit at the initial `index` is read, so it must be addressable.
 */
template<typename Char, typename UnsignedT>
inline UnsignedT
RewindToPriorUTF8Codepoint(const Char* utf8Chars, UnsignedT index)
{
  static_assert(mozilla::IsSame<Char, char>::value ||
                mozilla::IsSame<Char, unsigned char>::value ||
                mozilla::IsSame<Char, signed char>::value,
                "UTF-8 data must be in 8-bit units");
  static_assert(mozilla::IsUnsigned<UnsignedT>::value, "index type must be unsigned");

  for (; index > 0; --index) {
    const bool onContinuationByte = (utf8Chars[index] & 0xC0) == 0x80;
    if (!onContinuationByte) {
      break;
    }
  }
  return index;
}
#undef UTF8UTILS_WARNING
#endif /* !defined(nsUTF8Utils_h_) */