mirror of
https://github.com/mozilla/gecko-dev.git
synced 2024-11-27 14:52:16 +00:00
515689e933
When compiled with -std=c++17, u8"" string literals have type `const char[]`. When compiled with -std=c++20, u8"" string literals have type `const char8_t[]`. This patch adds the minimum char8_t overloads needed to compile Firefox with -std=c++20. If we want to use char8_t in more code or replace `Utf8Unit` with char8_t, that will require an extensive redesign of MFBT's and xpcom's string classes after we update from -std=c++17 to c++20. Differential Revision: https://phabricator.services.mozilla.com/D201672
296 lines
9.0 KiB
C++
296 lines
9.0 KiB
C++
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
|
|
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
|
|
/* This Source Code Form is subject to the terms of the Mozilla Public
|
|
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
|
|
|
/* Character/text operations. */
|
|
|
|
#ifndef mozilla_TextUtils_h
|
|
#define mozilla_TextUtils_h
|
|
|
|
#include "mozilla/Assertions.h"
|
|
#include "mozilla/Latin1.h"
|
|
|
|
#ifdef MOZ_HAS_JSRUST
// mozilla/Encoding.h can't be included from this header, so the function is
// declared here directly.
//
// NOTE(review): the rest of this file tests `#if MOZ_HAS_JSRUST()`, i.e. it
// treats MOZ_HAS_JSRUST as a function-like macro. `#ifdef` only checks that
// the macro is defined, not what it expands to -- confirm that is intended.
extern "C" {
// The buffer is typed as uint8_t rather than char so that this declaration
// matches the declaration in another header.
size_t encoding_ascii_valid_up_to(const uint8_t* buffer, size_t buffer_len);
}
#endif
|
|
|
|
namespace mozilla {
|
|
|
|
// See Utf8.h for IsUtf8() and conversions between UTF-8 and UTF-16.
|
|
// See Latin1.h for testing UTF-16 and UTF-8 for Latin1ness and
|
|
// for conversions to and from Latin1.
|
|
|
|
// The overloads below are not templated in order to make
|
|
// implicit conversions to span work as expected for the Span
|
|
// overloads.
|
|
|
|
/** Returns true iff |aChar| is ASCII, i.e. in the range [0, 0x80). */
|
|
inline constexpr bool IsAscii(unsigned char aChar) {
  // 0x7F is the largest ASCII code point.
  return aChar <= 0x7F;
}
|
|
|
|
/**
 * Returns true iff |aChar| is ASCII, i.e. its value reinterpreted as an
 * unsigned char lies in the range [0, 0x80).
 */
inline constexpr bool IsAscii(signed char aChar) {
  // Compare the unsigned representation so that negative values (bytes with
  // the high bit set) are rejected.
  return static_cast<unsigned char>(aChar) < 0x80;
}
|
|
|
|
/**
 * Returns true iff |aChar| is ASCII, i.e. its value reinterpreted as an
 * unsigned char lies in the range [0, 0x80).
 */
inline constexpr bool IsAscii(char aChar) {
  // The cast makes the result independent of whether plain char is signed.
  return static_cast<unsigned char>(aChar) < 0x80;
}
|
|
|
|
#ifdef __cpp_char8_t
/**
 * Returns true iff |aChar| is ASCII, i.e. in the range [0, 0x80).
 *
 * Only available when the compiler provides char8_t.
 */
inline constexpr bool IsAscii(char8_t aChar) {
  return static_cast<unsigned char>(aChar) < 0x80;
}
#endif
|
|
|
|
/** Returns true iff |aChar| is ASCII, i.e. in the range [0, 0x80). */
|
|
inline constexpr bool IsAscii(char16_t aChar) {
  // 0x7F is the largest ASCII code point.
  return aChar <= 0x7F;
}
|
|
|
|
/** Returns true iff |aChar| is ASCII, i.e. in the range [0, 0x80). */
|
|
inline constexpr bool IsAscii(char32_t aChar) {
  // 0x7F is the largest ASCII code point.
  return aChar <= 0x7F;
}
|
|
|
|
/**
|
|
* Returns |true| iff |aString| contains only ASCII characters, that is,
|
|
* characters in the range [0x00, 0x80).
|
|
*
|
|
* @param aString a 8-bit wide string to scan
|
|
*/
|
|
inline bool IsAscii(mozilla::Span<const char> aString) {
|
|
#if MOZ_HAS_JSRUST()
|
|
size_t length = aString.Length();
|
|
const char* ptr = aString.Elements();
|
|
// For short strings, avoid the function call, since, the SIMD
|
|
// code won't have a chance to kick in anyway.
|
|
if (length < mozilla::detail::kShortStringLimitForInlinePaths) {
|
|
const uint8_t* uptr = reinterpret_cast<const uint8_t*>(ptr);
|
|
uint8_t accu = 0;
|
|
for (size_t i = 0; i < length; i++) {
|
|
accu |= uptr[i];
|
|
}
|
|
return accu < 0x80;
|
|
}
|
|
return encoding_mem_is_ascii(ptr, length);
|
|
#else
|
|
for (char c : aString) {
|
|
if (!IsAscii(c)) {
|
|
return false;
|
|
}
|
|
}
|
|
return true;
|
|
#endif
|
|
}
|
|
|
|
/**
|
|
* Returns |true| iff |aString| contains only ASCII characters, that is,
|
|
* characters in the range [0x00, 0x80).
|
|
*
|
|
* @param aString a 16-bit wide string to scan
|
|
*/
|
|
inline bool IsAscii(mozilla::Span<const char16_t> aString) {
|
|
#if MOZ_HAS_JSRUST()
|
|
size_t length = aString.Length();
|
|
const char16_t* ptr = aString.Elements();
|
|
// For short strings, calling into Rust is a pessimization, and the SIMD
|
|
// code won't have a chance to kick in anyway.
|
|
// 16 is a bit larger than logically necessary for this function alone,
|
|
// but it's important that the limit here matches the limit used in
|
|
// LossyConvertUtf16toLatin1!
|
|
if (length < mozilla::detail::kShortStringLimitForInlinePaths) {
|
|
char16_t accu = 0;
|
|
for (size_t i = 0; i < length; i++) {
|
|
accu |= ptr[i];
|
|
}
|
|
return accu < 0x80;
|
|
}
|
|
return encoding_mem_is_basic_latin(ptr, length);
|
|
#else
|
|
for (char16_t c : aString) {
|
|
if (!IsAscii(c)) {
|
|
return false;
|
|
}
|
|
}
|
|
return true;
|
|
#endif
|
|
}
|
|
|
|
/**
 * Returns true iff every character of the null-terminated string that
 * |aChar| points to is ASCII, i.e. in the range [0, 0x80).
 *
 * Scanning stops at the first non-ASCII character or at the terminator,
 * whichever comes first.
 */
template <typename Char>
constexpr bool IsAsciiNullTerminated(const Char* aChar) {
  for (Char unit = *aChar; unit; unit = *++aChar) {
    if (!IsAscii(unit)) {
      return false;
    }
  }
  return true;
}
|
|
|
|
#if MOZ_HAS_JSRUST()
|
|
/**
|
|
* Returns the index of the first non-ASCII byte or
|
|
* the length of the string if there are none.
|
|
*/
|
|
inline size_t AsciiValidUpTo(mozilla::Span<const char> aString) {
|
|
return encoding_ascii_valid_up_to(
|
|
reinterpret_cast<const uint8_t*>(aString.Elements()), aString.Length());
|
|
}
|
|
|
|
/**
|
|
* Returns the index of the first unpaired surrogate or
|
|
* the length of the string if there are none.
|
|
*/
|
|
inline size_t Utf16ValidUpTo(mozilla::Span<const char16_t> aString) {
|
|
return encoding_mem_utf16_valid_up_to(aString.Elements(), aString.Length());
|
|
}
|
|
|
|
/**
|
|
* Replaces unpaired surrogates with U+FFFD in the argument.
|
|
*
|
|
* Note: If you have an nsAString, use EnsureUTF16Validity() from
|
|
* nsReadableUtils.h instead to avoid unsharing a valid shared
|
|
* string.
|
|
*/
|
|
inline void EnsureUtf16ValiditySpan(mozilla::Span<char16_t> aString) {
|
|
encoding_mem_ensure_utf16_validity(aString.Elements(), aString.Length());
|
|
}
|
|
|
|
/**
|
|
* Convert ASCII to UTF-16. In debug builds, assert that the input is
|
|
* ASCII.
|
|
*
|
|
* The length of aDest must not be less than the length of aSource.
|
|
*/
|
|
inline void ConvertAsciitoUtf16(mozilla::Span<const char> aSource,
|
|
mozilla::Span<char16_t> aDest) {
|
|
MOZ_ASSERT(IsAscii(aSource));
|
|
ConvertLatin1toUtf16(aSource, aDest);
|
|
}
|
|
|
|
#endif // MOZ_HAS_JSRUST
|
|
|
|
/**
|
|
* Returns true iff |aChar| matches Ascii Whitespace.
|
|
*
|
|
* This function is intended to match the Infra standard
|
|
* (https://infra.spec.whatwg.org/#ascii-whitespace)
|
|
*/
|
|
template <typename Char>
|
|
constexpr bool IsAsciiWhitespace(Char aChar) {
|
|
using UnsignedChar = typename detail::MakeUnsignedChar<Char>::Type;
|
|
auto uc = static_cast<UnsignedChar>(aChar);
|
|
return uc == 0x9 || uc == 0xA || uc == 0xC || uc == 0xD || uc == 0x20;
|
|
}
|
|
|
|
/**
|
|
* Returns true iff |aChar| matches [a-z].
|
|
*
|
|
* This function is basically what you thought islower was, except its behavior
|
|
* doesn't depend on the user's current locale.
|
|
*/
|
|
template <typename Char>
|
|
constexpr bool IsAsciiLowercaseAlpha(Char aChar) {
|
|
using UnsignedChar = typename detail::MakeUnsignedChar<Char>::Type;
|
|
auto uc = static_cast<UnsignedChar>(aChar);
|
|
return 'a' <= uc && uc <= 'z';
|
|
}
|
|
|
|
/**
|
|
* Returns true iff |aChar| matches [A-Z].
|
|
*
|
|
* This function is basically what you thought isupper was, except its behavior
|
|
* doesn't depend on the user's current locale.
|
|
*/
|
|
template <typename Char>
|
|
constexpr bool IsAsciiUppercaseAlpha(Char aChar) {
|
|
using UnsignedChar = typename detail::MakeUnsignedChar<Char>::Type;
|
|
auto uc = static_cast<UnsignedChar>(aChar);
|
|
return 'A' <= uc && uc <= 'Z';
|
|
}
|
|
|
|
/**
 * Returns true iff |aChar| matches [a-zA-Z].
 *
 * Think of this as the isalpha you probably wanted: its result never
 * depends on the user's current locale.
 */
template <typename Char>
constexpr bool IsAsciiAlpha(Char aChar) {
  if (IsAsciiLowercaseAlpha(aChar)) {
    return true;
  }
  return IsAsciiUppercaseAlpha(aChar);
}
|
|
|
|
/**
|
|
* Returns true iff |aChar| matches [0-9].
|
|
*
|
|
* This function is basically what you thought isdigit was, except its behavior
|
|
* doesn't depend on the user's current locale.
|
|
*/
|
|
template <typename Char>
|
|
constexpr bool IsAsciiDigit(Char aChar) {
|
|
using UnsignedChar = typename detail::MakeUnsignedChar<Char>::Type;
|
|
auto uc = static_cast<UnsignedChar>(aChar);
|
|
return '0' <= uc && uc <= '9';
|
|
}
|
|
|
|
/**
|
|
* Returns true iff |aChar| matches [0-9a-fA-F].
|
|
*
|
|
* This function is basically isxdigit, but guaranteed to be only for ASCII.
|
|
*/
|
|
template <typename Char>
|
|
constexpr bool IsAsciiHexDigit(Char aChar) {
|
|
using UnsignedChar = typename detail::MakeUnsignedChar<Char>::Type;
|
|
auto uc = static_cast<UnsignedChar>(aChar);
|
|
return ('0' <= uc && uc <= '9') || ('a' <= uc && uc <= 'f') ||
|
|
('A' <= uc && uc <= 'F');
|
|
}
|
|
|
|
/**
 * Returns true iff |aChar| matches [a-zA-Z0-9].
 *
 * Think of this as the isalnum you probably wanted: its result never
 * depends on the user's current locale.
 */
template <typename Char>
constexpr bool IsAsciiAlphanumeric(Char aChar) {
  if (IsAsciiDigit(aChar)) {
    return true;
  }
  return IsAsciiAlpha(aChar);
}
|
|
|
|
/**
|
|
* Converts an ASCII alphanumeric digit [0-9a-zA-Z] to number as if in base-36.
|
|
* (This function therefore works for decimal, hexadecimal, etc.).
|
|
*/
|
|
template <typename Char>
|
|
uint8_t AsciiAlphanumericToNumber(Char aChar) {
|
|
using UnsignedChar = typename detail::MakeUnsignedChar<Char>::Type;
|
|
auto uc = static_cast<UnsignedChar>(aChar);
|
|
|
|
if ('0' <= uc && uc <= '9') {
|
|
return uc - '0';
|
|
}
|
|
|
|
if ('A' <= uc && uc <= 'Z') {
|
|
return uc - 'A' + 10;
|
|
}
|
|
|
|
// Ideally this function would be constexpr, but unfortunately gcc at least as
|
|
// of 6.4 forbids non-constexpr function calls in unevaluated constexpr
|
|
// function calls. See bug 1453456. So for now, just assert and leave the
|
|
// entire function non-constexpr.
|
|
MOZ_ASSERT('a' <= uc && uc <= 'z',
|
|
"non-ASCII alphanumeric character can't be converted to number");
|
|
return uc - 'a' + 10;
|
|
}
|
|
|
|
} // namespace mozilla
|
|
|
|
#endif /* mozilla_TextUtils_h */
|