mirror of
https://github.com/mozilla/gecko-dev.git
synced 2024-11-08 04:27:37 +00:00
453 lines
14 KiB
C++
453 lines
14 KiB
C++
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
|
|
/* ***** BEGIN LICENSE BLOCK *****
|
|
* Version: NPL 1.1/GPL 2.0/LGPL 2.1
|
|
*
|
|
* The contents of this file are subject to the Netscape Public License
|
|
* Version 1.1 (the "License"); you may not use this file except in
|
|
* compliance with the License. You may obtain a copy of the License at
|
|
* http://www.mozilla.org/NPL/
|
|
*
|
|
* Software distributed under the License is distributed on an "AS IS" basis,
|
|
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
|
* for the specific language governing rights and limitations under the
|
|
* License.
|
|
*
|
|
* The Original Code is mozilla.org code.
|
|
*
|
|
* The Initial Developer of the Original Code is
|
|
* Netscape Communications Corporation.
|
|
* Portions created by the Initial Developer are Copyright (C) 2001
|
|
* the Initial Developer. All Rights Reserved.
|
|
*
|
|
* Contributor(s):
|
|
* Peter Annema <jaggernaut@netscape.com> (original author)
|
|
*
|
|
* Alternatively, the contents of this file may be used under the terms of
|
|
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
|
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
|
* in which case the provisions of the GPL or the LGPL are applicable instead
|
|
* of those above. If you wish to allow use of your version of this file only
|
|
* under the terms of either the GPL or the LGPL, and not to allow others to
|
|
* use your version of this file under the terms of the NPL, indicate your
|
|
* decision by deleting the provisions above and replace them with the notice
|
|
* and other provisions required by the GPL or the LGPL. If you do not delete
|
|
* the provisions above, a recipient may use your version of this file under
|
|
* the terms of any one of the NPL, the GPL or the LGPL.
|
|
*
|
|
* ***** END LICENSE BLOCK ***** */
|
|
|
|
#ifndef nsUTF8Utils_h_
|
|
#define nsUTF8Utils_h_
|
|
|
|
class UTF8traits
|
|
{
|
|
public:
|
|
static PRBool isASCII(char c) { return (c & 0x80) == 0x00; }
|
|
static PRBool isInSeq(char c) { return (c & 0xC0) == 0x80; }
|
|
static PRBool is2byte(char c) { return (c & 0xE0) == 0xC0; }
|
|
static PRBool is3byte(char c) { return (c & 0xF0) == 0xE0; }
|
|
static PRBool is4byte(char c) { return (c & 0xF8) == 0xF0; }
|
|
static PRBool is5byte(char c) { return (c & 0xFC) == 0xF8; }
|
|
static PRBool is6byte(char c) { return (c & 0xFE) == 0xFC; }
|
|
};
|
|
|
|
// Lowest code point that needs a UTF-16 surrogate pair; it is subtracted
// from such code points before they are split into two surrogates.
#define PLANE1_BASE 0x00010000

// U+FFFD, emitted in place of input that cannot be represented.
#define UCS2_REPLACEMENT_CHAR 0xfffd

// Function attribute asking GCC-compatible compilers to always inline;
// expands to nothing on every other compiler.
#if defined(__GNUC__)
#  define NS_ALWAYS_INLINE __attribute__((always_inline))
#else
#  define NS_ALWAYS_INLINE
#endif
|
|
|
|
/**
|
|
* A character sink (see |copy_string| in nsAlgorithm.h) for converting
|
|
* UTF-8 to UTF-16
|
|
*/
|
|
class ConvertUTF8toUTF16
|
|
{
|
|
public:
|
|
typedef nsACString::char_type value_type;
|
|
typedef nsAString::char_type buffer_type;
|
|
|
|
ConvertUTF8toUTF16( buffer_type* aBuffer )
|
|
: mStart(aBuffer), mBuffer(aBuffer), mErrorEncountered(PR_FALSE) {}
|
|
|
|
size_t Length() const { return mBuffer - mStart; }
|
|
|
|
PRUint32 NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N )
|
|
{
|
|
if ( mErrorEncountered )
|
|
return N;
|
|
|
|
// algorithm assumes utf8 units won't
|
|
// be spread across fragments
|
|
const value_type* p = start;
|
|
const value_type* end = start + N;
|
|
buffer_type* out = mBuffer;
|
|
for ( ; p != end /* && *p */; )
|
|
{
|
|
char c = *p++;
|
|
|
|
if ( UTF8traits::isASCII(c) )
|
|
{
|
|
*out++ = buffer_type(c);
|
|
continue;
|
|
}
|
|
|
|
PRUint32 ucs4;
|
|
PRUint32 minUcs4;
|
|
PRInt32 state = 0;
|
|
|
|
if ( UTF8traits::is2byte(c) )
|
|
{
|
|
ucs4 = (PRUint32(c) << 6) & 0x000007C0L;
|
|
state = 1;
|
|
minUcs4 = 0x00000080;
|
|
}
|
|
else if ( UTF8traits::is3byte(c) )
|
|
{
|
|
ucs4 = (PRUint32(c) << 12) & 0x0000F000L;
|
|
state = 2;
|
|
minUcs4 = 0x00000800;
|
|
}
|
|
else if ( UTF8traits::is4byte(c) )
|
|
{
|
|
ucs4 = (PRUint32(c) << 18) & 0x001F0000L;
|
|
state = 3;
|
|
minUcs4 = 0x00010000;
|
|
}
|
|
else if ( UTF8traits::is5byte(c) )
|
|
{
|
|
ucs4 = (PRUint32(c) << 24) & 0x03000000L;
|
|
state = 4;
|
|
minUcs4 = 0x00200000;
|
|
}
|
|
else if ( UTF8traits::is6byte(c) )
|
|
{
|
|
ucs4 = (PRUint32(c) << 30) & 0x40000000L;
|
|
state = 5;
|
|
minUcs4 = 0x04000000;
|
|
}
|
|
else
|
|
{
|
|
NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
|
|
mErrorEncountered = PR_TRUE;
|
|
mBuffer = out;
|
|
return N;
|
|
}
|
|
|
|
while ( state-- )
|
|
{
|
|
c = *p++;
|
|
|
|
if ( UTF8traits::isInSeq(c) )
|
|
{
|
|
PRInt32 shift = state * 6;
|
|
ucs4 |= (PRUint32(c) & 0x3F) << shift;
|
|
}
|
|
else
|
|
{
|
|
NS_ERROR("not a UTF8 string");
|
|
mErrorEncountered = PR_TRUE;
|
|
mBuffer = out;
|
|
return N;
|
|
}
|
|
}
|
|
|
|
if ( ucs4 < minUcs4 )
|
|
{
|
|
// Overlong sequence
|
|
*out++ = UCS2_REPLACEMENT_CHAR;
|
|
}
|
|
else if ( ucs4 <= 0xD7FF )
|
|
{
|
|
*out++ = ucs4;
|
|
}
|
|
else if ( /* ucs4 >= 0xD800 && */ ucs4 <= 0xDFFF )
|
|
{
|
|
// Surrogates
|
|
*out++ = UCS2_REPLACEMENT_CHAR;
|
|
}
|
|
else if ( ucs4 == 0xFFFE || ucs4 == 0xFFFF )
|
|
{
|
|
// Prohibited characters
|
|
*out++ = UCS2_REPLACEMENT_CHAR;
|
|
}
|
|
else if ( ucs4 >= PLANE1_BASE )
|
|
{
|
|
if ( ucs4 >= 0x00110000 )
|
|
*out++ = UCS2_REPLACEMENT_CHAR;
|
|
else {
|
|
// surrogate, see unicode specification 3.7 for following math.
|
|
ucs4 -= PLANE1_BASE;
|
|
*out++ = (PRUnichar)(ucs4 >> 10) | 0xd800u;
|
|
*out++ = (PRUnichar)(ucs4 & 0x3ff) | 0xdc00u;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
*out++ = ucs4;
|
|
}
|
|
}
|
|
mBuffer = out;
|
|
return p - start;
|
|
}
|
|
|
|
void write_terminator()
|
|
{
|
|
*mBuffer = buffer_type(0);
|
|
}
|
|
|
|
private:
|
|
buffer_type* const mStart;
|
|
buffer_type* mBuffer;
|
|
PRBool mErrorEncountered;
|
|
};
|
|
|
|
/**
|
|
* A character sink (see |copy_string| in nsAlgorithm.h) for computing
|
|
* the length of a UTF-8 string.
|
|
*/
|
|
class CalculateUTF8Length
|
|
{
|
|
public:
|
|
typedef nsACString::char_type value_type;
|
|
|
|
CalculateUTF8Length() : mLength(0), mErrorEncountered(PR_FALSE) { }
|
|
|
|
size_t Length() const { return mLength; }
|
|
|
|
PRUint32 NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N )
|
|
{
|
|
// ignore any further requests
|
|
if ( mErrorEncountered )
|
|
return N;
|
|
|
|
// algorithm assumes utf8 units won't
|
|
// be spread across fragments
|
|
const value_type* p = start;
|
|
const value_type* end = start + N;
|
|
for ( ; p < end /* && *p */; ++mLength )
|
|
{
|
|
if ( UTF8traits::isASCII(*p) )
|
|
p += 1;
|
|
else if ( UTF8traits::is2byte(*p) )
|
|
p += 2;
|
|
else if ( UTF8traits::is3byte(*p) )
|
|
p += 3;
|
|
else if ( UTF8traits::is4byte(*p) ) {
|
|
p += 4;
|
|
++mLength;
|
|
}
|
|
else if ( UTF8traits::is5byte(*p) )
|
|
p += 5;
|
|
else if ( UTF8traits::is6byte(*p) )
|
|
p += 6;
|
|
else
|
|
{
|
|
break;
|
|
}
|
|
}
|
|
if ( p != end )
|
|
{
|
|
NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
|
|
mErrorEncountered = PR_TRUE;
|
|
mLength = 0;
|
|
return N;
|
|
}
|
|
return p - start;
|
|
}
|
|
|
|
private:
|
|
size_t mLength;
|
|
PRBool mErrorEncountered;
|
|
};
|
|
|
|
/**
|
|
* A character sink (see |copy_string| in nsAlgorithm.h) for converting
|
|
* UTF-16 to UTF-8.
|
|
*/
|
|
class ConvertUTF16toUTF8
|
|
{
|
|
public:
|
|
typedef nsAString::char_type value_type;
|
|
typedef nsACString::char_type buffer_type;
|
|
|
|
// The error handling here is more lenient than that in
|
|
// |ConvertUTF8toUTF16|, but it's that way for backwards
|
|
// compatibility.
|
|
|
|
ConvertUTF16toUTF8( buffer_type* aBuffer )
|
|
: mStart(aBuffer), mBuffer(aBuffer) {}
|
|
|
|
size_t Size() const { return mBuffer - mStart; }
|
|
|
|
PRUint32 NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N )
|
|
{
|
|
buffer_type *out = mBuffer; // gcc isn't smart enough to do this!
|
|
|
|
for (const value_type *p = start, *end = start + N; p < end; ++p )
|
|
{
|
|
value_type c = *p;
|
|
if (! (c & 0xFF80)) // U+0000 - U+007F
|
|
{
|
|
*out++ = (char)c;
|
|
}
|
|
else if (! (c & 0xF800)) // U+0100 - U+07FF
|
|
{
|
|
*out++ = 0xC0 | (char)(c >> 6);
|
|
*out++ = 0x80 | (char)(0x003F & c);
|
|
}
|
|
else if (0xD800 != (0xF800 & c)) // U+0800 - U+D7FF,U+E000 - U+FFFF
|
|
{
|
|
*out++ = 0xE0 | (char)(c >> 12);
|
|
*out++ = 0x80 | (char)(0x003F & (c >> 6));
|
|
*out++ = 0x80 | (char)(0x003F & c );
|
|
}
|
|
else if (0xD800 == (0xFC00 & c)) // U+D800 - U+DBFF
|
|
{
|
|
// D800- DBFF - High Surrogate
|
|
// N = (H- D800) *400 + 10000 + ...
|
|
PRUint32 ucs4 = 0x10000 + ((0x03FF & c) << 10);
|
|
|
|
++p;
|
|
if (p == end)
|
|
{
|
|
NS_ERROR("Surrogate pair split between fragments");
|
|
mBuffer = out;
|
|
return N;
|
|
}
|
|
c = *p;
|
|
|
|
if (0xDC00 == (0xFC00 & c))
|
|
{
|
|
// DC00- DFFF - Low Surrogate
|
|
// N += ( L - DC00 )
|
|
ucs4 |= (0x03FF & c);
|
|
|
|
// 0001 0000-001F FFFF
|
|
*out++ = 0xF0 | (char)(ucs4 >> 18);
|
|
*out++ = 0x80 | (char)(0x003F & (ucs4 >> 12));
|
|
*out++ = 0x80 | (char)(0x003F & (ucs4 >> 6));
|
|
*out++ = 0x80 | (char)(0x003F & ucs4);
|
|
}
|
|
else
|
|
{
|
|
NS_ERROR("got a High Surrogate but no low surrogate");
|
|
// output nothing.
|
|
}
|
|
}
|
|
else // U+DC00 - U+DFFF
|
|
{
|
|
// DC00- DFFF - Low Surrogate
|
|
NS_ERROR("got a low Surrogate but no high surrogate");
|
|
// output nothing.
|
|
}
|
|
}
|
|
|
|
mBuffer = out;
|
|
return N;
|
|
}
|
|
|
|
void write_terminator()
|
|
{
|
|
*mBuffer = buffer_type(0);
|
|
}
|
|
|
|
private:
|
|
buffer_type* const mStart;
|
|
buffer_type* mBuffer;
|
|
};
|
|
|
|
/**
|
|
* A character sink (see |copy_string| in nsAlgorithm.h) for computing
|
|
* the number of bytes a UTF-16 would occupy in UTF-8.
|
|
*/
|
|
class CalculateUTF8Size
|
|
{
|
|
public:
|
|
typedef nsAString::char_type value_type;
|
|
|
|
CalculateUTF8Size()
|
|
: mSize(0) { }
|
|
|
|
size_t Size() const { return mSize; }
|
|
|
|
PRUint32 NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N )
|
|
{
|
|
// Assume UCS2 surrogate pairs won't be spread across fragments.
|
|
for (const value_type *p = start, *end = start + N; p < end; ++p )
|
|
{
|
|
value_type c = *p;
|
|
if (! (c & 0xFF80)) // U+0000 - U+007F
|
|
mSize += 1;
|
|
else if (! (c & 0xF800)) // U+0100 - U+07FF
|
|
mSize += 2;
|
|
else if (0xD800 != (0xF800 & c)) // U+0800 - U+D7FF,U+E000 - U+FFFF
|
|
mSize += 3;
|
|
else if (0xD800 == (0xFC00 & c)) // U+D800 - U+DBFF
|
|
{
|
|
++p;
|
|
if (p == end)
|
|
{
|
|
NS_ERROR("Surrogate pair split between fragments");
|
|
return N;
|
|
}
|
|
c = *p;
|
|
|
|
if (0xDC00 == (0xFC00 & c))
|
|
mSize += 4;
|
|
else
|
|
NS_ERROR("got a high Surrogate but no low surrogate");
|
|
}
|
|
else // U+DC00 - U+DFFF
|
|
NS_ERROR("got a low Surrogate but no high surrogate");
|
|
}
|
|
|
|
return N;
|
|
}
|
|
|
|
private:
|
|
size_t mSize;
|
|
};
|
|
|
|
/**
|
|
* A character sink that performs a |reinterpret_cast| style conversion
|
|
* between character types.
|
|
*/
|
|
template <class FromCharT, class ToCharT>
|
|
class LossyConvertEncoding
|
|
{
|
|
public:
|
|
typedef FromCharT value_type;
|
|
|
|
typedef FromCharT input_type;
|
|
typedef ToCharT output_type;
|
|
|
|
typedef typename nsCharTraits<FromCharT>::unsigned_char_type unsigned_input_type;
|
|
|
|
public:
|
|
LossyConvertEncoding( output_type* aDestination ) : mDestination(aDestination) { }
|
|
|
|
PRUint32
|
|
write( const input_type* aSource, PRUint32 aSourceLength )
|
|
{
|
|
const input_type* done_writing = aSource + aSourceLength;
|
|
while ( aSource < done_writing )
|
|
*mDestination++ = (output_type)(unsigned_input_type)(*aSource++); // use old-style cast to mimic old |ns[C]String| behavior
|
|
return aSourceLength;
|
|
}
|
|
|
|
void
|
|
write_terminator()
|
|
{
|
|
*mDestination = output_type(0);
|
|
}
|
|
|
|
private:
|
|
output_type* mDestination;
|
|
};
|
|
|
|
#endif /* !defined(nsUTF8Utils_h_) */
|