gecko-dev/ef/Runtime/System/JavaString.cpp
1999-11-02 06:38:29 +00:00

213 lines
5.9 KiB
C++

/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*-
*
* The contents of this file are subject to the Netscape Public
* License Version 1.1 (the "License"); you may not use this file
* except in compliance with the License. You may obtain a copy of
* the License at http://www.mozilla.org/NPL/
*
* Software distributed under the License is distributed on an "AS
* IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
* implied. See the License for the specific language governing
* rights and limitations under the License.
*
* The Original Code is mozilla.org code.
*
* The Initial Developer of the Original Code is Netscape
* Communications Corporation. Portions created by Netscape are
* Copyright (C) 1998 Netscape Communications Corporation. All
* Rights Reserved.
*
* Contributor(s):
*/
#include "plstr.h"
#include "JavaString.h"
#include "ClassCentral.h"
#include "ClassFileSummary.h"
/* Allocate and construct a Java char array holding `length` elements.
 *
 * The storage is obtained from malloc() and is sized as the array header
 * (arrayEltsOffset) plus `length` elements of kind tkChar; the JavaArray
 * object is then constructed in place at the start of that storage.  The
 * element storage itself is not initialized here.
 *
 * NOTE(review): the malloc() result is used without a NULL check.
 */
static inline JavaArray *newCharArray(Uint32 length)
{
  const size_t totalBytes =
    arrayEltsOffset(tkChar) + getTypeKindSize(tkChar)*length;
  void *storage = malloc(totalBytes);

  JavaArray *array = new (storage) JavaArray(Array::obtain(tkChar), length);
  return array;
}
/* Count the number of bytes it would take to encode the given Unicode
* string using UTF-8. Add in the extra byte for the terminating NUL.
*/
static int
countUtf8Chars(const uint16 *ucs2, int ucs2len)
{
int utf8len = 1; // Need one character for terminating NUL
for (int i = ucs2len-1; i >= 0; i--) {
uint16 u = ucs2[i];
if (u < 0x80)
utf8len += 1;
else if (u < 0x800)
utf8len += 2;
else
utf8len += 3;
}
return utf8len;
}
/* Convert a Unicode (UCS-2) string to UTF-8 encoding. The length of
* the destination string, in bytes, is given by the utf8len argument.
* A NUL character is appended to the destination string, if possible.
* Returns: the actual length of the resulting string, in bytes.
*/
static int
convertUnicodeToUtf8(char *utf8, const uint16* ucs2, int utf8len)
{
char* start_utf8 = utf8;
char* lastchar = utf8 + utf8len - 1;
while (utf8 < lastchar) {
uint16 u = *ucs2++;
if (u < 0x80) {
*utf8++ = (char)u;
} else if (u < 0x800) {
if (utf8 >= (lastchar - 1))
break;
*utf8++ = 0xc0 | ((u >> 6) & 0x1f);
*utf8++ = 0x80 | (u & 0x3f);
} else {
if (utf8 >= (lastchar - 2))
break;
*utf8++ = 0xe0 | ((u >> 12) & 0x0f);
*utf8++ = 0x80 | ((u >> 6) & 0x3f);
*utf8++ = 0x80 | (u & 0x3f);
}
}
if (utf8 <= lastchar)
*utf8 = 0;
return utf8 - start_utf8;
}
/* Return the UTF8 representation of this string. This routine allocates
* enough memory for the conversion; this memory can be freed using
* JavaString::freeUtf()
*/
char *JavaString::convertUtf()
{
const uint16 *chars = getStr();
int utf8len = countUtf8Chars(chars, count);
char *utf8 = new char[utf8len];
convertUnicodeToUtf8(utf8, chars, utf8len);
return utf8;
}
/* Release a buffer returned by JavaString::convertUtf().  The buffer was
 * allocated with new[], so it is released with delete[].
 */
void JavaString::freeUtf(char *str)
{
  delete[] str;
}
/* Count the number of Unicode (UCS-2) characters encoded by a
 * NUL-terminated UTF-8 string.  The terminating NUL is not counted.
 *
 * utf8:    NUL-terminated UTF-8 text made of 1-, 2- or 3-byte sequences.
 * Returns: the number of complete sequences found before the terminator.
 *
 * A multi-byte sequence that is cut short by the terminating NUL is not
 * counted and ends the scan.  Continuation bytes have the form b10xxxxxx
 * and so are never zero; a zero byte inside a sequence can only be the end
 * of the string, and stepping over it would make the scan read past the
 * end of the buffer.
 */
static int
countUnicodeChars(const char *utf8)
{
  int length = 0;

  // Unicode characters are encoded as 1, 2, or 3 bytes in a UTF-8 string
  while (*utf8 != 0) {
    const unsigned char lead = static_cast<unsigned char>(*utf8);

    if (lead < 0x80) {
      // Characters in the range of 0..0x7f are encoded using one byte
      // b0xxxxxxx
      utf8++;
    } else if ((lead & 0xe0) == 0xc0) {
      // Characters in the range 0x80..0x7ff are encoded using two bytes
      // b110xxxxx b10yyyyyy
      if (utf8[1] == 0)
        break;                  // truncated sequence: stop at the terminator
      utf8 += 2;
    } else {
      // Characters in the range 0x800..0xffff are encoded using three bytes
      // b1110xxxx b10yyyyyy b10zzzzzz
      PR_ASSERT((lead & 0xf0) == 0xe0);
      // utf8[2] is only examined once utf8[1] is known not to be the NUL.
      if (utf8[1] == 0 || utf8[2] == 0)
        break;                  // truncated sequence: stop at the terminator
      utf8 += 3;
    }
    length++;
  }
  return length;
}
/* Convert a NUL-terminated UTF-8 string to Unicode (UCS-2).  The result is
 * *not* NUL-terminated.
 *
 * ucs2:    destination buffer.
 * utf8:    NUL-terminated UTF-8 text made of 1-, 2- or 3-byte sequences.
 * ucs2len: capacity of the destination, in 16-bit characters.
 * Returns: the number of characters stored, or ucs2len if the source holds
 *          more characters than the destination can take.
 *
 * A multi-byte sequence that is cut short by the terminating NUL is not
 * converted and ends the conversion.  Continuation bytes have the form
 * b10xxxxxx and so are never zero; a zero byte inside a sequence can only
 * be the end of the string, and decoding or stepping over it would read
 * past the end of the source buffer.
 */
static int
convertUTF8ToUnicode(uint16 *ucs2, const char *utf8, int ucs2len)
{
  int length = 0;

  // Unicode characters are encoded as 1, 2, or 3 bytes in a UTF-8 string
  while (*utf8 != 0) {
    // Destination full: report the capacity, as callers size it exactly.
    if (length >= ucs2len)
      return ucs2len;

    const unsigned char lead = static_cast<unsigned char>(*utf8);

    if (lead < 0x80) {
      // Characters in the range of 0..0x7f are encoded using one byte
      // b0xxxxxxx
      *ucs2 = lead;
      utf8++;
    } else if ((lead & 0xe0) == 0xc0) {
      // Characters in the range 0x80..0x7ff are encoded using two bytes
      // b110xxxxx b10yyyyyy
      if (utf8[1] == 0)
        break;                  // truncated sequence: stop at the terminator
      *ucs2 = static_cast<uint16>(((lead & 0x1f) << 6) | (utf8[1] & 0x3f));
      utf8 += 2;
    } else {
      // Characters in the range 0x800..0xffff are encoded using three bytes
      // b1110xxxx b10yyyyyy b10zzzzzz
      PR_ASSERT((lead & 0xf0) == 0xe0);
      // utf8[2] is only examined once utf8[1] is known not to be the NUL.
      if (utf8[1] == 0 || utf8[2] == 0)
        break;                  // truncated sequence: stop at the terminator
      *ucs2 = static_cast<uint16>(((lead & 0x0f) << 12) |
                                  ((utf8[1] & 0x3f) << 6) |
                                  (utf8[2] & 0x3f));
      utf8 += 3;
    }
    ucs2++;
    length++;
  }
  return length;
}
/* Construct a JavaString from NUL-terminated UTF-8 text.
 *
 * The character count is measured with countUnicodeChars(), a fresh char
 * array of exactly that length is allocated with newCharArray(), and the
 * text is decoded into it with convertUTF8ToUnicode().  The string starts
 * at offset 0 of the new array.
 */
JavaString::JavaString(const char *str) : JavaObject(*strType)
{
  offset = 0;
  count = countUnicodeChars(str);
  value = newCharArray(count);

  // getStr() hands back a const view of the backing array; it is written
  // here only because the array was just created for this string.
  uint16 *dest = const_cast<uint16 *>(getStr());
  convertUTF8ToUnicode(dest, str, count);
}
/* Print a textual representation of this string to stdout, followed by a
 * newline.
 *
 * Each 16-bit character is passed to putchar(), which writes its argument
 * converted to unsigned char, so only the low 8 bits of every character
 * are emitted.
 */
void JavaString::dump()
{
  const uint16 *chars = getStr();
  // Use a full-width index: a 16-bit counter wraps to a negative value on
  // strings longer than 32767 characters, indexing before the start of the
  // array and never reaching the end of the loop.
  for (int i = 0; i < count; i++)
    putchar(chars[i]);
  putchar('\n');
}
/* Type descriptor passed to the JavaObject base constructor by
 * JavaString(const char *).  Defined without an initializer, so it is
 * zero-initialized until staticInit() assigns it.
 */
Type *JavaString::strType;
/* Point strType at the type obtained from Standard::get(cString).
 * NOTE(review): presumably this has to run before any JavaString is
 * constructed, since the constructor dereferences strType -- confirm
 * against the runtime start-up sequence.
 */
void JavaString::staticInit()
{
strType = &asType(Standard::get(cString));
}