mirror of
https://github.com/libretro/scummvm.git
synced 2025-01-01 06:58:34 +00:00
104 lines
3.1 KiB
C++
104 lines
3.1 KiB
C++
/* ScummVM - Graphic Adventure Engine
 *
 * ScummVM is the legal property of its developers, whose names
 * are too numerous to list here. Please refer to the COPYRIGHT
 * file distributed with this source distribution.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 *
 */
|
|
/*
  Basic UTF-8 manipulation routines
  by Jeff Bezanson
  placed in the public domain Fall 2005

  This code is designed to provide the utilities you need to manipulate
  UTF-8 as an internal string encoding. These functions do not perform the
  error checking normally needed when handling UTF-8 data, so if you happen
  to be from the Unicode Consortium you will want to flay me alive.
  I do this because error checking can be performed at the boundaries (I/O),
  with these routines reserved for higher performance on data known to be
  valid.
*/
|
|
|
|
#include "common/debug.h"
|
|
|
|
#include "sludge/utf8.h"
|
|
|
|
namespace Sludge {
|
|
|
|
/**
 * Correction constants subtracted by nextchar(), indexed by
 * (sequence length in bytes - 1).
 *
 * nextchar() sums the raw bytes of a sequence, shifting the accumulator
 * left by 6 bits before each byte, so the UTF-8 marker bits of the lead
 * byte and of every continuation byte (0x80) end up mixed into the sum.
 * Each entry is exactly that marker contribution for one sequence length,
 * truncated to 32 bits.
 */
const uint32 UTF8Converter::offsetsFromUTF8[6] = {
	0x00000000UL, // 1 byte:  nothing to remove
	0x00003080UL, // 2 bytes: (0xC0 << 6) + 0x80
	0x000E2080UL, // 3 bytes: (0xE0 << 12) + (0x80 << 6) + 0x80
	0x03C82080UL, // 4 bytes: (0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80
	0xFA082080UL, // 5 bytes: (0xF8 << 24) + (0x80 << 18) + ... + 0x80
	0x82082080UL  // 6 bytes: (0xFC << 30) + (0x80 << 24) + ... + 0x80, modulo 2^32
};
|
|
|
|
/* reads the next utf-8 sequence out of a string, updating an index */
|
|
uint32 UTF8Converter::nextchar(const char *s, int *i) {
|
|
uint32 ch = 0;
|
|
int sz = 0;
|
|
|
|
do {
|
|
ch <<= 6;
|
|
ch += (byte)s[(*i)++];
|
|
sz++;
|
|
} while (s[*i] && !isutf(s[*i]));
|
|
ch -= offsetsFromUTF8[sz - 1];
|
|
|
|
return ch;
|
|
}
|
|
|
|
/**
 * Decodes a UTF-8 byte string into a U32String.
 *
 * A Common::String is indexed byte by byte, but the data held here is
 * UTF-8, where one character may span several bytes. Decoding it up front
 * gives a string that can be indexed per character.
 *
 * @param str  UTF-8 encoded data.
 * @return     The values produced by calling nextchar() repeatedly until
 *             all bytes of str have been consumed.
 */
Common::U32String UTF8Converter::convertUtf8ToUtf32(const Common::String &str) {
	const char *bytes = str.c_str();
	const int byteCount = (int)str.size();

	Common::U32String decoded;
	// nextchar() advances pos past every sequence it reads
	for (int pos = 0; pos < byteCount;) {
		const uint32 codePoint = nextchar(bytes, &pos);
		decoded += codePoint;
	}
	return decoded;
}
|
|
|
|
/* utf32 index => original byte offset */
|
|
int UTF8Converter::getOriginOffset(int origIdx) {
|
|
uint offs = 0;
|
|
while (origIdx > 0 && offs < _str.size()) {
|
|
// increment if it's not the start of a utf8 sequence
|
|
(void)( (++offs < _str.size() && isutf(_str[offs])) ||
|
|
(++offs < _str.size() && isutf(_str[offs])) ||
|
|
(++offs < _str.size() && isutf(_str[offs])) ||
|
|
++offs);
|
|
origIdx--;
|
|
}
|
|
return offs;
|
|
}
|
|
|
|
/** Construct a UTF8String with original char array to convert */
|
|
UTF8Converter::UTF8Converter(const char *str) {
|
|
setUTF8String(str);
|
|
}
|
|
|
|
/** set a utf8 string to convert */
|
|
void UTF8Converter::setUTF8String(Common::String str) {
|
|
_str32.clear();
|
|
_str32 = convertUtf8ToUtf32(str);
|
|
_str.clear();
|
|
_str = str;
|
|
}
|
|
|
|
} // End of namespace Sludge
|