COMMON: Add cyrilic transliteration to Encoding.

This commit is contained in:
Jaromir Wysoglad 2019-07-31 00:43:57 +02:00 committed by Filippos Karapetis
parent 888f3e4d30
commit 61cf628bfb
2 changed files with 116 additions and 23 deletions

View File

@ -32,44 +32,90 @@ namespace Common {
Encoding::Encoding(const String &to, const String &from)
: _to(to)
, _from(from) {
#ifdef USE_ICONV
String toTranslit = to + "//TRANSLIT";
_iconvHandle = iconv_open(toTranslit.c_str(), from.c_str());
#endif // USE_ICONV
_iconvHandle = initIconv(to, from);
}
Encoding::~Encoding() {
#ifdef USE_ICONV
if (_iconvHandle != (iconv_t) -1)
iconv_close(_iconvHandle);
#endif // USE_ICONV
deinitIconv(_iconvHandle);
}
char *Encoding::convert(const char *string, size_t size) {
#ifndef USE_ICONV
_iconvHandle = 0;
#endif
return doConversion(_iconvHandle, _to, _from, string, size);
}
char *Encoding::convert(const String &to, const String &from, const char *string, size_t size) {
iconv_t Encoding::initIconv(const String &to, const String &from) {
#ifdef USE_ICONV
String toTranslit = to + "//TRANSLIT";
iconv_t iconvHandle = iconv_open(toTranslit.c_str(), from.c_str());
String toTranslit = to + "//TRANSLIT";
return iconv_open(toTranslit.c_str(), from.c_str());
#else
iconv_t iconvHandle = 0;
return 0;
#endif // USE_ICONV
}
char *result = doConversion(iconvHandle, to, from, string, size);
void Encoding::deinitIconv(iconv_t iconvHandle) {
#ifdef USE_ICONV
if (iconvHandle != (iconv_t) -1)
iconv_close(iconvHandle);
#endif // USE_ICONV
}
char *Encoding::convert(const char *string, size_t size) {
return conversion(_iconvHandle, _to, _from, string, size);
}
char *Encoding::convert(const String &to, const String &from, const char *string, size_t size) {
iconv_t iconvHandle = initIconv(to, from);
char *result = conversion(iconvHandle, to, from, string, size);
deinitIconv(iconvHandle);
return result;
}
char *Encoding::doConversion(iconv_t iconvHandle, const String &to, const String &from, const char *string, size_t length) {
char *Encoding::conversion(iconv_t iconvHandle, const String &to, const String &from, const char *string, size_t length) {
char *newString = nullptr;
String newFrom = from;
size_t newLength = length;
if (String(from).equalsIgnoreCase("iso-8859-5") &&
!String(to).hasPrefixIgnoreCase("utf")) {
// There might be some cyrilic characters, which need to be transliterated.
newString = transliterateCyrilic(string);
newFrom = "ASCII";
}
if (String(from).hasPrefixIgnoreCase("utf") &&
!String(to).hasPrefixIgnoreCase("utf")) {
// There might be some cyrilic characters, which need to be transliterated.
char *tmpString;
if (String(from).hasPrefixIgnoreCase("utf-32"))
tmpString = nullptr;
else {
iconv_t tmpHandle = initIconv("UTF-32", from);
tmpString = conversion2(tmpHandle, "UTF-32", from, string, length);
deinitIconv(tmpHandle);
// find out the length in bytes of the tmpString
int i;
for (i = 0; ((const uint32 *)tmpString)[i]; i++) {}
newLength = i * 4;
newFrom = "UTF-32";
}
if (tmpString != nullptr) {
newString = (char *) transliterateUTF32((const uint32 *) tmpString, newLength);
free(tmpString);
} else
newString = (char *) transliterateUTF32((const uint32 *) string, newLength);
}
iconv_t newHandle = iconvHandle;
if (newFrom != from)
newHandle = initIconv(to, newFrom);
char *result;
if (newString != nullptr) {
result = conversion2(newHandle, to, newFrom, newString, newLength);
free(newString);
} else
result = conversion2(newHandle, to, newFrom, string, newLength);
if (newFrom != from)
deinitIconv(newHandle);
return result;
}
char *Encoding::conversion2(iconv_t iconvHandle, const String &to, const String &from, const char *string, size_t length) {
char *result = nullptr;
#ifdef USE_ICONV
if (iconvHandle != (iconv_t) -1)
@ -217,4 +263,43 @@ char *Encoding::convertTransManMapping(const char *to, const char *from, const c
#endif // USE_TRANSLATION
}
static char g_cyrilicTransliterationTable[] = {
' ', 'E', 'D', 'G', 'E', 'Z', 'I', 'I', 'J', 'L', 'N', 'C', 'K', '-', 'U', 'D',
'A', 'B', 'V', 'G', 'D', 'E', 'Z', 'Z', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
'R', 'S', 'T', 'U', 'F', 'H', 'C', 'C', 'S', 'S', '\"', 'Y', '\'', 'E', 'U', 'A',
'a', 'b', 'v', 'g', 'd', 'e', 'z', 'z', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p',
'r', 's', 't', 'u', 'f', 'h', 'c', 'c', 's', 's', '\"', 'y', '\'', 'e', 'u', 'a',
'N', 'e', 'd', 'g', 'e', 'z', 'i', 'i', 'j', 'l', 'n', 'c', 'k', '?', 'u', 'd',
};
char *Encoding::transliterateCyrilic(const char *string) {
char *result = (char *) malloc(strlen(string) + 1);
if (!result) {
warning("Could not allocate memory for encoding conversion");
return nullptr;
}
for(unsigned i = 0; i <= strlen(string); i++) {
if ((unsigned char) string[i] >= 160)
result[i] = g_cyrilicTransliterationTable[(unsigned char) string[i] - 160];
else
result[i] = string[i];
}
return result;
}
uint32 *Encoding::transliterateUTF32(const uint32 *string, size_t length) {
uint32 *result = (uint32 *) malloc(length + 4);
if (!result) {
warning("Could not allocate memory for encoding conversion");
return nullptr;
}
for(unsigned i = 0; i <= length / 4; i++) {
if (string[i] >= 0x410 && string[i] <= 0x450)
result[i] = g_cyrilicTransliterationTable[string[i] - 160 - 864];
else
result[i] = string[i];
}
return result;
}
}

View File

@ -55,12 +55,20 @@ class Encoding {
String _to;
String _from;
static char *doConversion(iconv_t iconvHandle, const String &to, const String &from, const char *string, size_t length);
static char *conversion(iconv_t iconvHandle, const String &to, const String &from, const char *string, size_t length);
static char *conversion2(iconv_t iconvHandle, const String &to, const String &from, const char *string, size_t length);
iconv_t _iconvHandle;
static char *convertIconv(iconv_t iconvHandle, const char *string, size_t length);
static char *convertTransManMapping(const char *to, const char *from, const char *string, size_t length);
static char *transliterateCyrilic(const char *string);
static uint32 *transliterateUTF32(const uint32 *string, size_t length);
static iconv_t initIconv(const String &to, const String &from);
static void deinitIconv(iconv_t iconvHandle);
};
}