/* * Copyright (c) 2015 Apple Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in * compliance with the License. Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this * file. * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. * * @APPLE_LICENSE_HEADER_END@ */ /* CFICUConverters.c Copyright (c) 2004-2014, Apple Inc. All rights reserved. Responsibility: Aki Inoue */ #include "CFStringEncodingDatabase.h" #include "CFStringEncodingConverterPriv.h" #include "CFICUConverters.h" #include #include #include #include #include "CFInternal.h" #include // Thread data support typedef struct { uint8_t _numSlots; uint8_t _nextSlot; UConverter **_converters; } __CFICUThreadData; static void __CFICUThreadDataDestructor(void *context) { __CFICUThreadData * data = (__CFICUThreadData *)context; if (NULL != data->_converters) { // scan to make sure deallocation UConverter **converter = data->_converters; UConverter **limit = converter + data->_numSlots; while (converter < limit) { if (NULL != converter) ucnv_close(*converter); ++converter; } CFAllocatorDeallocate(NULL, data->_converters); } CFAllocatorDeallocate(NULL, data); } CF_INLINE __CFICUThreadData *__CFStringEncodingICUGetThreadData() { __CFICUThreadData * data; data = (__CFICUThreadData *)_CFGetTSD(__CFTSDKeyICUConverter); if (NULL == data) { data = (__CFICUThreadData *)CFAllocatorAllocate(NULL, sizeof(__CFICUThreadData), 0); memset(data, 0, sizeof(__CFICUThreadData)); _CFSetTSD(__CFTSDKeyICUConverter, (void *)data, __CFICUThreadDataDestructor); } return data; } CF_PRIVATE const char *__CFStringEncodingGetICUName(CFStringEncoding encoding) { #define STACK_BUFFER_SIZE (60) char buffer[STACK_BUFFER_SIZE]; const char *result = NULL; UErrorCode errorCode = U_ZERO_ERROR; uint32_t codepage = 0; if (kCFStringEncodingUTF7_IMAP == encoding) return "IMAP-mailbox-name"; if (kCFStringEncodingUnicode != (encoding & 0x0F00)) codepage = __CFStringEncodingGetWindowsCodePage(encoding); // we don't use codepage for UTF to avoid little endian weirdness of Windows if ((0 != codepage) && (snprintf(buffer, STACK_BUFFER_SIZE, "windows-%d", codepage) < STACK_BUFFER_SIZE) && (NULL != (result = ucnv_getAlias(buffer, 0, &errorCode)))) return result; if (__CFStringEncodingGetCanonicalName(encoding, buffer, STACK_BUFFER_SIZE)) result = ucnv_getAlias(buffer, 0, &errorCode); return result; #undef STACK_BUFFER_SIZE } CF_PRIVATE CFStringEncoding __CFStringEncodingGetFromICUName(const char *icuName) { uint32_t codepage; char *endPtr; UErrorCode errorCode = U_ZERO_ERROR; if ((0 == strncasecmp_l(icuName, "windows-", strlen("windows-"), NULL)) && (0 != (codepage = strtol(icuName + strlen("windows-"), &endPtr, 10))) && (*endPtr == '\0')) return __CFStringEncodingGetFromWindowsCodePage(codepage); if (0 != ucnv_countAliases(icuName, &errorCode)) { CFStringEncoding encoding; const char *name; // Try WINDOWS platform name = ucnv_getStandardName(icuName, "WINDOWS", &errorCode); if (NULL != name) { if ((0 == strncasecmp_l(name, "windows-", strlen("windows-"), NULL)) && (0 != (codepage = strtol(name + strlen("windows-"), &endPtr, 10))) && (*endPtr == '\0')) return __CFStringEncodingGetFromWindowsCodePage(codepage); if (strncasecmp_l(icuName, name, strlen(name), NULL) && (kCFStringEncodingInvalidId != (encoding = __CFStringEncodingGetFromCanonicalName(name)))) return encoding; } // Try JAVA platform name = ucnv_getStandardName(icuName, "JAVA", &errorCode); if ((NULL != name) && strncasecmp_l(icuName, name, strlen(name), NULL) && (kCFStringEncodingInvalidId != (encoding = __CFStringEncodingGetFromCanonicalName(name)))) return encoding; // Try MIME platform name = ucnv_getStandardName(icuName, "MIME", &errorCode); if ((NULL != name) && strncasecmp_l(icuName, name, strlen(name), NULL) && (kCFStringEncodingInvalidId != (encoding = __CFStringEncodingGetFromCanonicalName(name)))) return encoding; } return kCFStringEncodingInvalidId; } CF_INLINE UConverter *__CFStringEncodingConverterCreateICUConverter(const char *icuName, uint32_t flags, bool toUnicode) { UConverter *converter; UErrorCode errorCode = U_ZERO_ERROR; uint8_t streamID = CFStringEncodingStreamIDFromMask(flags); if (0 != streamID) { // this is a part of streaming previously created __CFICUThreadData *data = __CFStringEncodingICUGetThreadData(); --streamID; // map to array index if ((streamID < data->_numSlots) && (NULL != data->_converters[streamID])) return data->_converters[streamID]; } converter = ucnv_open(icuName, &errorCode); if (NULL != converter) { char lossyByte = CFStringEncodingMaskToLossyByte(flags); if ((0 == lossyByte) && (0 != (flags & kCFStringEncodingAllowLossyConversion))) lossyByte = '?'; if (0 ==lossyByte) { if (toUnicode) { ucnv_setToUCallBack(converter, &UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL, &errorCode); } else { ucnv_setFromUCallBack(converter, &UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &errorCode); } } else { ucnv_setSubstChars(converter, &lossyByte, 1, &errorCode); } } return converter; } #define ICU_CONVERTER_SLOT_INCREMENT (10) #define ICU_CONVERTER_MAX_SLOT (255) static CFIndex __CFStringEncodingConverterReleaseICUConverter(UConverter *converter, uint32_t flags, CFIndex status) { uint8_t streamID = CFStringEncodingStreamIDFromMask(flags); if ((kCFStringEncodingInvalidInputStream != status) && ((0 != (flags & kCFStringEncodingPartialInput)) || ((kCFStringEncodingInsufficientOutputBufferLength == status) && (0 != (flags & kCFStringEncodingPartialOutput))))) { if (0 == streamID) { __CFICUThreadData *data = __CFStringEncodingICUGetThreadData(); if (NULL == data->_converters) { data->_converters = (UConverter **)CFAllocatorAllocate(NULL, sizeof(UConverter *) * ICU_CONVERTER_SLOT_INCREMENT, 0); memset(data->_converters, 0, sizeof(UConverter *) * ICU_CONVERTER_SLOT_INCREMENT); data->_numSlots = ICU_CONVERTER_SLOT_INCREMENT; data->_nextSlot = 0; } else if ((data->_nextSlot >= data->_numSlots) || (NULL != data->_converters[data->_nextSlot])) { // Need to find one CFIndex index; for (index = 0;index < data->_numSlots;index++) { if (NULL == data->_converters[index]) { data->_nextSlot = index; break; } } if (index >= data->_numSlots) { // we're full UConverter **newConverters; CFIndex newSize = data->_numSlots + ICU_CONVERTER_SLOT_INCREMENT; if (newSize > ICU_CONVERTER_MAX_SLOT) { // something is terribly wrong CFLog(kCFLogLevelError, CFSTR("Per-thread streaming ID for ICU converters exhausted. Ignoring...")); ucnv_close(converter); return 0; } newConverters = (UConverter **)CFAllocatorAllocate(NULL, sizeof(UConverter *) * newSize, 0); memset(newConverters, 0, sizeof(UConverter *) * newSize); memcpy(newConverters, data->_converters, sizeof(UConverter *) * data->_numSlots); CFAllocatorDeallocate(NULL, data->_converters); data->_converters = newConverters; data->_nextSlot = data->_numSlots; data->_numSlots = newSize; } } data->_converters[data->_nextSlot] = converter; streamID = data->_nextSlot + 1; // now find next slot ++data->_nextSlot; if ((data->_nextSlot >= data->_numSlots) || (NULL != data->_converters[data->_nextSlot])) { data->_nextSlot = 0; while ((data->_nextSlot < data->_numSlots) && (NULL != data->_converters[data->_nextSlot])) ++data->_nextSlot; } } return CFStringEncodingStreamIDToMask(streamID); } if (0 != streamID) { __CFICUThreadData *data = __CFStringEncodingICUGetThreadData(); --streamID; // map to array index if ((streamID < data->_numSlots) && (converter == data->_converters[streamID])) { data->_converters[streamID] = NULL; if (data->_nextSlot > streamID) data->_nextSlot = streamID; } } ucnv_close(converter); return 0; } #define MAX_BUFFER_SIZE (1000) #if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED #if 0 // we're no longer doing this check. Revive when the status in the bug changed. #if (U_ICU_VERSION_MAJOR_NUM > 49) #warning Unknown ICU version. Check binary compatibility issues for rdar://problem/6024743 #endif #endif #endif #define HAS_ICU_BUG_6024743 (1) #define HAS_ICU_BUG_6025527 (1) CF_PRIVATE CFIndex __CFStringEncodingICUToBytes(const char *icuName, uint32_t flags, const UniChar *characters, CFIndex numChars, CFIndex *usedCharLen, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) { UConverter *converter; UErrorCode errorCode = U_ZERO_ERROR; const UTF16Char *source = characters; const UTF16Char *sourceLimit = source + numChars; char *destination = (char *)bytes; const char *destinationLimit = destination + maxByteLen; bool flush = ((0 == (flags & kCFStringEncodingPartialInput)) ? true : false); CFIndex status; if (NULL == (converter = __CFStringEncodingConverterCreateICUConverter(icuName, flags, false))) return kCFStringEncodingConverterUnavailable; if (0 == maxByteLen) { char buffer[MAX_BUFFER_SIZE]; CFIndex totalLength = 0; while ((source < sourceLimit) && (U_ZERO_ERROR == errorCode)) { destination = buffer; destinationLimit = destination + MAX_BUFFER_SIZE; ucnv_fromUnicode(converter, &destination, destinationLimit, (const UChar **)&source, (const UChar *)sourceLimit, NULL, flush, &errorCode); totalLength += (destination - buffer); if (U_BUFFER_OVERFLOW_ERROR == errorCode) errorCode = U_ZERO_ERROR; } if (NULL != usedByteLen) *usedByteLen = totalLength; } else { ucnv_fromUnicode(converter, &destination, destinationLimit, (const UChar **)&source, (const UChar *)sourceLimit, NULL, flush, &errorCode); #if HAS_ICU_BUG_6024743 /* Another critical ICU design issue. Similar to conversion error, source pointer returned from U_BUFFER_OVERFLOW_ERROR is already beyond the last valid character position. It renders the returned value from source entirely unusable. We have to manually back up until succeeding Intrestingly, this issue doesn't apply to ucnv_toUnicode. The asynmmetric nature makes this more dangerous */ if (U_BUFFER_OVERFLOW_ERROR == errorCode) { const uint8_t *bitmap = CFUniCharGetBitmapPtrForPlane(kCFUniCharNonBaseCharacterSet, 0); const uint8_t *nonBase; UTF32Char character; do { // Since the output buffer is filled, we can assume no invalid chars (including stray surrogates) do { sourceLimit = (source - 1); character = *sourceLimit; nonBase = bitmap; if (CFUniCharIsSurrogateLowCharacter(character)) { --sourceLimit; character = CFUniCharGetLongCharacterForSurrogatePair(*sourceLimit, character); nonBase = CFUniCharGetBitmapPtrForPlane(kCFUniCharNonBaseCharacterSet, (character >> 16) & 0x000F); character &= 0xFFFF; } } while ((sourceLimit > characters) && CFUniCharIsMemberOfBitmap(character, nonBase)); if (sourceLimit > characters) { source = characters; destination = (char *)bytes; errorCode = U_ZERO_ERROR; ucnv_resetFromUnicode(converter); ucnv_fromUnicode(converter, &destination, destinationLimit, (const UChar **)&source, (const UChar *)sourceLimit, NULL, flush, &errorCode); } } while (U_BUFFER_OVERFLOW_ERROR == errorCode); errorCode = U_BUFFER_OVERFLOW_ERROR; } #endif if (NULL != usedByteLen) *usedByteLen = destination - (const char *)bytes; } status = ((U_ZERO_ERROR == errorCode) ? kCFStringEncodingConversionSuccess : ((U_BUFFER_OVERFLOW_ERROR == errorCode) ? kCFStringEncodingInsufficientOutputBufferLength : kCFStringEncodingInvalidInputStream)); if (NULL != usedCharLen) { #if HAS_ICU_BUG_6024743 /* ICU has a serious behavioral inconsistency issue that the source pointer returned from ucnv_fromUnicode() is after illegal input. We have to keep track of any changes in this area in order to prevent future binary compatiibility issues */ if (kCFStringEncodingInvalidInputStream == status) { #define MAX_ERROR_BUFFER_LEN (32) UTF16Char errorBuffer[MAX_ERROR_BUFFER_LEN]; int8_t errorLength = MAX_ERROR_BUFFER_LEN; #undef MAX_ERROR_BUFFER_LEN errorCode = U_ZERO_ERROR; ucnv_getInvalidUChars(converter, (UChar *)errorBuffer, &errorLength, &errorCode); if (U_ZERO_ERROR == errorCode) { source -= errorLength; } else { // Gah, something is terribly wrong. Reset everything source = characters; // 0 length if (NULL != usedByteLen) *usedByteLen = 0; } } #endif *usedCharLen = source - characters; } status |= __CFStringEncodingConverterReleaseICUConverter(converter, flags, status); return status; } CF_PRIVATE CFIndex __CFStringEncodingICUToUnicode(const char *icuName, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, CFIndex *usedByteLen, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) { UConverter *converter; UErrorCode errorCode = U_ZERO_ERROR; const char *source = (const char *)bytes; const char *sourceLimit = source + numBytes; UTF16Char *destination = characters; const UTF16Char *destinationLimit = destination + maxCharLen; bool flush = ((0 == (flags & kCFStringEncodingPartialInput)) ? true : false); CFIndex status; if (NULL == (converter = __CFStringEncodingConverterCreateICUConverter(icuName, flags, true))) return kCFStringEncodingConverterUnavailable; if (0 == maxCharLen) { UTF16Char buffer[MAX_BUFFER_SIZE]; CFIndex totalLength = 0; while ((source < sourceLimit) && (U_ZERO_ERROR == errorCode)) { destination = buffer; destinationLimit = destination + MAX_BUFFER_SIZE; ucnv_toUnicode(converter, (UChar **)&destination, (const UChar *)destinationLimit, &source, sourceLimit, NULL, flush, &errorCode); totalLength += (destination - buffer); if (U_BUFFER_OVERFLOW_ERROR == errorCode) errorCode = U_ZERO_ERROR; } if (NULL != usedCharLen) *usedCharLen = totalLength; } else { ucnv_toUnicode(converter, (UChar **)&destination, (const UChar *)destinationLimit, &source, sourceLimit, NULL, flush, &errorCode); if (NULL != usedCharLen) *usedCharLen = destination - characters; } status = ((U_ZERO_ERROR == errorCode) ? kCFStringEncodingConversionSuccess : ((U_BUFFER_OVERFLOW_ERROR == errorCode) ? kCFStringEncodingInsufficientOutputBufferLength : kCFStringEncodingInvalidInputStream)); if (NULL != usedByteLen) { #if HAS_ICU_BUG_6024743 /* ICU has a serious behavioral inconsistency issue that the source pointer returned from ucnv_toUnicode() is after illegal input. We have to keep track of any changes in this area in order to prevent future binary compatiibility issues */ if (kCFStringEncodingInvalidInputStream == status) { #define MAX_ERROR_BUFFER_LEN (32) char errorBuffer[MAX_ERROR_BUFFER_LEN]; int8_t errorLength = MAX_ERROR_BUFFER_LEN; #undef MAX_ERROR_BUFFER_LEN errorCode = U_ZERO_ERROR; ucnv_getInvalidChars(converter, errorBuffer, &errorLength, &errorCode); if (U_ZERO_ERROR == errorCode) { #if HAS_ICU_BUG_6025527 // Another ICU oddness here. ucnv_getInvalidUChars() writes the '\0' terminator, and errorLength includes the extra byte. if ((errorLength > 0) && ('\0' == errorBuffer[errorLength - 1])) --errorLength; #endif source -= errorLength; } else { // Gah, something is terribly wrong. Reset everything source = (const char *)bytes; // 0 length if (NULL != usedCharLen) *usedCharLen = 0; } } #endif *usedByteLen = source - (const char *)bytes; } status |= __CFStringEncodingConverterReleaseICUConverter(converter, flags, status); return status; } CF_PRIVATE CFIndex __CFStringEncodingICUCharLength(const char *icuName, uint32_t flags, const uint8_t *bytes, CFIndex numBytes) { CFIndex usedCharLen; return (__CFStringEncodingICUToUnicode(icuName, flags, bytes, numBytes, NULL, NULL, 0, &usedCharLen) == kCFStringEncodingConversionSuccess ? usedCharLen : 0); } CF_PRIVATE CFIndex __CFStringEncodingICUByteLength(const char *icuName, uint32_t flags, const UniChar *characters, CFIndex numChars) { CFIndex usedByteLen; return (__CFStringEncodingICUToBytes(icuName, flags, characters, numChars, NULL, NULL, 0, &usedByteLen) == kCFStringEncodingConversionSuccess ? usedByteLen : 0); } CF_PRIVATE CFStringEncoding *__CFStringEncodingCreateICUEncodings(CFAllocatorRef allocator, CFIndex *numberOfIndex) { CFIndex count = ucnv_countAvailable(); CFIndex numEncodings = 0; CFStringEncoding *encodings; CFStringEncoding encoding; CFIndex index; if (0 == count) return NULL; encodings = (CFStringEncoding *)CFAllocatorAllocate(NULL, sizeof(CFStringEncoding) * count, 0); for (index = 0;index < count;index++) { encoding = __CFStringEncodingGetFromICUName(ucnv_getAvailableName(index)); if (kCFStringEncodingInvalidId != encoding) encodings[numEncodings++] = encoding; } if (0 == numEncodings) { CFAllocatorDeallocate(allocator, encodings); encodings = NULL; } *numberOfIndex = numEncodings; return encodings; }