2010-06-11 19:14:37 +00:00
|
|
|
/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 4 -*-
|
|
|
|
* ***** BEGIN LICENSE BLOCK *****
|
|
|
|
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
|
|
|
|
*
|
|
|
|
* The contents of this file are subject to the Mozilla Public License Version
|
|
|
|
* 1.1 (the "License"); you may not use this file except in compliance with
|
|
|
|
* the License. You may obtain a copy of the License at
|
|
|
|
* http://www.mozilla.org/MPL/
|
|
|
|
*
|
|
|
|
* Software distributed under the License is distributed on an "AS IS" basis,
|
|
|
|
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
|
|
|
* for the specific language governing rights and limitations under the
|
|
|
|
* License.
|
|
|
|
*
|
|
|
|
* The Original Code is Mozilla Corporation code.
|
|
|
|
*
|
|
|
|
* The Initial Developer of the Original Code is Mozilla Corporation.
|
|
|
|
* Portions created by the Initial Developer are Copyright (C) 2009-2010
|
|
|
|
* the Initial Developer. All Rights Reserved.
|
|
|
|
*
|
|
|
|
* Contributor(s):
|
|
|
|
* Jonathan Kew <jfkthame@gmail.com>
|
|
|
|
*
|
|
|
|
* Alternatively, the contents of this file may be used under the terms of
|
|
|
|
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
|
|
|
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
|
|
|
* in which case the provisions of the GPL or the LGPL are applicable instead
|
|
|
|
* of those above. If you wish to allow use of your version of this file only
|
|
|
|
* under the terms of either the GPL or the LGPL, and not to allow others to
|
|
|
|
* use your version of this file under the terms of the MPL, indicate your
|
|
|
|
* decision by deleting the provisions above and replace them with the notice
|
|
|
|
* and other provisions required by the GPL or the LGPL. If you do not delete
|
|
|
|
* the provisions above, a recipient may use your version of this file under
|
|
|
|
* the terms of any one of the MPL, the GPL or the LGPL.
|
|
|
|
*
|
|
|
|
* ***** END LICENSE BLOCK ***** */
|
|
|
|
|
2012-02-24 10:15:46 +00:00
|
|
|
#include "nsUnicodeProperties.h"
|
|
|
|
#include "nsUnicodeScriptCodes.h"
|
|
|
|
#include "nsUnicodePropertyData.cpp"
|
2010-06-11 19:14:37 +00:00
|
|
|
|
2012-02-14 08:03:01 +00:00
|
|
|
#include "mozilla/Util.h"
|
|
|
|
#include "nsMemory.h"
|
|
|
|
|
2010-06-11 19:14:37 +00:00
|
|
|
#include "harfbuzz/hb-unicode.h"
|
|
|
|
|
|
|
|
#define UNICODE_BMP_LIMIT 0x10000
|
|
|
|
#define UNICODE_LIMIT 0x110000
|
|
|
|
|
2012-02-24 10:15:46 +00:00
|
|
|
namespace mozilla {
|
|
|
|
|
|
|
|
namespace unicode {
|
|
|
|
|
2010-06-11 19:14:37 +00:00
|
|
|
/*
|
|
|
|
To store properties for a million Unicode codepoints compactly, we use
|
|
|
|
a three-level array structure, with the Unicode values considered as
|
|
|
|
three elements: Plane, Page, and Char.
|
|
|
|
|
|
|
|
Space optimization happens because multiple Planes can refer to the same
|
|
|
|
Page array, and multiple Pages can refer to the same Char array holding
|
|
|
|
the actual values. In practice, most of the higher planes are empty and
|
|
|
|
thus share the same data; and within the BMP, there are also many pages
|
|
|
|
that repeat the same data for any given property.
|
|
|
|
|
|
|
|
Plane is usually zero, so we skip a lookup in this case, and require
|
|
|
|
that the Plane 0 pages are always the first set of entries in the Page
|
|
|
|
array.
|
|
|
|
|
|
|
|
The division of the remaining 16 bits into Page and Char fields is
|
|
|
|
adjusted for each property (by experiment using the generation tool)
|
|
|
|
to provide the most compact storage, depending on the distribution
|
|
|
|
of values.
|
|
|
|
*/
|
|
|
|
|
2012-02-24 10:15:46 +00:00
|
|
|
// Lookup table that collapses a detailed general category into one of the
// coarse nsIUGenCategory classes.
//
// The entries are listed in the order of the HB_UNICODE_GENERAL_CATEGORY_*
// constants of the hb_unicode_general_category_t enum (see
// gfx/harfbuzz/src/hb-common.h); the trailing comment on each entry names
// the constant it corresponds to. Keep the two in step.
nsIUGenCategory::nsUGenCategory sDetailedToGeneralCategory[] = {
    // --- Other ---
    nsIUGenCategory::kOther,        // CONTROL
    nsIUGenCategory::kOther,        // FORMAT
    nsIUGenCategory::kOther,        // UNASSIGNED
    nsIUGenCategory::kOther,        // PRIVATE_USE
    nsIUGenCategory::kOther,        // SURROGATE

    // --- Letter ---
    nsIUGenCategory::kLetter,       // LOWERCASE_LETTER
    nsIUGenCategory::kLetter,       // MODIFIER_LETTER
    nsIUGenCategory::kLetter,       // OTHER_LETTER
    nsIUGenCategory::kLetter,       // TITLECASE_LETTER
    nsIUGenCategory::kLetter,       // UPPERCASE_LETTER

    // --- Mark ---
    nsIUGenCategory::kMark,         // COMBINING_MARK
    nsIUGenCategory::kMark,         // ENCLOSING_MARK
    nsIUGenCategory::kMark,         // NON_SPACING_MARK

    // --- Number ---
    nsIUGenCategory::kNumber,       // DECIMAL_NUMBER
    nsIUGenCategory::kNumber,       // LETTER_NUMBER
    nsIUGenCategory::kNumber,       // OTHER_NUMBER

    // --- Punctuation ---
    nsIUGenCategory::kPunctuation,  // CONNECT_PUNCTUATION
    nsIUGenCategory::kPunctuation,  // DASH_PUNCTUATION
    nsIUGenCategory::kPunctuation,  // CLOSE_PUNCTUATION
    nsIUGenCategory::kPunctuation,  // FINAL_PUNCTUATION
    nsIUGenCategory::kPunctuation,  // INITIAL_PUNCTUATION
    nsIUGenCategory::kPunctuation,  // OTHER_PUNCTUATION
    nsIUGenCategory::kPunctuation,  // OPEN_PUNCTUATION

    // --- Symbol ---
    nsIUGenCategory::kSymbol,       // CURRENCY_SYMBOL
    nsIUGenCategory::kSymbol,       // MODIFIER_SYMBOL
    nsIUGenCategory::kSymbol,       // MATH_SYMBOL
    nsIUGenCategory::kSymbol,       // OTHER_SYMBOL

    // --- Separator ---
    nsIUGenCategory::kSeparator,    // LINE_SEPARATOR
    nsIUGenCategory::kSeparator,    // PARAGRAPH_SEPARATOR
    nsIUGenCategory::kSeparator     // SPACE_SEPARATOR
};
|
|
|
|
|
2010-06-11 19:14:37 +00:00
|
|
|
// Look up the mirrored form of aCh in the generated mirroring tables.
// Code points at or above UNICODE_BMP_LIMIT are returned unchanged, since
// all mirrored characters live in plane 0.
PRUint32
GetMirroredChar(PRUint32 aCh)
{
    // Nothing outside the BMP has a mirrored form.
    if (aCh >= UNICODE_BMP_LIMIT) {
        return aCh;
    }

    const PRUint32 pageIndex = aCh >> kMirrorCharBits;
    const PRUint32 charIndex = aCh & ((1 << kMirrorCharBits) - 1);
    int mirror = sMirrorValues[sMirrorPages[0][pageIndex]][charIndex];

    // The table entry is interpreted in one of two ways so that the
    // mirrored codes can be stored as 8-bit values:
    //  - entries below kSmallMirrorOffset are an offset to add to the
    //    input character code (most mirrors are nearby characters);
    //  - any other entry is an index, biased by kSmallMirrorOffset,
    //    into the sDistantMirrors list.
    if (mirror >= kSmallMirrorOffset) {
        return sDistantMirrors[mirror - kSmallMirrorOffset];
    }
    return aCh + mirror;
}
|
|
|
|
|
|
|
|
// Return the combining-class table entry for aCh.
// Values at or above UNICODE_LIMIT are not valid code points: they trigger
// NS_NOTREACHED and yield 0.
PRUint8
GetCombiningClass(PRUint32 aCh)
{
    if (aCh >= UNICODE_LIMIT) {
        NS_NOTREACHED("invalid Unicode character!");
        return 0;
    }

    // Split the low 16 bits into a page index and an index within the page.
    // (For BMP characters, aCh & 0xffff is simply aCh.)
    const PRUint32 charIndex = aCh & ((1 << kCClassCharBits) - 1);
    const PRUint32 pageIndex = (aCh & 0xffff) >> kCClassCharBits;

    // Plane 0 always uses the first row of the page table; the other
    // planes are mapped to a row through sCClassPlanes.
    const PRUint32 planeRow =
        (aCh < UNICODE_BMP_LIMIT) ? 0 : sCClassPlanes[(aCh >> 16) - 1];

    return sCClassValues[sCClassPages[planeRow][pageIndex]][charIndex];
}
|
|
|
|
|
|
|
|
// Return the mCategory field of the category/East-Asian-width table entry
// for aCh.
// Values at or above UNICODE_LIMIT are not valid code points: they trigger
// NS_NOTREACHED and yield HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED.
PRUint8
GetGeneralCategory(PRUint32 aCh)
{
    if (aCh >= UNICODE_LIMIT) {
        NS_NOTREACHED("invalid Unicode character!");
        return PRUint8(HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED);
    }

    // Split the low 16 bits into a page index and an index within the page.
    // (For BMP characters, aCh & 0xffff is simply aCh.)
    const PRUint32 charIndex = aCh & ((1 << kCatEAWCharBits) - 1);
    const PRUint32 pageIndex = (aCh & 0xffff) >> kCatEAWCharBits;

    // Plane 0 always uses the first row of the page table; the other
    // planes are mapped to a row through sCatEAWPlanes.
    const PRUint32 planeRow =
        (aCh < UNICODE_BMP_LIMIT) ? 0 : sCatEAWPlanes[(aCh >> 16) - 1];

    return sCatEAWValues[sCatEAWPages[planeRow][pageIndex]][charIndex]
               .mCategory;
}
|
|
|
|
|
|
|
|
// Return the mEAW field of the category/East-Asian-width table entry for
// aCh.
// Values at or above UNICODE_LIMIT are not valid code points: they trigger
// NS_NOTREACHED and yield 0.
PRUint8
GetEastAsianWidth(PRUint32 aCh)
{
    if (aCh >= UNICODE_LIMIT) {
        NS_NOTREACHED("invalid Unicode character!");
        return 0;
    }

    // Split the low 16 bits into a page index and an index within the page.
    // (For BMP characters, aCh & 0xffff is simply aCh.)
    const PRUint32 charIndex = aCh & ((1 << kCatEAWCharBits) - 1);
    const PRUint32 pageIndex = (aCh & 0xffff) >> kCatEAWCharBits;

    // Plane 0 always uses the first row of the page table; the other
    // planes are mapped to a row through sCatEAWPlanes.
    const PRUint32 planeRow =
        (aCh < UNICODE_BMP_LIMIT) ? 0 : sCatEAWPlanes[(aCh >> 16) - 1];

    return sCatEAWValues[sCatEAWPages[planeRow][pageIndex]][charIndex].mEAW;
}
|
|
|
|
|
|
|
|
// Return the script-table entry for aCh.
// Values at or above UNICODE_LIMIT are not valid code points: they trigger
// NS_NOTREACHED and yield MOZ_SCRIPT_UNKNOWN.
PRInt32
GetScriptCode(PRUint32 aCh)
{
    if (aCh >= UNICODE_LIMIT) {
        NS_NOTREACHED("invalid Unicode character!");
        return MOZ_SCRIPT_UNKNOWN;
    }

    // Split the low 16 bits into a page index and an index within the page.
    // (For BMP characters, aCh & 0xffff is simply aCh.)
    const PRUint32 charIndex = aCh & ((1 << kScriptCharBits) - 1);
    const PRUint32 pageIndex = (aCh & 0xffff) >> kScriptCharBits;

    // Plane 0 always uses the first row of the page table; the other
    // planes are mapped to a row through sScriptPlanes.
    const PRUint32 planeRow =
        (aCh < UNICODE_BMP_LIMIT) ? 0 : sScriptPlanes[(aCh >> 16) - 1];

    return sScriptValues[sScriptPages[planeRow][pageIndex]][charIndex];
}
|
|
|
|
|
|
|
|
// Map a script code to its entry in sScriptCodeToTag.
// Returns 0 when aScriptCode is negative or is not a valid index into the
// table.
PRUint32
GetScriptTagForCode(PRInt32 aScriptCode)
{
    // Converting to unsigned turns any negative code into a very large
    // value, so one comparison rejects both negative and too-large codes.
    // The comparison must be >=: an index equal to the array length is
    // already one element past the end of the table.
    if (PRUint32(aScriptCode) >= ArrayLength(sScriptCodeToTag)) {
        return 0;
    }
    return sScriptCodeToTag[aScriptCode];
}
|
2009-10-07 17:16:52 +00:00
|
|
|
|
2012-02-24 10:15:46 +00:00
|
|
|
// Return the Hangul-table entry for aCh as an HSType.
// Code points at or above UNICODE_BMP_LIMIT yield HST_NONE, since all
// Hangul characters are in plane 0.
HSType
GetHangulSyllableType(PRUint32 aCh)
{
    // Nothing outside the BMP is a Hangul character.
    if (aCh >= UNICODE_BMP_LIMIT) {
        return HST_NONE;
    }

    const PRUint32 pageIndex = aCh >> kHangulCharBits;
    const PRUint32 charIndex = aCh & ((1 << kHangulCharBits) - 1);
    return HSType(sHangulValues[sHangulPages[0][pageIndex]][charIndex]);
}
|
|
|
|
|
2009-10-07 17:16:52 +00:00
|
|
|
// TODO: replace this with a properties file or similar;
|
|
|
|
// expect this to evolve as harfbuzz shaping support matures.
|
|
|
|
//
|
2011-04-11 11:17:31 +00:00
|
|
|
// The "shaping type" of each script run, as returned by this
|
|
|
|
// function, is compared to the bits set in the
|
|
|
|
// gfx.font_rendering.harfbuzz.scripts
|
2009-10-07 17:16:52 +00:00
|
|
|
// preference to decide whether to use the harfbuzz shaper.
|
|
|
|
//
|
|
|
|
PRInt32
|
2012-02-24 10:15:46 +00:00
|
|
|
ScriptShapingType(PRInt32 aScriptCode)
|
2009-10-07 17:16:52 +00:00
|
|
|
{
|
|
|
|
switch (aScriptCode) {
|
2010-11-20 17:49:12 +00:00
|
|
|
default:
|
2011-04-11 11:17:31 +00:00
|
|
|
return SHAPING_DEFAULT; // scripts not explicitly listed here are
|
|
|
|
// assumed to just use default shaping
|
2010-11-20 17:49:12 +00:00
|
|
|
|
2012-02-14 08:03:01 +00:00
|
|
|
case MOZ_SCRIPT_ARABIC:
|
|
|
|
case MOZ_SCRIPT_SYRIAC:
|
|
|
|
case MOZ_SCRIPT_NKO:
|
|
|
|
case MOZ_SCRIPT_MANDAIC:
|
2011-04-11 11:17:31 +00:00
|
|
|
return SHAPING_ARABIC; // bidi scripts with Arabic-style shaping
|
2009-10-07 17:16:52 +00:00
|
|
|
|
2012-02-14 08:03:01 +00:00
|
|
|
case MOZ_SCRIPT_HEBREW:
|
2011-04-11 11:17:31 +00:00
|
|
|
return SHAPING_HEBREW;
|
|
|
|
|
2012-02-14 08:03:01 +00:00
|
|
|
case MOZ_SCRIPT_HANGUL:
|
2011-04-11 11:17:31 +00:00
|
|
|
return SHAPING_HANGUL;
|
|
|
|
|
2012-02-14 08:03:01 +00:00
|
|
|
case MOZ_SCRIPT_MONGOLIAN: // to be supported by the Arabic shaper?
|
2011-04-11 11:17:31 +00:00
|
|
|
return SHAPING_MONGOLIAN;
|
|
|
|
|
2012-02-14 08:03:01 +00:00
|
|
|
case MOZ_SCRIPT_THAI: // no complex OT features, but MS engines like to do
|
|
|
|
// sequence checking
|
2011-07-26 09:47:36 +00:00
|
|
|
return SHAPING_THAI;
|
|
|
|
|
2012-02-14 08:03:01 +00:00
|
|
|
case MOZ_SCRIPT_BENGALI:
|
|
|
|
case MOZ_SCRIPT_DEVANAGARI:
|
|
|
|
case MOZ_SCRIPT_GUJARATI:
|
|
|
|
case MOZ_SCRIPT_GURMUKHI:
|
|
|
|
case MOZ_SCRIPT_KANNADA:
|
|
|
|
case MOZ_SCRIPT_MALAYALAM:
|
|
|
|
case MOZ_SCRIPT_ORIYA:
|
|
|
|
case MOZ_SCRIPT_SINHALA:
|
|
|
|
case MOZ_SCRIPT_TAMIL:
|
|
|
|
case MOZ_SCRIPT_TELUGU:
|
|
|
|
case MOZ_SCRIPT_KHMER:
|
|
|
|
case MOZ_SCRIPT_LAO:
|
|
|
|
case MOZ_SCRIPT_TIBETAN:
|
|
|
|
case MOZ_SCRIPT_NEW_TAI_LUE:
|
|
|
|
case MOZ_SCRIPT_TAI_LE:
|
|
|
|
case MOZ_SCRIPT_MYANMAR:
|
|
|
|
case MOZ_SCRIPT_PHAGS_PA:
|
|
|
|
case MOZ_SCRIPT_BATAK:
|
|
|
|
case MOZ_SCRIPT_BRAHMI:
|
2011-04-11 11:17:31 +00:00
|
|
|
return SHAPING_INDIC; // scripts that require Indic or other "special" shaping
|
2010-11-20 17:49:12 +00:00
|
|
|
}
|
2009-10-07 17:16:52 +00:00
|
|
|
}
|
2012-02-24 10:15:46 +00:00
|
|
|
|
|
|
|
} // end namespace unicode
|
|
|
|
|
|
|
|
} // end namespace mozilla
|