gecko-dev/intl/unicharutil/util/IrishCasing.cpp

250 lines
9.7 KiB
C++

/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
/******************************************************************************
This file provides a finite state machine to support Irish Gaelic uppercasing
rules.
The caller will need to iterate through a string, passing a State variable
along with the current character to each UpperCase call and checking the flags
that are returned:
If aMarkPos is true, caller must remember the current index in the string as
a possible target for a future action.
If aAction is non-zero, then one or more characters from the marked index are
to be modified:
1 lowercase the marked letter
2 lowercase the marked letter and its successor
3 lowercase the marked letter, and delete its successor
### Rules from https://bugzilla.mozilla.org/show_bug.cgi?id=1014639,
### comments 1 and 4:
v = [a,á,e,é,i,í,o,ó,u,ú]
V = [A,Á,E,É,I,Í,O,Ó,U,Ú]
bhf -> bhF
bhF -> bhF
bp -> bP
bP -> bP
dt -> dT
dT -> dT
gc -> gC
gC -> gC
h{V} -> h{V}
mb -> mB
mB -> mB
n-{v} -> n{V}
n{V} -> n{V}
nd -> nD
nD -> nD
ng -> nG
nG -> nG
t-{v} -> t{V}
t{V} -> t{V}
ts{v} -> tS{V}
tS{v} -> tS{V}
tS{V} -> tS{V}
tsl -> tSL
tSl -> tSL
tSL -> tSL
tsn -> tSN
tSn -> tSN
tSN -> tSN
tsr -> tSR
tSr -> tSR
tSR -> tSR
### Create table of states and actions for each input class.
Start (non-word) state is #; generic in-word state is _, once we know there's
no special action to do in this word.
# _ b bh d g h m n n- t t- ts
input\state
b b' _ _ _ _ _ _ 1 _ _ _ _ _
B _ _ _ _ _ _ _ 1 _ _ _ _ _
c _ _ _ _ _ 1 _ _ _ _ _ _ _
C _ _ _ _ _ 1 _ _ _ _ _ _ _
d d' _ _ _ _ _ _ _ 1 _ _ _ _
D _ _ _ _ _ _ _ _ 1 _ _ _ _
f _ _ _ 2 _ _ _ _ _ _ _ _ _
F _ _ _ 2 _ _ _ _ _ _ _ _ _
g g' _ _ _ _ _ _ _ 1 _ _ _ _
G _ _ _ _ _ _ _ _ 1 _ _ _ _
h h' _ bh _ _ _ _ _ _ _ _ _ _
l _ _ _ _ _ _ _ _ _ _ _ _ 1
L _ _ _ _ _ _ _ _ _ _ _ _ 1
m m' _ _ _ _ _ _ _ _ _ _ _ _
n n' _ _ _ _ _ _ _ _ _ _ _ 1
N _ _ _ _ _ _ _ _ _ _ _ _ 1
p _ _ 1 _ _ _ _ _ _ _ _ _ _
P _ _ 1 _ _ _ _ _ _ _ _ _ _
r _ _ _ _ _ _ _ _ _ _ _ _ 1
R _ _ _ _ _ _ _ _ _ _ _ _ 1
s _ _ _ _ _ _ _ _ _ _ ts _ _
S _ _ _ _ _ _ _ _ _ _ ts _ _
t t' _ _ _ 1 _ _ _ _ _ _ _ _
T _ _ _ _ 1 _ _ _ _ _ _ _ _
vowel _ _ _ _ _ _ _ _ _ 1d _ 1d 1
Vowel _ _ _ _ _ _ 1 _ 1 _ 1 _ 1
hyph _ _ _ _ _ _ _ _ n- _ t- _ _
letter _ _ _ _ _ _ _ _ _ _ _ _ _
other # # # # # # # # # # # # #
Actions:
1 lowercase one letter at start of word
2 lowercase two letters at start of word
1d lowercase one letter at start of word, and delete next
(and then go to state _, nothing further to do in this word)
else just go to the given state; suffix ' indicates mark start-of-word.
### Consolidate identical states and classes:
0 1 2 3 4 5 6 7 8 9 A B
# _ b bh d g h m n [nt]- t ts
input\state
b b' _ _ _ _ _ _ 1 _ _ _ _
B _ _ _ _ _ _ _ 1 _ _ _ _
[cC] _ _ _ _ _ 1 _ _ _ _ _ _
d d' _ _ _ _ _ _ _ 1 _ _ _
[DG] _ _ _ _ _ _ _ _ 1 _ _ _
[fF] _ _ _ 2 _ _ _ _ _ _ _ _
g g' _ _ _ _ _ _ _ 1 _ _ _
h h' _ bh _ _ _ _ _ _ _ _ _
[lLNrR] _ _ _ _ _ _ _ _ _ _ _ 1
m m' _ _ _ _ _ _ _ _ _ _ _
n n' _ _ _ _ _ _ _ _ _ _ 1
[pP] _ _ 1 _ _ _ _ _ _ _ _ _
[sS] _ _ _ _ _ _ _ _ _ _ ts _
t t' _ _ _ 1 _ _ _ _ _ _ _
T _ _ _ _ 1 _ _ _ _ _ _ _
vowel _ _ _ _ _ _ _ _ _ 1d _ 1
Vowel _ _ _ _ _ _ 1 _ 1 _ 1 1
hyph _ _ _ _ _ _ _ _ [nt-] _ [nt-] _
letter _ _ _ _ _ _ _ _ _ _ _ _
other # # # # # # # # # # # #
So we have 20 input classes, and 12 states.
State table array will contain bytes that encode action and new state:
0x80 - bit flag: mark start-of-word position
0x40 - currently unused
0x30 - action mask: 4 values
0x00 - do nothing
0x10 - lowercase one letter
0x20 - lowercase two letters
0x30 - lowercase one, delete one
0x0F - next-state mask
******************************************************************************/
#include "IrishCasing.h"
#include "nsUnicodeProperties.h"
#include "nsUnicharUtils.h"
namespace mozilla {
const uint8_t
IrishCasing::sUppercaseStateTable[kNumClasses][kNumStates] = {
// # _ b bh d g h m n [nt]- t ts
{ 0x82, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01, 0x01 }, // b
{ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01, 0x01 }, // B
{ 0x01, 0x01, 0x01, 0x01, 0x01, 0x10, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 }, // [cC]
{ 0x84, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01 }, // d
{ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01 }, // [DG]
{ 0x01, 0x01, 0x01, 0x21, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 }, // [fF]
{ 0x85, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01 }, // g
{ 0x86, 0x01, 0x03, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 }, // h
{ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11 }, // [lLNrR]
{ 0x87, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 }, // m
{ 0x88, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11 }, // n
{ 0x01, 0x01, 0x11, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 }, // [pP]
{ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x0B, 0x01 }, // [sS]
{ 0x8A, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 }, // t
{ 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 }, // T
{ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x31, 0x01, 0x11 }, // vowel
{ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x11, 0x01, 0x11, 0x01, 0x11, 0x11 }, // Vowel
{ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x09, 0x01, 0x09, 0x01 }, // hyph
{ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 }, // letter
{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 } // other
};
#define HYPHEN 0x2010
#define NO_BREAK_HYPHEN 0x2011
#define a_ACUTE 0x00e1
#define e_ACUTE 0x00e9
#define i_ACUTE 0x00ed
#define o_ACUTE 0x00f3
#define u_ACUTE 0x00fa
#define A_ACUTE 0x00c1
#define E_ACUTE 0x00c9
#define I_ACUTE 0x00cd
#define O_ACUTE 0x00d3
#define U_ACUTE 0x00da
const uint8_t IrishCasing::sLcClasses[26] = {
kClass_vowel, kClass_b, kClass_cC, kClass_d, kClass_vowel,
kClass_fF, kClass_g, kClass_h, kClass_vowel, kClass_letter,
kClass_letter, kClass_lLNrR, kClass_m, kClass_n, kClass_vowel,
kClass_pP, kClass_letter, kClass_lLNrR, kClass_sS, kClass_t,
kClass_vowel, kClass_letter, kClass_letter, kClass_letter, kClass_letter,
kClass_letter
};
const uint8_t IrishCasing::sUcClasses[26] = {
kClass_Vowel, kClass_B, kClass_cC, kClass_DG, kClass_Vowel,
kClass_fF, kClass_DG, kClass_letter, kClass_Vowel, kClass_letter,
kClass_letter, kClass_lLNrR, kClass_letter, kClass_lLNrR, kClass_Vowel,
kClass_pP, kClass_letter, kClass_lLNrR, kClass_sS, kClass_T,
kClass_Vowel, kClass_letter, kClass_letter, kClass_letter, kClass_letter,
kClass_letter
};
uint8_t
IrishCasing::GetClass(uint32_t aCh)
{
using mozilla::unicode::GetGenCategory;
if (aCh >= 'a' && aCh <= 'z') {
return sLcClasses[aCh - 'a'];
} else if (aCh >= 'A' && aCh <= 'Z') {
return sUcClasses[aCh - 'A'];
} else if (GetGenCategory(aCh) == nsIUGenCategory::kLetter) {
if (aCh == a_ACUTE || aCh == e_ACUTE || aCh == i_ACUTE ||
aCh == o_ACUTE || aCh == u_ACUTE) {
return kClass_vowel;
} else if (aCh == A_ACUTE || aCh == E_ACUTE || aCh == I_ACUTE ||
aCh == O_ACUTE || aCh == U_ACUTE) {
return kClass_Vowel;
} else {
return kClass_letter;
}
} else if (aCh == '-' || aCh == HYPHEN || aCh == NO_BREAK_HYPHEN) {
return kClass_hyph;
} else {
return kClass_other;
}
}
uint32_t
IrishCasing::UpperCase(uint32_t aCh, State& aState,
bool& aMarkPos, uint8_t& aAction)
{
uint8_t cls = GetClass(aCh);
uint8_t stateEntry = sUppercaseStateTable[cls][aState];
aMarkPos = !!(stateEntry & kMarkPositionFlag);
aAction = (stateEntry & kActionMask) >> kActionShift;
aState = State(stateEntry & kNextStateMask);
return ToUpperCase(aCh);
}
} // namespace mozilla