Bug 1322992 - Implement locale-specific casing behavior for Lithuanian, and add more WPT tests for it. r=m_kato

Differential Revision: https://phabricator.services.mozilla.com/D32129

--HG--
rename : testing/web-platform/tests/css/css-text/text-transform/reference/text-transform-upperlower-039-ref.html => testing/web-platform/tests/css/css-text/text-transform/reference/text-transform-upperlower-044-ref.html
rename : testing/web-platform/tests/css/css-text/text-transform/text-transform-upperlower-039.html => testing/web-platform/tests/css/css-text/text-transform/text-transform-upperlower-044.html
extra : moz-landing-system : lando
This commit is contained in:
Jonathan Kew 2019-05-27 01:35:03 +00:00
parent 2f31fab6d9
commit 5fe29c880d
7 changed files with 193 additions and 9 deletions

View File

@ -219,11 +219,12 @@ gfxTextRunFactory::Parameters GetParametersForInner(
// exhibit the behavior in question; multiple lang tags may map to the
// same setting here, if the behavior is shared by other languages.
enum LanguageSpecificCasingBehavior {
eLSCB_None, // default non-lang-specific behavior
eLSCB_Dutch, // treat "ij" digraph as a unit for capitalization
eLSCB_Greek, // strip accent when uppercasing Greek vowels
eLSCB_Irish, // keep prefix letters as lowercase when uppercasing Irish
eLSCB_Turkish // preserve dotted/dotless-i distinction in uppercase
eLSCB_None, // default non-lang-specific behavior
eLSCB_Dutch, // treat "ij" digraph as a unit for capitalization
eLSCB_Greek, // strip accent when uppercasing Greek vowels
eLSCB_Irish, // keep prefix letters as lowercase when uppercasing Irish
eLSCB_Turkish, // preserve dotted/dotless-i distinction in uppercase
eLSCB_Lithuanian // retain dot on lowercase i/j when an accent is present
};
static LanguageSpecificCasingBehavior GetCasingFor(const nsAtom* aLang) {
@ -244,6 +245,9 @@ static LanguageSpecificCasingBehavior GetCasingFor(const nsAtom* aLang) {
if (aLang == nsGkAtoms::ga) {
return eLSCB_Irish;
}
if (aLang == nsGkAtoms::lt_) {
return eLSCB_Lithuanian;
}
// Is there a region subtag we should ignore?
nsAtomString langStr(const_cast<nsAtom*>(aLang));
@ -277,6 +281,8 @@ bool nsCaseTransformTextRunFactory::TransformString(
bool prevIsLetter = false;
bool ntPrefix = false; // true immediately after a word-initial 'n' or 't'
// when doing Irish lowercasing
bool seenSoftDotted = false; // true immediately after an I or J that is
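// case-converted in Lithuanian mode (this also covers
// I WITH OGONEK, and applies to uppercasing and
// titlecasing of i/j as well as to lowercasing)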
uint32_t sigmaIndex = uint32_t(-1);
nsUGenCategory cat;
@ -353,6 +359,60 @@ bool nsCaseTransformTextRunFactory::TransformString(
}
}
if (languageSpecificCasing == eLSCB_Lithuanian) {
// clang-format off
/* From SpecialCasing.txt:
* # Introduce an explicit dot above when lowercasing capital I's and J's
* # whenever there are more accents above.
* # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
*
* 0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
* 004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
* 012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
* 00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
* 00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
* 0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
*/
// clang-format on
if (ch == 'I' || ch == 'J' || ch == 0x012E) {
ch = ToLowerCase(ch);
prevIsLetter = true;
seenSoftDotted = true;
sigmaIndex = uint32_t(-1);
break;
}
if (ch == 0x00CC) {
aConvertedString.Append('i');
aConvertedString.Append(0x0307);
extraChars += 2;
ch = 0x0300;
prevIsLetter = true;
seenSoftDotted = false;
sigmaIndex = uint32_t(-1);
break;
}
if (ch == 0x00CD) {
aConvertedString.Append('i');
aConvertedString.Append(0x0307);
extraChars += 2;
ch = 0x0301;
prevIsLetter = true;
seenSoftDotted = false;
sigmaIndex = uint32_t(-1);
break;
}
if (ch == 0x0128) {
aConvertedString.Append('i');
aConvertedString.Append(0x0307);
extraChars += 2;
ch = 0x0303;
prevIsLetter = true;
seenSoftDotted = false;
sigmaIndex = uint32_t(-1);
break;
}
}
cat = mozilla::unicode::GetGenCategory(ch);
if (languageSpecificCasing == eLSCB_Irish &&
@ -371,6 +431,15 @@ bool nsCaseTransformTextRunFactory::TransformString(
ntPrefix = false;
}
if (seenSoftDotted && cat == nsUGenCategory::kMark) {
// The seenSoftDotted flag will only be set in Lithuanian mode.
if (ch == 0x0300 || ch == 0x0301 || ch == 0x0303) {
aConvertedString.Append(0x0307);
++extraChars;
}
}
seenSoftDotted = false;
// Special lowercasing behavior for Greek Sigma: note that this is
// listed as context-sensitive in Unicode's SpecialCasing.txt, but is
// *not* a language-specific mapping; it applies regardless of the
@ -463,6 +532,26 @@ bool nsCaseTransformTextRunFactory::TransformString(
break;
}
if (languageSpecificCasing == eLSCB_Lithuanian) {
/*
* # Remove DOT ABOVE after "i" with upper or titlecase
*
* 0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
*/
if (ch == 'i' || ch == 'j' || ch == 0x012F) {
seenSoftDotted = true;
ch = ToTitleCase(ch);
break;
}
if (seenSoftDotted) {
seenSoftDotted = false;
if (ch == 0x0307) {
ch = uint32_t(-1);
break;
}
}
}
if (languageSpecificCasing == eLSCB_Irish) {
bool mark;
uint8_t action;
@ -565,6 +654,25 @@ bool nsCaseTransformTextRunFactory::TransformString(
capitalizeDutchIJ = true;
break;
}
if (languageSpecificCasing == eLSCB_Lithuanian) {
/*
* # Remove DOT ABOVE after "i" with upper or titlecase
*
* 0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
*/
if (ch == 'i' || ch == 'j' || ch == 0x012F) {
seenSoftDotted = true;
ch = ToTitleCase(ch);
break;
}
if (seenSoftDotted) {
seenSoftDotted = false;
if (ch == 0x0307) {
ch = uint32_t(-1);
break;
}
}
}
mcm = mozilla::unicode::SpecialTitle(ch);
if (mcm) {

View File

@ -1,2 +0,0 @@
[text-transform-upperlower-039.html]
expected: FAIL

View File

@ -17,7 +17,10 @@
</head>
<body>
<p class="instructions">Test passes if both characters in each pair match. If you are missing a font glyph for a character, ignore that pair, but report which characters were ignored.</p>
<div class="test" lang="lt"><span>&#x69;&#x307;&#x300; &#x69;&#x307;&#x300;</span> <span>&#x69;&#x307;&#x301; &#x69;&#x307;&#x301;</span> <span>&#x69;&#x307;&#x303; &#x69;&#x307;&#x303;</span></div>
<div class="test" lang="lt">
<span>&#x69;&#x307;&#x300; &#x69;&#x307;&#x300;</span> <span>&#x69;&#x307;&#x301; &#x69;&#x307;&#x301;</span> <span>&#x69;&#x307;&#x303; &#x69;&#x307;&#x303;</span>
<span>&#x69;&#x307;&#x300; &#x69;&#x307;&#x300;</span> <span>&#x6A;&#x307;&#x301; &#x6A;&#x307;&#x301;</span> <span>&#x12F;&#x307;&#x303; &#x12F;&#x307;&#x303;</span>
</div>
<!--Notes:
The language of the test box is set to Lithuanian (lt)
-->

View File

@ -0,0 +1,33 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8"/>
<title>CSS3 Text, text transform: Lithuanian, uppercase</title>
<link rel='author' title='Jonathan Kew' href='mailto:jkew@mozilla.com'>
<style type='text/css'>
@font-face {
font-family: 'webfont';
src: url('/fonts/DoulosSIL-R.woff') format('woff');
font-weight: normal;
font-style: normal;
}
.test, .ref { font-size: 200%; line-height: 2.5em; font-family: webfont, serif; }
.test span, .ref span { margin-right: 1em; white-space: nowrap; }
</style>
</head>
<body>
<p class="instructions">Test passes if both characters in each pair match. If you are missing a font glyph for a character, ignore that pair, but report which characters were ignored.</p>
<div class="test" lang="lt">
<span>&#x49;&#x300; &#x49;&#x300;</span>
<span>&#x49;&#x301; &#x49;&#x301;</span>
<span>&#x49;&#x303; &#x49;&#x303;</span>
<span>&#x49; &#x49;</span>
<span>&#x4A; &#x4A;</span>
<span>&#x12E; &#x12E;</span>
<span>X&#x307; X&#x307;</span>
</div>
<!--Notes:
The language of the test box is set to Lithuanian (lt)
-->
</body>
</html>

View File

@ -22,7 +22,10 @@
</head>
<body>
<p class="instructions">Test passes if both characters in each pair match. If you are missing a font glyph for a character, ignore that pair, but report which characters were ignored.</p>
<div class="test" lang="lt"><span>&#xCC; &#x69;&#x307;&#x300;</span> <span>&#xCD; &#x69;&#x307;&#x301;</span> <span>&#x128; &#x69;&#x307;&#x303;</span></div>
<div class="test" lang="lt">
<span>&#xCC; &#x69;&#x307;&#x300;</span> <span>&#xCD; &#x69;&#x307;&#x301;</span> <span>&#x128; &#x69;&#x307;&#x303;</span>
<span>&#x49;&#x300; &#x69;&#x307;&#x300;</span> <span>&#x4A;&#x301; &#x6A;&#x307;&#x301;</span> <span>&#x12E;&#x303; &#x12F;&#x307;&#x303;</span>
</div>
<!--Notes:
The language of the test box is set to Lithuanian (lt)
-->

View File

@ -0,0 +1,38 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8"/>
<title>CSS3 Text, text transform: Lithuanian, uppercase</title>
<meta name="assert" content="text-transform: uppercase will uppercase Lithuanian as described in Unicode's SpecialCasing.txt.">
<link rel='author' title='Jonathan Kew' href='mailto:jkew@mozilla.com'>
<link rel='help' href='https://drafts.csswg.org/css-text-3/#text-transform'>
<link rel="match" href="reference/text-transform-upperlower-044-ref.html">
<style type='text/css'>
@font-face {
font-family: 'webfont';
src: url('/fonts/DoulosSIL-R.woff') format('woff');
font-weight: normal;
font-style: normal;
}
.test, .ref { font-size: 200%; line-height: 2.5em; font-family: webfont, serif; }
.test span, .ref span { margin-right: 1em; white-space: nowrap; }
/* the CSS above is not part of the test */
.test { text-transform: uppercase; }
</style>
</head>
<body>
<p class="instructions">Test passes if both characters in each pair match. If you are missing a font glyph for a character, ignore that pair, but report which characters were ignored.</p>
<div class="test" lang="lt">
<span>&#x69;&#x307;&#x300; &#x49;&#x300;</span>
<span>&#x69;&#x307;&#x301; &#x49;&#x301;</span>
<span>&#x69;&#x307;&#x303; &#x49;&#x303;</span>
<span>&#x69;&#x307; &#x49;</span>
<span>&#x6A;&#x307; &#x4A;</span>
<span>&#x12F;&#x307; &#x12E;</span>
<span>x&#x307; X&#x307;</span> <!-- check that dot isn't deleted in other contexts -->
</div>
<!--Notes:
The language of the test box is set to Lithuanian (lt)
-->
</body>
</html>

View File

@ -2118,6 +2118,7 @@ STATIC_ATOMS = [
Atom("crh", "crh"),
# Atom("el", "el"), # "el" is present above
Atom("ga", "ga"),
# Atom("lt", "lt"), # "lt" is present above (atom name "lt_")
Atom("nl", "nl"),
# mathematical language, used for MathML