Bug 1706862 - Make chardetng detect half-width katakana. r=emk

Differential Revision: https://phabricator.services.mozilla.com/D115209
This commit is contained in:
Henri Sivonen 2021-05-31 11:52:17 +00:00
parent 2a324428d6
commit bd9a21c236
8 changed files with 129 additions and 18 deletions

View File

@ -90,7 +90,7 @@ rev = "ed8a4c6f900a90d4dbc1d64b856e61490a1c3570"
[source."https://github.com/hsivonen/chardetng"]
git = "https://github.com/hsivonen/chardetng"
replace-with = "vendored-sources"
rev = "39f95e2f7cd6e632a379cdeee62c68e8cedd7810"
rev = "302c995f91f44cf26e77dc4758ad56c3ff0153ad"
[source."https://github.com/gfx-rs/naga"]
git = "https://github.com/gfx-rs/naga"

2
Cargo.lock generated
View File

@ -571,7 +571,7 @@ checksum = "fd16c4719339c4530435d38e511904438d07cce7950afa3718a84ac36c10e89e"
[[package]]
name = "chardetng"
version = "0.1.9"
source = "git+https://github.com/hsivonen/chardetng?rev=39f95e2f7cd6e632a379cdeee62c68e8cedd7810#39f95e2f7cd6e632a379cdeee62c68e8cedd7810"
source = "git+https://github.com/hsivonen/chardetng?rev=302c995f91f44cf26e77dc4758ad56c3ff0153ad#302c995f91f44cf26e77dc4758ad56c3ff0153ad"
dependencies = [
"encoding_rs",
"memchr",

View File

@ -75,7 +75,7 @@ opt-level = 2
opt-level = 2
[patch.crates-io]
chardetng = { git = "https://github.com/hsivonen/chardetng", rev="39f95e2f7cd6e632a379cdeee62c68e8cedd7810" }
chardetng = { git = "https://github.com/hsivonen/chardetng", rev="302c995f91f44cf26e77dc4758ad56c3ff0153ad" }
chardetng_c = { git = "https://github.com/hsivonen/chardetng_c", rev="ed8a4c6f900a90d4dbc1d64b856e61490a1c3570" }
libudev-sys = { path = "dom/webauthn/libudev-sys" }
packed_simd = { git = "https://github.com/hsivonen/packed_simd", rev="0917fe780032a6bbb23d71be545f9c1834128d75" }

View File

@ -0,0 +1,16 @@
<!doctype html>
<meta charset="utf-8">
<title>ja Shift_JIS late</title>
<script src=/resources/testharness.js></script>
<script src=/resources/testharnessreport.js></script>
<script>
// Completion is signalled by hand via done(), because the result only
// arrives asynchronously as a message from the opened window.
setup({ explicit_done: true });

// Handle the message sent back to this window.
// NOTE(review): presumably the support document posts its own
// document.characterSet; its source is not visible here -- confirm.
window.onmessage = function(event) {
    test(function() {
        assert_equals(event.data, "Shift_JIS", 'Expected Shift_JIS');
    }, "Check detection result");
    // Tidy up the auxiliary window before finishing the harness run.
    w.close();
    done();
};

// Open the document under test only after the handler has been installed.
var w = window.open("support/ja-half-width-late.sub.html");
</script>

View File

@ -0,0 +1,14 @@
<!doctype html>
<title>ja Shift_JIS half-width katakana</title>
<script src=/resources/testharness.js></script>
<script src=/resources/testharnessreport.js></script>
<p>Ê°ÄÞ³ª±Ê°ÄÞ³ª±Ê°ÄÞ³ª±Ê°ÄÞ³ª±Ê°ÄÞ³ª±</p>
<script>
// The assertion runs from the load handler, so the harness is told to
// wait for an explicit done() call.
setup({ explicit_done: true });

// Once the page has loaded, check the encoding the document ended up with.
window.onload = function() {
    test(function() {
        assert_equals(document.characterSet, "Shift_JIS", 'Expected Shift_JIS');
    }, "Check detection result");
    done();
};
</script>

File diff suppressed because one or more lines are too long

View File

@ -1 +1 @@
{"files":{"CONTRIBUTING.md":"0e64fb3dd5a00e3fd528de6442de3f2ca851bd718c45cca0871aaf4eedac9ee1","COPYRIGHT":"2fd0d7e90bd241b79804de129c5b70089988f82a7bbb0fe580a55b67b2968928","Cargo.toml":"ab767659696eb10dbaab743b566910bd29fc8f8f6998d9580494397a8903bd34","LICENSE-APACHE":"cfc7749b96f63bd31c3c42b5c471bf756814053e847c10f3eb003417bc523d30","LICENSE-MIT":"4ad721b5b6a3d39ca3e2202f403d897c4a1d42896486dd58963a81f8e64ef61d","README.md":"a6c97d91989aee4c8afed918340ce6287652cbdd6fed833e20f76367c7953db9","src/data.rs":"be48f1486ef9fc264f6cda2e10944b7dcf8ed0a904b53227340a1384803796c7","src/lib.rs":"456f288602cf7f5f490b2c6541df500db0f53df889a5a4987ae361702afa48bc","src/tld.rs":"295c3c90c60c5bb6edd753b77c261eed10be2d431badda4e02168e740a0f2d7e"},"package":null}
{"files":{"CONTRIBUTING.md":"0e64fb3dd5a00e3fd528de6442de3f2ca851bd718c45cca0871aaf4eedac9ee1","COPYRIGHT":"2fd0d7e90bd241b79804de129c5b70089988f82a7bbb0fe580a55b67b2968928","Cargo.toml":"ab767659696eb10dbaab743b566910bd29fc8f8f6998d9580494397a8903bd34","LICENSE-APACHE":"cfc7749b96f63bd31c3c42b5c471bf756814053e847c10f3eb003417bc523d30","LICENSE-MIT":"4ad721b5b6a3d39ca3e2202f403d897c4a1d42896486dd58963a81f8e64ef61d","README.md":"a6c97d91989aee4c8afed918340ce6287652cbdd6fed833e20f76367c7953db9","src/data.rs":"be48f1486ef9fc264f6cda2e10944b7dcf8ed0a904b53227340a1384803796c7","src/lib.rs":"e6641fd425b374424a2481e0717df6db405fb1781d1ee0f3af74e1bd5ab392b0","src/tld.rs":"295c3c90c60c5bb6edd753b77c261eed10be2d431badda4e02168e740a0f2d7e"},"package":null}

View File

@ -56,7 +56,13 @@ const SHIFT_JIS_SCORE_PER_LEVEL_1_KANJI: i64 = CJK_BASE_SCORE;
const SHIFT_JIS_SCORE_PER_LEVEL_2_KANJI: i64 = CJK_SECONDARY_BASE_SCORE;
const HALF_WIDTH_KATAKANA_PENALTY: i64 = -(CJK_BASE_SCORE * 3);
// Manually calibrated relative to windows-1256 Persian and Urdu
const SHIFT_JIS_INITIAL_HALF_WIDTH_KATAKANA_PENALTY: i64 = -75;
const HALF_WIDTH_KATAKANA_SCORE: i64 = 1;
// Unclear if this is a good idea; seems not harmful, but can't be sure.
const HALF_WIDTH_KATAKANA_VOICING_SCORE: i64 = 10;
const SHIFT_JIS_PUA_PENALTY: i64 = -(CJK_BASE_SCORE * 10); // Should this be larger?
@ -947,6 +953,13 @@ enum LatinCj {
Other,
}
/// Which half-width voicing mark may plausibly come next, judged from the
/// previously decoded character.
///
/// The Shift_JIS and EUC-JP candidates read the stored state and then reset
/// it to `DakutenForbidden` for every decoded character; only specific
/// half-width katakana raise it again. A following U+FF9E or U+FF9F is then
/// scored as plausible or implausible depending on the state that was read.
#[derive(Clone, Copy, PartialEq)]
enum HalfWidthKatakana {
    /// The previous character does not take a voicing mark; a following
    /// U+FF9E or U+FF9F receives the implausibility penalty.
    DakutenForbidden,
    /// The previous character was U+FF73 or in U+FF76..=U+FF84; a following
    /// U+FF9E is rewarded, while U+FF9F is still penalized.
    DakutenAllowed,
    /// The previous character was in U+FF8A..=U+FF8E; a following U+FF9E or
    /// U+FF9F is rewarded.
    DakutenOrHandakutenAllowed,
}
#[derive(PartialEq)]
enum LatinKorean {
AsciiLetter,
@ -1195,7 +1208,8 @@ fn more_problematic_lead(b: u8) -> bool {
struct ShiftJisCandidate {
decoder: Decoder,
non_ascii_seen: bool,
half_width_katakana_seen: bool,
half_width_katakana_state: HalfWidthKatakana,
prev: LatinCj,
prev_byte: u8,
pending_score: Option<i64>,
@ -1222,13 +1236,9 @@ impl ShiftJisCandidate {
.decoder
.decode_to_utf16_without_replacement(&src, &mut dst, false);
if written > 0 {
let half_width_katakana_state = self.half_width_katakana_state;
self.half_width_katakana_state = HalfWidthKatakana::DakutenForbidden;
let u = dst[0];
if !self.non_ascii_seen && u >= 0x80 {
self.non_ascii_seen = true;
if u >= 0xFF61 && u <= 0xFF9F {
return None;
}
}
if (u >= u16::from(b'a') && u <= u16::from(b'z'))
|| (u >= u16::from(b'A') && u <= u16::from(b'Z'))
{
@ -1238,8 +1248,38 @@ impl ShiftJisCandidate {
}
self.prev = LatinCj::AsciiLetter;
} else if u >= 0xFF61 && u <= 0xFF9F {
if !self.half_width_katakana_seen {
self.half_width_katakana_seen = true;
// To avoid misdetecting title-length inputs
score += SHIFT_JIS_INITIAL_HALF_WIDTH_KATAKANA_PENALTY;
}
self.pending_score = None; // Discard pending score
score += HALF_WIDTH_KATAKANA_PENALTY;
score += HALF_WIDTH_KATAKANA_SCORE;
if (u >= 0xFF76 && u <= 0xFF84) || u == 0xFF73 {
self.half_width_katakana_state = HalfWidthKatakana::DakutenAllowed;
} else if u >= 0xFF8A && u <= 0xFF8E {
self.half_width_katakana_state =
HalfWidthKatakana::DakutenOrHandakutenAllowed;
} else if u == 0xFF9E {
if half_width_katakana_state == HalfWidthKatakana::DakutenForbidden {
score += IMPLAUSIBILITY_PENALTY;
} else {
score += HALF_WIDTH_KATAKANA_VOICING_SCORE;
}
} else if u == 0xFF9F {
if half_width_katakana_state
!= HalfWidthKatakana::DakutenOrHandakutenAllowed
{
score += IMPLAUSIBILITY_PENALTY;
} else {
score += HALF_WIDTH_KATAKANA_VOICING_SCORE;
}
}
if self.prev == LatinCj::AsciiLetter {
score += CJK_LATIN_ADJACENCY_PENALTY;
}
self.prev = LatinCj::Cj;
} else if u >= 0x3040 && u < 0x3100 {
if let Some(pending) = self.pending_score {
@ -1377,6 +1417,7 @@ impl ShiftJisCandidate {
struct EucJpCandidate {
decoder: Decoder,
non_ascii_seen: bool,
half_width_katakana_state: HalfWidthKatakana,
prev: LatinCj,
prev_byte: u8,
prev_prev_byte: u8,
@ -1393,12 +1434,11 @@ impl EucJpCandidate {
.decoder
.decode_to_utf16_without_replacement(&src, &mut dst, false);
if written > 0 {
let half_width_katakana_state = self.half_width_katakana_state;
self.half_width_katakana_state = HalfWidthKatakana::DakutenForbidden;
let u = dst[0];
if !self.non_ascii_seen && u >= 0x80 {
self.non_ascii_seen = true;
if u >= 0xFF61 && u <= 0xFF9F {
return None;
}
if u >= 0x3040 && u < 0x3100 {
// Remove the kana advantage over initial Big5
// hanzi.
@ -1413,7 +1453,32 @@ impl EucJpCandidate {
}
self.prev = LatinCj::AsciiLetter;
} else if u >= 0xFF61 && u <= 0xFF9F {
score += HALF_WIDTH_KATAKANA_PENALTY;
score += HALF_WIDTH_KATAKANA_SCORE;
if (u >= 0xFF76 && u <= 0xFF84) || u == 0xFF73 {
self.half_width_katakana_state = HalfWidthKatakana::DakutenAllowed;
} else if u >= 0xFF8A && u <= 0xFF8E {
self.half_width_katakana_state =
HalfWidthKatakana::DakutenOrHandakutenAllowed;
} else if u == 0xFF9E {
if half_width_katakana_state == HalfWidthKatakana::DakutenForbidden {
score += IMPLAUSIBILITY_PENALTY;
} else {
score += HALF_WIDTH_KATAKANA_VOICING_SCORE;
}
} else if u == 0xFF9F {
if half_width_katakana_state
!= HalfWidthKatakana::DakutenOrHandakutenAllowed
{
score += IMPLAUSIBILITY_PENALTY;
} else {
score += HALF_WIDTH_KATAKANA_VOICING_SCORE;
}
}
if self.prev == LatinCj::AsciiLetter {
score += CJK_LATIN_ADJACENCY_PENALTY;
}
self.prev = LatinCj::Other;
} else if (u >= 0x3041 && u <= 0x3093) || (u >= 0x30A1 && u <= 0x30F6) {
match u {
@ -2459,7 +2524,8 @@ impl Candidate {
Candidate {
inner: InnerCandidate::Shift(ShiftJisCandidate {
decoder: SHIFT_JIS.new_decoder_without_bom_handling(),
non_ascii_seen: false,
half_width_katakana_seen: false,
half_width_katakana_state: HalfWidthKatakana::DakutenForbidden,
prev: LatinCj::Other,
prev_byte: 0,
pending_score: None,
@ -2473,6 +2539,7 @@ impl Candidate {
inner: InnerCandidate::EucJp(EucJpCandidate {
decoder: EUC_JP.new_decoder_without_bom_handling(),
non_ascii_seen: false,
half_width_katakana_state: HalfWidthKatakana::DakutenForbidden,
prev: LatinCj::Other,
prev_byte: 0,
prev_prev_byte: 0,
@ -3455,6 +3522,11 @@ mod tests {
check(" €9", WINDOWS_1252);
}
#[test]
fn test_shift_jis_half_width_katakana() {
    // Input made only of half-width katakana: the same word five times,
    // including a voiced pair (U+FF84 followed by U+FF9E).
    // NOTE(review): `check` is defined elsewhere in this module; presumably
    // it asserts that the detector guesses the given encoding -- confirm.
    let half_width_text = "ハードウェアハードウェアハードウェアハードウェアハードウェア";
    check(half_width_text, SHIFT_JIS);
}
#[test]
fn test_big5_pua() {
let mut v = Vec::new();