mirror of
https://github.com/mozilla/gecko-dev.git
synced 2024-11-23 12:51:06 +00:00
Bug 1706862 - Make chardetng detect half-width katakana. r=emk
Differential Revision: https://phabricator.services.mozilla.com/D115209
This commit is contained in:
parent
2a324428d6
commit
bd9a21c236
@ -90,7 +90,7 @@ rev = "ed8a4c6f900a90d4dbc1d64b856e61490a1c3570"
|
||||
[source."https://github.com/hsivonen/chardetng"]
|
||||
git = "https://github.com/hsivonen/chardetng"
|
||||
replace-with = "vendored-sources"
|
||||
rev = "39f95e2f7cd6e632a379cdeee62c68e8cedd7810"
|
||||
rev = "302c995f91f44cf26e77dc4758ad56c3ff0153ad"
|
||||
|
||||
[source."https://github.com/gfx-rs/naga"]
|
||||
git = "https://github.com/gfx-rs/naga"
|
||||
|
2
Cargo.lock
generated
2
Cargo.lock
generated
@ -571,7 +571,7 @@ checksum = "fd16c4719339c4530435d38e511904438d07cce7950afa3718a84ac36c10e89e"
|
||||
[[package]]
|
||||
name = "chardetng"
|
||||
version = "0.1.9"
|
||||
source = "git+https://github.com/hsivonen/chardetng?rev=39f95e2f7cd6e632a379cdeee62c68e8cedd7810#39f95e2f7cd6e632a379cdeee62c68e8cedd7810"
|
||||
source = "git+https://github.com/hsivonen/chardetng?rev=302c995f91f44cf26e77dc4758ad56c3ff0153ad#302c995f91f44cf26e77dc4758ad56c3ff0153ad"
|
||||
dependencies = [
|
||||
"encoding_rs",
|
||||
"memchr",
|
||||
|
@ -75,7 +75,7 @@ opt-level = 2
|
||||
opt-level = 2
|
||||
|
||||
[patch.crates-io]
|
||||
chardetng = { git = "https://github.com/hsivonen/chardetng", rev="39f95e2f7cd6e632a379cdeee62c68e8cedd7810" }
|
||||
chardetng = { git = "https://github.com/hsivonen/chardetng", rev="302c995f91f44cf26e77dc4758ad56c3ff0153ad" }
|
||||
chardetng_c = { git = "https://github.com/hsivonen/chardetng_c", rev="ed8a4c6f900a90d4dbc1d64b856e61490a1c3570" }
|
||||
libudev-sys = { path = "dom/webauthn/libudev-sys" }
|
||||
packed_simd = { git = "https://github.com/hsivonen/packed_simd", rev="0917fe780032a6bbb23d71be545f9c1834128d75" }
|
||||
|
@ -0,0 +1,16 @@
|
||||
<!doctype html>
|
||||
<meta charset="utf-8">
|
||||
<title>ja Shift_JIS late</title>
|
||||
<script src=/resources/testharness.js></script>
|
||||
<script src=/resources/testharnessreport.js></script>
|
||||
<!--
  Flow of the script below:
  1. explicit_done is requested because the result arrives asynchronously.
  2. support/ja-half-width-late.sub.html is opened in a new window.
  3. When a message arrives, its payload must equal the string "Shift_JIS".
     Presumably the opened page posts its document.characterSet back to this
     window (that support file is not visible here; confirm against it).
  4. The helper window is closed and done() ends the test.
  The handler is installed before `w` is assigned; that is fine because `var`
  declarations are hoisted and the handler only runs once a message arrives.
-->
<script>
|
||||
setup({explicit_done:true});
|
||||
window.onmessage = function(e) {
|
||||
test(function() {
|
||||
assert_equals(e.data, "Shift_JIS", 'Expected Shift_JIS');
|
||||
}, "Check detection result");
|
||||
w.close();
|
||||
done();
|
||||
};
|
||||
var w = window.open("support/ja-half-width-late.sub.html");
|
||||
</script>
|
@ -0,0 +1,14 @@
|
||||
<!doctype html>
|
||||
<title>ja Shift_JIS half-width katakana</title>
|
||||
<script src=/resources/testharness.js></script>
|
||||
<script src=/resources/testharnessreport.js></script>
|
||||
<p>Ê°ÄÞ³ª±Ê°ÄÞ³ª±Ê°ÄÞ³ª±Ê°ÄÞ³ª±Ê°ÄÞ³ª±</p>
|
||||
<!--
  The markup of this document declares no charset, so document.characterSet is
  presumably the outcome of encoding detection on the paragraph above (confirm
  that no charset is supplied out of band, e.g. by an HTTP header).
  The test passes only when that value is "Shift_JIS" once the load event fires.
  Keep everything outside the paragraph ASCII-only: any other non-ASCII bytes
  would change the input the detector sees.
-->
<script>
|
||||
setup({explicit_done:true});
|
||||
onload = function() {
|
||||
test(function() {
|
||||
assert_equals(document.characterSet, "Shift_JIS", 'Expected Shift_JIS');
|
||||
}, "Check detection result");
|
||||
done();
|
||||
};
|
||||
</script>
|
File diff suppressed because one or more lines are too long
@ -1 +1 @@
|
||||
{"files":{"CONTRIBUTING.md":"0e64fb3dd5a00e3fd528de6442de3f2ca851bd718c45cca0871aaf4eedac9ee1","COPYRIGHT":"2fd0d7e90bd241b79804de129c5b70089988f82a7bbb0fe580a55b67b2968928","Cargo.toml":"ab767659696eb10dbaab743b566910bd29fc8f8f6998d9580494397a8903bd34","LICENSE-APACHE":"cfc7749b96f63bd31c3c42b5c471bf756814053e847c10f3eb003417bc523d30","LICENSE-MIT":"4ad721b5b6a3d39ca3e2202f403d897c4a1d42896486dd58963a81f8e64ef61d","README.md":"a6c97d91989aee4c8afed918340ce6287652cbdd6fed833e20f76367c7953db9","src/data.rs":"be48f1486ef9fc264f6cda2e10944b7dcf8ed0a904b53227340a1384803796c7","src/lib.rs":"456f288602cf7f5f490b2c6541df500db0f53df889a5a4987ae361702afa48bc","src/tld.rs":"295c3c90c60c5bb6edd753b77c261eed10be2d431badda4e02168e740a0f2d7e"},"package":null}
|
||||
{"files":{"CONTRIBUTING.md":"0e64fb3dd5a00e3fd528de6442de3f2ca851bd718c45cca0871aaf4eedac9ee1","COPYRIGHT":"2fd0d7e90bd241b79804de129c5b70089988f82a7bbb0fe580a55b67b2968928","Cargo.toml":"ab767659696eb10dbaab743b566910bd29fc8f8f6998d9580494397a8903bd34","LICENSE-APACHE":"cfc7749b96f63bd31c3c42b5c471bf756814053e847c10f3eb003417bc523d30","LICENSE-MIT":"4ad721b5b6a3d39ca3e2202f403d897c4a1d42896486dd58963a81f8e64ef61d","README.md":"a6c97d91989aee4c8afed918340ce6287652cbdd6fed833e20f76367c7953db9","src/data.rs":"be48f1486ef9fc264f6cda2e10944b7dcf8ed0a904b53227340a1384803796c7","src/lib.rs":"e6641fd425b374424a2481e0717df6db405fb1781d1ee0f3af74e1bd5ab392b0","src/tld.rs":"295c3c90c60c5bb6edd753b77c261eed10be2d431badda4e02168e740a0f2d7e"},"package":null}
|
100
third_party/rust/chardetng/src/lib.rs
vendored
100
third_party/rust/chardetng/src/lib.rs
vendored
@ -56,7 +56,13 @@ const SHIFT_JIS_SCORE_PER_LEVEL_1_KANJI: i64 = CJK_BASE_SCORE;
|
||||
|
||||
const SHIFT_JIS_SCORE_PER_LEVEL_2_KANJI: i64 = CJK_SECONDARY_BASE_SCORE;
|
||||
|
||||
// Three times CJK_BASE_SCORE, negated. Added to the score for a decoded unit in
// U+FF61..=U+FF9F wherever it is referenced.
// NOTE(review): this listing shows removed and added diff lines together;
// confirm whether this constant is still referenced after the change.
const HALF_WIDTH_KATAKANA_PENALTY: i64 = -(CJK_BASE_SCORE * 3);
|
||||
// Manually calibrated relative to windows-1256 Persian and Urdu
// One-time penalty: the Shift_JIS candidate adds it only for the first
// half-width katakana unit it sees, to avoid misdetecting title-length inputs.
|
||||
const SHIFT_JIS_INITIAL_HALF_WIDTH_KATAKANA_PENALTY: i64 = -75;
|
||||
|
||||
// Added to the score for each decoded unit in U+FF61..=U+FF9F by both the
// Shift_JIS and the EUC-JP candidate.
const HALF_WIDTH_KATAKANA_SCORE: i64 = 1;
|
||||
|
||||
// Unclear if this is a good idea; seems not harmful, but can't be sure.
// Added when U+FF9E or U+FF9F follows a unit whose recorded
// HalfWidthKatakana state allows that mark; otherwise IMPLAUSIBILITY_PENALTY
// is added instead.
|
||||
const HALF_WIDTH_KATAKANA_VOICING_SCORE: i64 = 10;
|
||||
|
||||
const SHIFT_JIS_PUA_PENALTY: i64 = -(CJK_BASE_SCORE * 10); // Should this be larger?
|
||||
|
||||
@ -947,6 +953,13 @@ enum LatinCj {
|
||||
Other,
|
||||
}
|
||||
|
||||
/// State carried from one decoded code unit to the next, recording which
/// half-width voicing marks (U+FF9E, U+FF9F) may plausibly follow the unit
/// just seen. The Shift_JIS and EUC-JP candidates copy the current value and
/// reset the field to `DakutenForbidden` before examining each decoded unit.
#[derive(PartialEq, Copy, Clone)]
|
||||
enum HalfWidthKatakana {
|
||||
/// The previous unit accepts neither mark: U+FF9E or U+FF9F arriving in
/// this state adds IMPLAUSIBILITY_PENALTY.
DakutenForbidden,
|
||||
/// The previous unit was U+FF73 or in U+FF76..=U+FF84: a following U+FF9E
/// adds HALF_WIDTH_KATAKANA_VOICING_SCORE, a following U+FF9F is penalized.
DakutenAllowed,
|
||||
/// The previous unit was in U+FF8A..=U+FF8E: a following U+FF9E or U+FF9F
/// adds HALF_WIDTH_KATAKANA_VOICING_SCORE.
DakutenOrHandakutenAllowed,
|
||||
}
|
||||
|
||||
#[derive(PartialEq)]
|
||||
enum LatinKorean {
|
||||
AsciiLetter,
|
||||
@ -1195,7 +1208,8 @@ fn more_problematic_lead(b: u8) -> bool {
|
||||
|
||||
struct ShiftJisCandidate {
|
||||
decoder: Decoder,
|
||||
non_ascii_seen: bool,
|
||||
half_width_katakana_seen: bool,
|
||||
half_width_katakana_state: HalfWidthKatakana,
|
||||
prev: LatinCj,
|
||||
prev_byte: u8,
|
||||
pending_score: Option<i64>,
|
||||
@ -1222,13 +1236,9 @@ impl ShiftJisCandidate {
|
||||
.decoder
|
||||
.decode_to_utf16_without_replacement(&src, &mut dst, false);
|
||||
if written > 0 {
|
||||
let half_width_katakana_state = self.half_width_katakana_state;
|
||||
self.half_width_katakana_state = HalfWidthKatakana::DakutenForbidden;
|
||||
let u = dst[0];
|
||||
if !self.non_ascii_seen && u >= 0x80 {
|
||||
self.non_ascii_seen = true;
|
||||
if u >= 0xFF61 && u <= 0xFF9F {
|
||||
return None;
|
||||
}
|
||||
}
|
||||
if (u >= u16::from(b'a') && u <= u16::from(b'z'))
|
||||
|| (u >= u16::from(b'A') && u <= u16::from(b'Z'))
|
||||
{
|
||||
@ -1238,8 +1248,38 @@ impl ShiftJisCandidate {
|
||||
}
|
||||
self.prev = LatinCj::AsciiLetter;
|
||||
} else if u >= 0xFF61 && u <= 0xFF9F {
|
||||
if !self.half_width_katakana_seen {
|
||||
self.half_width_katakana_seen = true;
|
||||
// To avoid misdetecting title-length inputs
|
||||
score += SHIFT_JIS_INITIAL_HALF_WIDTH_KATAKANA_PENALTY;
|
||||
}
|
||||
self.pending_score = None; // Discard pending score
|
||||
score += HALF_WIDTH_KATAKANA_PENALTY;
|
||||
score += HALF_WIDTH_KATAKANA_SCORE;
|
||||
|
||||
if (u >= 0xFF76 && u <= 0xFF84) || u == 0xFF73 {
|
||||
self.half_width_katakana_state = HalfWidthKatakana::DakutenAllowed;
|
||||
} else if u >= 0xFF8A && u <= 0xFF8E {
|
||||
self.half_width_katakana_state =
|
||||
HalfWidthKatakana::DakutenOrHandakutenAllowed;
|
||||
} else if u == 0xFF9E {
|
||||
if half_width_katakana_state == HalfWidthKatakana::DakutenForbidden {
|
||||
score += IMPLAUSIBILITY_PENALTY;
|
||||
} else {
|
||||
score += HALF_WIDTH_KATAKANA_VOICING_SCORE;
|
||||
}
|
||||
} else if u == 0xFF9F {
|
||||
if half_width_katakana_state
|
||||
!= HalfWidthKatakana::DakutenOrHandakutenAllowed
|
||||
{
|
||||
score += IMPLAUSIBILITY_PENALTY;
|
||||
} else {
|
||||
score += HALF_WIDTH_KATAKANA_VOICING_SCORE;
|
||||
}
|
||||
}
|
||||
|
||||
if self.prev == LatinCj::AsciiLetter {
|
||||
score += CJK_LATIN_ADJACENCY_PENALTY;
|
||||
}
|
||||
self.prev = LatinCj::Cj;
|
||||
} else if u >= 0x3040 && u < 0x3100 {
|
||||
if let Some(pending) = self.pending_score {
|
||||
@ -1377,6 +1417,7 @@ impl ShiftJisCandidate {
|
||||
struct EucJpCandidate {
|
||||
decoder: Decoder,
|
||||
non_ascii_seen: bool,
|
||||
half_width_katakana_state: HalfWidthKatakana,
|
||||
prev: LatinCj,
|
||||
prev_byte: u8,
|
||||
prev_prev_byte: u8,
|
||||
@ -1393,12 +1434,11 @@ impl EucJpCandidate {
|
||||
.decoder
|
||||
.decode_to_utf16_without_replacement(&src, &mut dst, false);
|
||||
if written > 0 {
|
||||
let half_width_katakana_state = self.half_width_katakana_state;
|
||||
self.half_width_katakana_state = HalfWidthKatakana::DakutenForbidden;
|
||||
let u = dst[0];
|
||||
if !self.non_ascii_seen && u >= 0x80 {
|
||||
self.non_ascii_seen = true;
|
||||
if u >= 0xFF61 && u <= 0xFF9F {
|
||||
return None;
|
||||
}
|
||||
if u >= 0x3040 && u < 0x3100 {
|
||||
// Remove the kana advantage over initial Big5
|
||||
// hanzi.
|
||||
@ -1413,7 +1453,32 @@ impl EucJpCandidate {
|
||||
}
|
||||
self.prev = LatinCj::AsciiLetter;
|
||||
} else if u >= 0xFF61 && u <= 0xFF9F {
|
||||
score += HALF_WIDTH_KATAKANA_PENALTY;
|
||||
score += HALF_WIDTH_KATAKANA_SCORE;
|
||||
|
||||
if (u >= 0xFF76 && u <= 0xFF84) || u == 0xFF73 {
|
||||
self.half_width_katakana_state = HalfWidthKatakana::DakutenAllowed;
|
||||
} else if u >= 0xFF8A && u <= 0xFF8E {
|
||||
self.half_width_katakana_state =
|
||||
HalfWidthKatakana::DakutenOrHandakutenAllowed;
|
||||
} else if u == 0xFF9E {
|
||||
if half_width_katakana_state == HalfWidthKatakana::DakutenForbidden {
|
||||
score += IMPLAUSIBILITY_PENALTY;
|
||||
} else {
|
||||
score += HALF_WIDTH_KATAKANA_VOICING_SCORE;
|
||||
}
|
||||
} else if u == 0xFF9F {
|
||||
if half_width_katakana_state
|
||||
!= HalfWidthKatakana::DakutenOrHandakutenAllowed
|
||||
{
|
||||
score += IMPLAUSIBILITY_PENALTY;
|
||||
} else {
|
||||
score += HALF_WIDTH_KATAKANA_VOICING_SCORE;
|
||||
}
|
||||
}
|
||||
|
||||
if self.prev == LatinCj::AsciiLetter {
|
||||
score += CJK_LATIN_ADJACENCY_PENALTY;
|
||||
}
|
||||
self.prev = LatinCj::Other;
|
||||
} else if (u >= 0x3041 && u <= 0x3093) || (u >= 0x30A1 && u <= 0x30F6) {
|
||||
match u {
|
||||
@ -2459,7 +2524,8 @@ impl Candidate {
|
||||
Candidate {
|
||||
inner: InnerCandidate::Shift(ShiftJisCandidate {
|
||||
decoder: SHIFT_JIS.new_decoder_without_bom_handling(),
|
||||
non_ascii_seen: false,
|
||||
half_width_katakana_seen: false,
|
||||
half_width_katakana_state: HalfWidthKatakana::DakutenForbidden,
|
||||
prev: LatinCj::Other,
|
||||
prev_byte: 0,
|
||||
pending_score: None,
|
||||
@ -2473,6 +2539,7 @@ impl Candidate {
|
||||
inner: InnerCandidate::EucJp(EucJpCandidate {
|
||||
decoder: EUC_JP.new_decoder_without_bom_handling(),
|
||||
non_ascii_seen: false,
|
||||
half_width_katakana_state: HalfWidthKatakana::DakutenForbidden,
|
||||
prev: LatinCj::Other,
|
||||
prev_byte: 0,
|
||||
prev_prev_byte: 0,
|
||||
@ -3455,6 +3522,11 @@ mod tests {
|
||||
check(" €9", WINDOWS_1252);
|
||||
}
|
||||
|
||||
/// Text consisting solely of half-width katakana must be detected as Shift_JIS.
///
/// The input is the word "ﾊｰﾄﾞｳｪｱ" repeated five times, matching the byte
/// sequence used by the accompanying web-platform test fixture.
#[test]
fn test_shift_jis_half_width_katakana() {
    // The literal must stay in the half-width block (U+FF61..=U+FF9F),
    // including the separate voicing mark U+FF9E after U+FF84. A full-width
    // spelling would not reach the half-width katakana scoring path that this
    // test exists to cover.
    check("ﾊｰﾄﾞｳｪｱﾊｰﾄﾞｳｪｱﾊｰﾄﾞｳｪｱﾊｰﾄﾞｳｪｱﾊｰﾄﾞｳｪｱ", SHIFT_JIS);
}
|
||||
|
||||
#[test]
|
||||
fn test_big5_pua() {
|
||||
let mut v = Vec::new();
|
||||
|
Loading…
Reference in New Issue
Block a user