Bug 1731482 - Detect lone windows-1252 copyright sign as windows-1252. r=dminor

Differential Revision: https://phabricator.services.mozilla.com/D134605
This commit is contained in:
Henri Sivonen 2021-12-23 13:14:34 +00:00
Родитель b186181173
Коммит 917c511c5f
6 изменённых файлов: 44 добавлений и 4 удалений

Просмотреть файл

@ -84,7 +84,7 @@ rev = "ed8a4c6f900a90d4dbc1d64b856e61490a1c3570"
[source."https://github.com/hsivonen/chardetng"]
git = "https://github.com/hsivonen/chardetng"
replace-with = "vendored-sources"
rev = "302c995f91f44cf26e77dc4758ad56c3ff0153ad"
rev = "cb8052aaf9c4bca3bea4a579667441cc3ba7537c"
[source."https://github.com/grovesNL/glow"]
git = "https://github.com/grovesNL/glow"

2
Cargo.lock сгенерированный
Просмотреть файл

@ -678,7 +678,7 @@ checksum = "fd16c4719339c4530435d38e511904438d07cce7950afa3718a84ac36c10e89e"
[[package]]
name = "chardetng"
version = "0.1.9"
source = "git+https://github.com/hsivonen/chardetng?rev=302c995f91f44cf26e77dc4758ad56c3ff0153ad#302c995f91f44cf26e77dc4758ad56c3ff0153ad"
source = "git+https://github.com/hsivonen/chardetng?rev=cb8052aaf9c4bca3bea4a579667441cc3ba7537c#cb8052aaf9c4bca3bea4a579667441cc3ba7537c"
dependencies = [
"encoding_rs",
"memchr",

Просмотреть файл

@ -97,7 +97,7 @@ wasm-bindgen = { git = "https://github.com/kvark/dummy-web" }
web-sys = { git = "https://github.com/kvark/dummy-web" }
# Other overrides
chardetng = { git = "https://github.com/hsivonen/chardetng", rev="302c995f91f44cf26e77dc4758ad56c3ff0153ad" }
chardetng = { git = "https://github.com/hsivonen/chardetng", rev="cb8052aaf9c4bca3bea4a579667441cc3ba7537c" }
chardetng_c = { git = "https://github.com/hsivonen/chardetng_c", rev="ed8a4c6f900a90d4dbc1d64b856e61490a1c3570" }
libudev-sys = { path = "dom/webauthn/libudev-sys" }
packed_simd = { git = "https://github.com/hsivonen/packed_simd", rev="8b4bd7d8229660a749dbe419a57ea01df9de5453" }

Просмотреть файл

@ -0,0 +1,14 @@
<!doctype html>
<!-- Encoding detection test. The markup contains no charset declaration,
     and the copyright sign in the paragraph below is its only non-ASCII
     content. The script asserts that document.characterSet ends up as
     windows-1252.
     NOTE(review): presumably the file on disk stores the copyright sign as
     the single byte 0xA9; confirm, and keep it that way when editing. -->
<title>en windows-1252 copyright sign</title>
<script src=/resources/testharness.js></script>
<script src=/resources/testharnessreport.js></script>
<p>Copyright © 2021</p>
<script>
// Completion is signalled by the explicit done() call in the load handler.
setup({explicit_done:true});
// Run the check from the load handler rather than inline.
onload = function() {
test(function() {
// The document must report windows-1252 as its character set.
assert_equals(document.characterSet, "windows-1252", 'Expected windows-1252');
}, "Check detection result");
done();
};
</script>

Просмотреть файл

@ -1 +1 @@
{"files":{"CONTRIBUTING.md":"0e64fb3dd5a00e3fd528de6442de3f2ca851bd718c45cca0871aaf4eedac9ee1","COPYRIGHT":"2fd0d7e90bd241b79804de129c5b70089988f82a7bbb0fe580a55b67b2968928","Cargo.toml":"ab767659696eb10dbaab743b566910bd29fc8f8f6998d9580494397a8903bd34","LICENSE-APACHE":"cfc7749b96f63bd31c3c42b5c471bf756814053e847c10f3eb003417bc523d30","LICENSE-MIT":"4ad721b5b6a3d39ca3e2202f403d897c4a1d42896486dd58963a81f8e64ef61d","README.md":"a6c97d91989aee4c8afed918340ce6287652cbdd6fed833e20f76367c7953db9","src/data.rs":"be48f1486ef9fc264f6cda2e10944b7dcf8ed0a904b53227340a1384803796c7","src/lib.rs":"e6641fd425b374424a2481e0717df6db405fb1781d1ee0f3af74e1bd5ab392b0","src/tld.rs":"295c3c90c60c5bb6edd753b77c261eed10be2d431badda4e02168e740a0f2d7e"},"package":null}
{"files":{"CONTRIBUTING.md":"0e64fb3dd5a00e3fd528de6442de3f2ca851bd718c45cca0871aaf4eedac9ee1","COPYRIGHT":"2fd0d7e90bd241b79804de129c5b70089988f82a7bbb0fe580a55b67b2968928","Cargo.toml":"ab767659696eb10dbaab743b566910bd29fc8f8f6998d9580494397a8903bd34","LICENSE-APACHE":"cfc7749b96f63bd31c3c42b5c471bf756814053e847c10f3eb003417bc523d30","LICENSE-MIT":"4ad721b5b6a3d39ca3e2202f403d897c4a1d42896486dd58963a81f8e64ef61d","README.md":"a6c97d91989aee4c8afed918340ce6287652cbdd6fed833e20f76367c7953db9","src/data.rs":"be48f1486ef9fc264f6cda2e10944b7dcf8ed0a904b53227340a1384803796c7","src/lib.rs":"8c632df34c983e90edc814233abb6a7e79ea76d56e6e8874f8d33379057edf63","src/tld.rs":"295c3c90c60c5bb6edd753b77c261eed10be2d431badda4e02168e740a0f2d7e"},"package":null}

26
third_party/rust/chardetng/src/lib.rs поставляемый
Просмотреть файл

@ -37,6 +37,15 @@ const IMPLAUSIBILITY_PENALTY: i64 = -220;
const ORDINAL_BONUS: i64 = 300;
/// Score bonus added when the candidate is in `OrdinalState::Copyright`
/// (entered on `b == 0xA9`) and the next byte is an ASCII space, i.e.
/// for the " © " pattern.
///
/// Must match the ISO-8859-2 score for " Š " (byte 0xA9 decodes as Š
/// in ISO-8859-2). Note: There
/// are four Slovenian Wikipedia list page titles where the
/// list is split by letter so that Š stands alone for the
/// list part for Š. Let's assume that's a special case not
/// worth detecting even though the copyright sign detection
/// makes Slovenian title detection round to one percentage
/// point worse.
const COPYRIGHT_BONUS: i64 = 222;
const IMPLAUSIBLE_LATIN_CASE_TRANSITION_PENALTY: i64 = -180;
const NON_LATIN_CAPITALIZATION_BONUS: i64 = 40;
@ -334,6 +343,7 @@ enum OrdinalState {
FeminineAbbreviationStartLetter,
Digit,
Roman,
Copyright,
}
struct LatinCandidate {
@ -436,6 +446,7 @@ impl LatinCandidate {
// * " Xª " (Italian, where X is a small Roman numeral)
// * " Nº1" (Italian, where 1 is an ASCII digit)
// * " Nº " (Italian)
// * " © " (otherwise ASCII-only)
// which are problematic to deal with by pairwise scoring
// without messing up Romanian detection.
// Initial sc
@ -464,6 +475,8 @@ impl LatinCandidate {
/* X */
{
self.ordinal_state = OrdinalState::Roman;
} else if b == 0xA9 {
self.ordinal_state = OrdinalState::Copyright;
} else {
self.ordinal_state = OrdinalState::Other;
}
@ -579,6 +592,14 @@ impl LatinCandidate {
self.ordinal_state = OrdinalState::Other;
}
}
OrdinalState::Copyright => {
if b == b' ' {
score += COPYRIGHT_BONUS;
self.ordinal_state = OrdinalState::Space;
} else {
self.ordinal_state = OrdinalState::Other;
}
}
}
}
@ -3447,6 +3468,11 @@ mod tests {
check("\u{A0}\u{A0}", WINDOWS_1252);
}
#[test]
// A lone copyright sign with an ASCII space on each side is expected to
// yield windows-1252.
// NOTE(review): `check` is defined outside this view; presumably it runs
// the detector on the input and compares the guess with the second
// argument. Confirm against the helper.
fn test_space_copyright_space() {
check(" © ", WINDOWS_1252);
}
#[test]
fn test_space_masculine_space() {
check(" º ", WINDOWS_1252);