зеркало из https://github.com/mozilla/gecko-dev.git
Bug 1731482 - Detect lone windows-1252 copyright sign as windows-1252. r=dminor
Differential Revision: https://phabricator.services.mozilla.com/D134605
This commit is contained in:
Родитель
b186181173
Коммит
917c511c5f
|
@ -84,7 +84,7 @@ rev = "ed8a4c6f900a90d4dbc1d64b856e61490a1c3570"
|
|||
[source."https://github.com/hsivonen/chardetng"]
|
||||
git = "https://github.com/hsivonen/chardetng"
|
||||
replace-with = "vendored-sources"
|
||||
rev = "302c995f91f44cf26e77dc4758ad56c3ff0153ad"
|
||||
rev = "cb8052aaf9c4bca3bea4a579667441cc3ba7537c"
|
||||
|
||||
[source."https://github.com/grovesNL/glow"]
|
||||
git = "https://github.com/grovesNL/glow"
|
||||
|
|
|
@ -678,7 +678,7 @@ checksum = "fd16c4719339c4530435d38e511904438d07cce7950afa3718a84ac36c10e89e"
|
|||
[[package]]
|
||||
name = "chardetng"
|
||||
version = "0.1.9"
|
||||
source = "git+https://github.com/hsivonen/chardetng?rev=302c995f91f44cf26e77dc4758ad56c3ff0153ad#302c995f91f44cf26e77dc4758ad56c3ff0153ad"
|
||||
source = "git+https://github.com/hsivonen/chardetng?rev=cb8052aaf9c4bca3bea4a579667441cc3ba7537c#cb8052aaf9c4bca3bea4a579667441cc3ba7537c"
|
||||
dependencies = [
|
||||
"encoding_rs",
|
||||
"memchr",
|
||||
|
|
|
@ -97,7 +97,7 @@ wasm-bindgen = { git = "https://github.com/kvark/dummy-web" }
|
|||
web-sys = { git = "https://github.com/kvark/dummy-web" }
|
||||
|
||||
# Other overrides
|
||||
chardetng = { git = "https://github.com/hsivonen/chardetng", rev="302c995f91f44cf26e77dc4758ad56c3ff0153ad" }
|
||||
chardetng = { git = "https://github.com/hsivonen/chardetng", rev="cb8052aaf9c4bca3bea4a579667441cc3ba7537c" }
|
||||
chardetng_c = { git = "https://github.com/hsivonen/chardetng_c", rev="ed8a4c6f900a90d4dbc1d64b856e61490a1c3570" }
|
||||
libudev-sys = { path = "dom/webauthn/libudev-sys" }
|
||||
packed_simd = { git = "https://github.com/hsivonen/packed_simd", rev="8b4bd7d8229660a749dbe419a57ea01df9de5453" }
|
||||
|
|
|
@ -0,0 +1,14 @@
|
|||
<!doctype html>
|
||||
<title>en windows-1252 copyright sign</title>
|
||||
<script src=/resources/testharness.js></script>
|
||||
<script src=/resources/testharnessreport.js></script>
|
||||
<p>Copyright © 2021</p>
|
||||
<script>
|
||||
setup({explicit_done:true});
|
||||
onload = function() {
|
||||
test(function() {
|
||||
assert_equals(document.characterSet, "windows-1252", 'Expected windows-1252');
|
||||
}, "Check detection result");
|
||||
done();
|
||||
};
|
||||
</script>
|
|
@ -1 +1 @@
|
|||
{"files":{"CONTRIBUTING.md":"0e64fb3dd5a00e3fd528de6442de3f2ca851bd718c45cca0871aaf4eedac9ee1","COPYRIGHT":"2fd0d7e90bd241b79804de129c5b70089988f82a7bbb0fe580a55b67b2968928","Cargo.toml":"ab767659696eb10dbaab743b566910bd29fc8f8f6998d9580494397a8903bd34","LICENSE-APACHE":"cfc7749b96f63bd31c3c42b5c471bf756814053e847c10f3eb003417bc523d30","LICENSE-MIT":"4ad721b5b6a3d39ca3e2202f403d897c4a1d42896486dd58963a81f8e64ef61d","README.md":"a6c97d91989aee4c8afed918340ce6287652cbdd6fed833e20f76367c7953db9","src/data.rs":"be48f1486ef9fc264f6cda2e10944b7dcf8ed0a904b53227340a1384803796c7","src/lib.rs":"e6641fd425b374424a2481e0717df6db405fb1781d1ee0f3af74e1bd5ab392b0","src/tld.rs":"295c3c90c60c5bb6edd753b77c261eed10be2d431badda4e02168e740a0f2d7e"},"package":null}
|
||||
{"files":{"CONTRIBUTING.md":"0e64fb3dd5a00e3fd528de6442de3f2ca851bd718c45cca0871aaf4eedac9ee1","COPYRIGHT":"2fd0d7e90bd241b79804de129c5b70089988f82a7bbb0fe580a55b67b2968928","Cargo.toml":"ab767659696eb10dbaab743b566910bd29fc8f8f6998d9580494397a8903bd34","LICENSE-APACHE":"cfc7749b96f63bd31c3c42b5c471bf756814053e847c10f3eb003417bc523d30","LICENSE-MIT":"4ad721b5b6a3d39ca3e2202f403d897c4a1d42896486dd58963a81f8e64ef61d","README.md":"a6c97d91989aee4c8afed918340ce6287652cbdd6fed833e20f76367c7953db9","src/data.rs":"be48f1486ef9fc264f6cda2e10944b7dcf8ed0a904b53227340a1384803796c7","src/lib.rs":"8c632df34c983e90edc814233abb6a7e79ea76d56e6e8874f8d33379057edf63","src/tld.rs":"295c3c90c60c5bb6edd753b77c261eed10be2d431badda4e02168e740a0f2d7e"},"package":null}
|
|
@ -37,6 +37,15 @@ const IMPLAUSIBILITY_PENALTY: i64 = -220;
|
|||
|
||||
const ORDINAL_BONUS: i64 = 300;
|
||||
|
||||
/// Must match the ISO-8859-2 score for " Š ". Note: There
|
||||
/// are four Slovenian Wikipedia list page titles where the
|
||||
/// list is split by letter so that Š stands alone for the
|
||||
/// list part for Š. Let's assume that's a special case not
|
||||
/// worth detecting even though the copyright sign detection
|
||||
/// makes Slovenian title detection round to one percentage
|
||||
/// point worse.
|
||||
const COPYRIGHT_BONUS: i64 = 222;
|
||||
|
||||
const IMPLAUSIBLE_LATIN_CASE_TRANSITION_PENALTY: i64 = -180;
|
||||
|
||||
const NON_LATIN_CAPITALIZATION_BONUS: i64 = 40;
|
||||
|
@ -334,6 +343,7 @@ enum OrdinalState {
|
|||
FeminineAbbreviationStartLetter,
|
||||
Digit,
|
||||
Roman,
|
||||
Copyright,
|
||||
}
|
||||
|
||||
struct LatinCandidate {
|
||||
|
@ -436,6 +446,7 @@ impl LatinCandidate {
|
|||
// * " Xª " (Italian, where X is a small Roman numeral)
|
||||
// * " Nº1" (Italian, where 1 is an ASCII digit)
|
||||
// * " Nº " (Italian)
|
||||
// * " © " (otherwise ASCII-only)
|
||||
// which are problematic to deal with by pairwise scoring
|
||||
// without messing up Romanian detection.
|
||||
// Initial sc
|
||||
|
@ -464,6 +475,8 @@ impl LatinCandidate {
|
|||
/* X */
|
||||
{
|
||||
self.ordinal_state = OrdinalState::Roman;
|
||||
} else if b == 0xA9 {
|
||||
self.ordinal_state = OrdinalState::Copyright;
|
||||
} else {
|
||||
self.ordinal_state = OrdinalState::Other;
|
||||
}
|
||||
|
@ -579,6 +592,14 @@ impl LatinCandidate {
|
|||
self.ordinal_state = OrdinalState::Other;
|
||||
}
|
||||
}
|
||||
OrdinalState::Copyright => {
|
||||
if b == b' ' {
|
||||
score += COPYRIGHT_BONUS;
|
||||
self.ordinal_state = OrdinalState::Space;
|
||||
} else {
|
||||
self.ordinal_state = OrdinalState::Other;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -3447,6 +3468,11 @@ mod tests {
|
|||
check("\u{A0}\u{A0}", WINDOWS_1252);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_space_copyright_space() {
|
||||
check(" © ", WINDOWS_1252);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_space_masculine_space() {
|
||||
check(" º ", WINDOWS_1252);
|
||||
|
|
Загрузка…
Ссылка в новой задаче