Bug 1495067 - Make HasRTLChars() consider Hebrew presentation forms as RTL (again) and not consider U+FEFF as RTL (again). r=jfkthame

* Update encoding_rs to 0.8.8.
 * Change U+FEFD and U+FEFE to RTL in IS_RTL_PRESENTATION_FORM to make the
   Rust and C++ code agree on what's RTL.

MozReview-Commit-ID: CuK6fN4pojG

Differential Revision: https://phabricator.services.mozilla.com/D7285

--HG--
extra : moz-landing-system : lando
This commit is contained in:
Henri Sivonen 2018-10-02 14:24:15 +00:00
Родитель e198ffe488
Коммит b8545f2cf8
14 изменённых файлов: 271 добавлений и 74 удалений

10
Cargo.lock сгенерированный
Просмотреть файл

@ -726,21 +726,21 @@ name = "encoding_c"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"encoding_rs 0.8.7 (registry+https://github.com/rust-lang/crates.io-index)",
"encoding_rs 0.8.8 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "encoding_glue"
version = "0.1.0"
dependencies = [
"encoding_rs 0.8.7 (registry+https://github.com/rust-lang/crates.io-index)",
"encoding_rs 0.8.8 (registry+https://github.com/rust-lang/crates.io-index)",
"nserror 0.1.0",
"nsstring 0.1.0",
]
[[package]]
name = "encoding_rs"
version = "0.8.7"
version = "0.8.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"cfg-if 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
@ -1604,7 +1604,7 @@ name = "nsstring"
version = "0.1.0"
dependencies = [
"bitflags 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)",
"encoding_rs 0.8.7 (registry+https://github.com/rust-lang/crates.io-index)",
"encoding_rs 0.8.8 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
@ -3069,7 +3069,7 @@ dependencies = [
"checksum either 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "18785c1ba806c258137c937e44ada9ee7e69a37e3c72077542cd2f069d78562a"
"checksum ena 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)" = "cabe5a5078ac8c506d3e4430763b1ba9b609b1286913e7d08e581d1c2de9b7e5"
"checksum encoding_c 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)" = "769ecb8b33323998e482b218c0d13cd64c267609023b4b7ec3ee740714c318ee"
"checksum encoding_rs 0.8.7 (registry+https://github.com/rust-lang/crates.io-index)" = "21a550ec129ca2f8593227888625c7c5331c6ad878e2cee6b7ac25e1c7d05746"
"checksum encoding_rs 0.8.8 (registry+https://github.com/rust-lang/crates.io-index)" = "cc9945e460ad969220c1061b9574fb02ed097c6f0704ce2f3e336cb443c40c73"
"checksum env_logger 0.5.6 (registry+https://github.com/rust-lang/crates.io-index)" = "0561146661ae44c579e993456bc76d11ce1e0c7d745e57b2fa7146b6e49fa2ad"
"checksum error-chain 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ff511d5dc435d703f4971bc399647c9bc38e20cb41452e3b9feb4765419ed3f3"
"checksum euclid 0.19.0 (registry+https://github.com/rust-lang/crates.io-index)" = "70a2ebdf55fb9d6329046e026329a55ef8fbaae5ea833f56e170beb3125a8a5f"

Просмотреть файл

@ -265,7 +265,7 @@ typedef enum nsCharType nsCharType;
#define IS_IN_BMP_RTL_BLOCK(c) ((0x590 <= (c)) && ((c) <= 0x8ff))
#define IS_RTL_PRESENTATION_FORM(c) (((0xfb1d <= (c)) && ((c) <= 0xfdff)) || \
((0xfe70 <= (c)) && ((c) <= 0xfefc)))
((0xfe70 <= (c)) && ((c) <= 0xfefe)))
#define IS_IN_SMP_RTL_BLOCK(c) (((0x10800 <= (c)) && ((c) <= 0x10fff)) || \
((0x1e800 <= (c)) && ((c) <= 0x1eFFF)))
// Due to the supplementary-plane RTL blocks being identifiable from the

Просмотреть файл

@ -0,0 +1,5 @@
[floating-first-letter-05d0.html]
type: reftest
disabled: https://bugzilla.mozilla.org/show_bug.cgi?id=1495674
expected:
FAIL

Просмотреть файл

@ -0,0 +1,20 @@
<!doctype html>
<!-- Reftest: a floated, enlarged ::first-letter combined with a highlighted
     first line must render like the linked reference even though a
     right-to-left character (U+05D0) appears later in the document. -->
<meta charset=utf-8>
<title>Drop cap with U+05D0 in the document</title>
<meta name="assert" content="The text placement within :first-line should not be affected by later presence of a right-to-left character.">
<link rel=help href=https://drafts.csswg.org/css-pseudo-4/#first-line-styling>
<link rel=match href=/css/selectors/floating-first-letter-ref.html>
<style>
p:first-line {
background: lightblue;
}
p::first-letter {
float: left;
font-size: 4rem;
}
div {
color: transparent;
}
</style>
<!-- The <div> text is painted in a transparent color, so U+05D0 is present
     in the document without drawing a visible glyph. -->
<p>Ab</p><div>&#x05D0;</div>

Просмотреть файл

@ -0,0 +1,17 @@
<!doctype html>
<!-- Reftest: a floated, enlarged ::first-letter combined with a highlighted
     first line must render like the linked reference even though U+FEFF
     appears later in the document. -->
<meta charset=utf-8>
<title>Drop cap with U+FEFF in the document</title>
<meta name="assert" content="The text placement within :first-line should not be affected by later presence of U+FEFF.">
<link rel=help href=https://drafts.csswg.org/css-pseudo-4/#first-line-styling>
<link rel=match href=/css/selectors/floating-first-letter-ref.html>
<style>
p:first-line {
background: lightblue;
}
p::first-letter {
float: left;
font-size: 4rem;
}
</style>
<!-- U+FEFF is written as a character reference directly after the
     paragraph. -->
<p>Ab</p>&#xFEFF;

Просмотреть файл

@ -0,0 +1,14 @@
<!doctype html>
<!-- Baseline document: the same drop-cap and first-line styling applied to
     "Ab", with no other content in the document.
     NOTE(review): presumably this is the floating-first-letter-ref.html that
     the U+05D0 and U+FEFF tests link to with rel=match; confirm against the
     file name. -->
<meta charset=utf-8>
<title>Drop cap with no bidi in the document</title>
<style>
p:first-line {
background: lightblue;
}
p::first-letter {
float: left;
font-size: 4rem;
}
</style>
<p>Ab</p>

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Просмотреть файл

@ -1,5 +1,5 @@
{
"git": {
"sha1": "b67c60025bfebbf186e8b22f03edc9b6dc96df59"
"sha1": "ffe863483bcfeb069edf645738ac3650899d2801"
}
}

2
third_party/rust/encoding_rs/Cargo.toml поставляемый
Просмотреть файл

@ -12,7 +12,7 @@
[package]
name = "encoding_rs"
version = "0.8.7"
version = "0.8.8"
authors = ["Henri Sivonen <hsivonen@hsivonen.fi>"]
description = "A Gecko-oriented implementation of the Encoding Standard"
homepage = "https://docs.rs/encoding_rs/"

8
third_party/rust/encoding_rs/README.md поставляемый
Просмотреть файл

@ -244,6 +244,14 @@ used in Firefox.
## Release Notes
### 0.8.8
* Made the `is_foo_bidi()` functions not treat U+FEFF (ZERO WIDTH NO-BREAK SPACE
aka. BYTE ORDER MARK) as right-to-left.
* Made the `is_foo_bidi()` functions report `true` if the input contains
Hebrew presentation forms (which are right-to-left but not in a
right-to-left-roadmapped block).
### 0.8.7
* Fixed a panic in the UTF-16LE/UTF-16BE decoder when decoding to UTF-8.

2
third_party/rust/encoding_rs/src/lib.rs поставляемый
Просмотреть файл

@ -8,7 +8,7 @@
// except according to those terms.
#![cfg_attr(feature = "cargo-clippy", allow(doc_markdown, inline_always, new_ret_no_self))]
#![doc(html_root_url = "https://docs.rs/encoding_rs/0.8.7")]
#![doc(html_root_url = "https://docs.rs/encoding_rs/0.8.8")]
//! encoding_rs is a Gecko-oriented Free Software / Open Source implementation
//! of the [Encoding Standard](https://encoding.spec.whatwg.org/) in Rust.

203
third_party/rust/encoding_rs/src/mem.rs поставляемый
Просмотреть файл

@ -669,11 +669,14 @@ pub fn is_utf16_latin1(buffer: &[u16]) -> bool {
/// that trigger right-to-left processing.
///
/// The check is done on a Unicode block basis without regard to assigned
/// vs. unassigned code points in the block. Additionally, the four
/// RIGHT-TO-LEFT FOO controls in General Punctuation are checked for.
/// Control characters that are technically bidi controls but do not cause
/// right-to-left behavior without the presence of right-to-left characters
/// or right-to-left controls are not checked for.
/// vs. unassigned code points in the block. Hebrew presentation forms in
/// the Alphabetic Presentation Forms block are treated as if they formed
/// a block on their own (i.e. they are treated as right-to-left). Additionally,
/// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
/// for. Control characters that are technically bidi controls but do not
/// cause right-to-left behavior without the presence of right-to-left
/// characters or right-to-left controls are not checked for. As a special
/// case, U+FEFF is excluded from Arabic Presentation Forms-B.
///
/// Returns `true` if the input is invalid UTF-8 or the input contains an
/// RTL character. Returns `false` if the input is valid UTF-8 and contains
@ -699,15 +702,15 @@ pub fn is_utf8_bidi(buffer: &[u8]) -> bool {
// U+202E: E2 80 AE
// U+2067: E2 81 A7
//
// U+FB4F: EF AD 8F
// U+FB50: EF AD 90
// U+FB1C: EF AC 9C
// U+FB1D: EF AC 9D
// U+FDFF: EF B7 BF
// U+FE00: EF B8 80
//
// U+FE6F: EF B9 AF
// U+FE70: EF B9 B0
// U+FEFE: EF BB BE
// U+FEFF: EF BB BF
// U+FF00: EF BC 80
//
// U+107FF: F0 90 9F BF
// U+10800: F0 90 A0 80
@ -797,9 +800,9 @@ pub fn is_utf8_bidi(buffer: &[u8]) -> bool {
{
return true;
}
if in_inclusive_range8(second, 0xAD, 0xB7) {
if second == 0xAD {
if third > 0x8F {
if in_inclusive_range8(second, 0xAC, 0xB7) {
if second == 0xAC {
if third > 0x9C {
return true;
}
} else {
@ -810,6 +813,10 @@ pub fn is_utf8_bidi(buffer: &[u8]) -> bool {
if third > 0xAF {
return true;
}
} else if second == 0xBB {
if third != 0xBF {
return true;
}
} else {
return true;
}
@ -1013,9 +1020,9 @@ pub fn is_utf8_bidi(buffer: &[u8]) -> bool {
{
return true;
}
if in_inclusive_range8(second, 0xAD, 0xB7) {
if second == 0xAD {
if third > 0x8F {
if in_inclusive_range8(second, 0xAC, 0xB7) {
if second == 0xAC {
if third > 0x9C {
return true;
}
} else {
@ -1026,6 +1033,10 @@ pub fn is_utf8_bidi(buffer: &[u8]) -> bool {
if third > 0xAF {
return true;
}
} else if second == 0xBB {
if third != 0xBF {
return true;
}
} else {
return true;
}
@ -1083,11 +1094,14 @@ pub fn is_utf8_bidi(buffer: &[u8]) -> bool {
/// right-to-left processing.
///
/// The check is done on a Unicode block basis without regard to assigned
/// vs. unassigned code points in the block. Additionally, the four
/// RIGHT-TO-LEFT FOO controls in General Punctuation are checked for.
/// Control characters that are technically bidi controls but do not cause
/// right-to-left behavior without the presence of right-to-left characters
/// or right-to-left controls are not checked for.
/// vs. unassigned code points in the block. Hebrew presentation forms in
/// the Alphabetic Presentation Forms block are treated as if they formed
/// a block on their own (i.e. they are treated as right-to-left). Additionally,
/// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
/// for. Control characters that are technically bidi controls but do not
/// cause right-to-left behavior without the presence of right-to-left
/// characters or right-to-left controls are not checked for. As a special
/// case, U+FEFF is excluded from Arabic Presentation Forms-B.
#[inline]
pub fn is_str_bidi(buffer: &str) -> bool {
// U+058F: D6 8F
@ -1100,15 +1114,15 @@ pub fn is_str_bidi(buffer: &str) -> bool {
// U+202E: E2 80 AE
// U+2067: E2 81 A7
//
// U+FB4F: EF AD 8F
// U+FB50: EF AD 90
// U+FB1C: EF AC 9C
// U+FB1D: EF AC 9D
// U+FDFF: EF B7 BF
// U+FE00: EF B8 80
//
// U+FE6F: EF B9 AF
// U+FE70: EF B9 B0
// U+FEFE: EF BB BE
// U+FEFF: EF BB BF
// U+FF00: EF BC 80
//
// U+107FF: F0 90 9F BF
// U+10800: F0 90 A0 80
@ -1178,10 +1192,10 @@ pub fn is_str_bidi(buffer: &str) -> bool {
}
} else {
debug_assert_eq!(byte, 0xEF);
if in_inclusive_range8(second, 0xAD, 0xB7) {
if second == 0xAD {
if in_inclusive_range8(second, 0xAC, 0xB7) {
if second == 0xAC {
let third = bytes[read + 2];
if third > 0x8F {
if third > 0x9C {
return true;
}
} else {
@ -1193,6 +1207,11 @@ pub fn is_str_bidi(buffer: &str) -> bool {
if third > 0xAF {
return true;
}
} else if second == 0xBB {
let third = bytes[read + 2];
if third != 0xBF {
return true;
}
} else {
return true;
}
@ -1230,11 +1249,14 @@ pub fn is_str_bidi(buffer: &str) -> bool {
/// right-to-left processing.
///
/// The check is done on a Unicode block basis without regard to assigned
/// vs. unassigned code points in the block. Additionally, the four
/// RIGHT-TO-LEFT FOO controls in General Punctuation are checked for.
/// Control characters that are technically bidi controls but do not cause
/// right-to-left behavior without the presence of right-to-left characters
/// or right-to-left controls are not checked for.
/// vs. unassigned code points in the block. Hebrew presentation forms in
/// the Alphabetic Presentation Forms block are treated as if they formed
/// a block on their own (i.e. they are treated as right-to-left). Additionally,
/// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
/// for. Control characters that are technically bidi controls but do not
/// cause right-to-left behavior without the presence of right-to-left
/// characters or right-to-left controls are not checked for. As a special
/// case, U+FEFF is excluded from Arabic Presentation Forms-B.
///
/// Returns `true` if the input contains an RTL character or an unpaired
/// high surrogate that could be the high half of an RTL character.
@ -1248,11 +1270,14 @@ pub fn is_utf16_bidi(buffer: &[u16]) -> bool {
/// Checks whether a code point triggers right-to-left processing.
///
/// The check is done on a Unicode block basis without regard to assigned
/// vs. unassigned code points in the block. Additionally, the four
/// RIGHT-TO-LEFT FOO controls in General Punctuation are checked for.
/// Control characters that are technically bidi controls but do not cause
/// right-to-left behavior without the presence of right-to-left characters
/// or right-to-left controls are not checked for.
/// vs. unassigned code points in the block. Hebrew presentation forms in
/// the Alphabetic Presentation Forms block are treated as if they formed
/// a block on their own (i.e. they are treated as right-to-left). Additionally,
/// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
/// for. Control characters that are technically bidi controls but do not
/// cause right-to-left behavior without the presence of right-to-left
/// characters or right-to-left controls are not checked for. As a special
/// case, U+FEFF is excluded from Arabic Presentation Forms-B.
#[inline(always)]
pub fn is_char_bidi(c: char) -> bool {
// Controls:
@ -1266,8 +1291,9 @@ pub fn is_char_bidi(c: char) -> bool {
// BMP RTL:
// https://www.unicode.org/roadmaps/bmp/
// U+0590...U+08FF
// U+FB50...U+FDFF Arabic Presentation Forms A
// U+FE70...U+FEFF Arabic Presentation Forms B
// U+FB1D...U+FDFF Hebrew presentation forms and
// Arabic Presentation Forms A
// U+FE70...U+FEFE Arabic Presentation Forms B (excl. BOM)
//
// Supplementary RTL:
// https://www.unicode.org/roadmaps/smp/
@ -1278,8 +1304,8 @@ pub fn is_char_bidi(c: char) -> bool {
// Below Hebrew
return false;
}
if in_range32(code_point, 0x0900, 0xFB50) {
// Above Arabic Extended-A and below Arabic Presentation Forms
if in_range32(code_point, 0x0900, 0xFB1D) {
// Above Arabic Extended-A and below Hebrew presentation forms
if in_inclusive_range32(code_point, 0x200F, 0x2067) {
// In the range that contains the RTL controls
return code_point == 0x200F
@ -1297,8 +1323,8 @@ pub fn is_char_bidi(c: char) -> bool {
// Between astral RTL blocks
return false;
}
if in_range32(code_point, 0xFF00, 0x10800) {
// Above Arabic Presentations Forms B and below first
if in_range32(code_point, 0xFEFF, 0x10800) {
// Above Arabic Presentation Forms B (excl. BOM) and below first
// astral RTL
return false;
}
@ -1312,11 +1338,14 @@ pub fn is_char_bidi(c: char) -> bool {
/// Checks whether a UTF-16 code unit triggers right-to-left processing.
///
/// The check is done on a Unicode block basis without regard to assigned
/// vs. unassigned code points in the block. Additionally, the four
/// RIGHT-TO-LEFT FOO controls in General Punctuation are checked for.
/// Control characters that are technically bidi controls but do not cause
/// right-to-left behavior without the presence of right-to-left characters
/// or right-to-left controls are not checked for.
/// vs. unassigned code points in the block. Hebrew presentation forms in
/// the Alphabetic Presentation Forms block are treated as if they formed
/// a block on their own (i.e. they are treated as right-to-left). Additionally,
/// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
/// for. Control characters that are technically bidi controls but do not
/// cause right-to-left behavior without the presence of right-to-left
/// characters or right-to-left controls are not checked for. As a special
/// case, U+FEFF is excluded from Arabic Presentation Forms-B.
///
/// Since supplementary-plane right-to-left blocks are identifiable from the
/// high surrogate without examining the low surrogate, this function returns
@ -1338,8 +1367,8 @@ pub fn is_utf16_code_unit_bidi(u: u16) -> bool {
}
return false;
}
if in_range16(u, 0xD83C, 0xFB50) {
// Between astral RTL high surrogates and Arabic Presentation Forms
if in_range16(u, 0xD83C, 0xFB1D) {
// Between astral RTL high surrogates and Hebrew presentation forms
// (Emoji is here)
return false;
}
@ -1347,8 +1376,8 @@ pub fn is_utf16_code_unit_bidi(u: u16) -> bool {
// Between RTL high surrogates
return false;
}
if u > 0xFEFF {
// Above Arabic Presentation Forms
if u > 0xFEFE {
// Above Arabic Presentation Forms (excl. BOM)
return false;
}
if in_range16(u, 0xFE00, 0xFE70) {
@ -2370,13 +2399,14 @@ mod tests {
assert!(!is_char_bidi('\u{1F4A9}'));
assert!(!is_char_bidi('\u{FE00}'));
assert!(!is_char_bidi('\u{202C}'));
assert!(!is_char_bidi('\u{FEFF}'));
assert!(is_char_bidi('\u{0590}'));
assert!(is_char_bidi('\u{08FF}'));
assert!(is_char_bidi('\u{061C}'));
assert!(is_char_bidi('\u{FB50}'));
assert!(is_char_bidi('\u{FDFF}'));
assert!(is_char_bidi('\u{FE70}'));
assert!(is_char_bidi('\u{FEFF}'));
assert!(is_char_bidi('\u{FEFE}'));
assert!(is_char_bidi('\u{200F}'));
assert!(is_char_bidi('\u{202B}'));
assert!(is_char_bidi('\u{202E}'));
@ -2395,13 +2425,15 @@ mod tests {
assert!(!is_utf16_code_unit_bidi(0xD801));
assert!(!is_utf16_code_unit_bidi(0xFE00));
assert!(!is_utf16_code_unit_bidi(0x202C));
assert!(!is_utf16_code_unit_bidi(0xFEFF));
assert!(is_utf16_code_unit_bidi(0x0590));
assert!(is_utf16_code_unit_bidi(0x08FF));
assert!(is_utf16_code_unit_bidi(0x061C));
assert!(is_utf16_code_unit_bidi(0xFB1D));
assert!(is_utf16_code_unit_bidi(0xFB50));
assert!(is_utf16_code_unit_bidi(0xFDFF));
assert!(is_utf16_code_unit_bidi(0xFE70));
assert!(is_utf16_code_unit_bidi(0xFEFF));
assert!(is_utf16_code_unit_bidi(0xFEFE));
assert!(is_utf16_code_unit_bidi(0x200F));
assert!(is_utf16_code_unit_bidi(0x202B));
assert!(is_utf16_code_unit_bidi(0x202E));
@ -2420,13 +2452,14 @@ mod tests {
assert!(!is_str_bidi("abcdefghijklmnop\u{1F4A9}abcdefghijklmnop"));
assert!(!is_str_bidi("abcdefghijklmnop\u{FE00}abcdefghijklmnop"));
assert!(!is_str_bidi("abcdefghijklmnop\u{202C}abcdefghijklmnop"));
assert!(!is_str_bidi("abcdefghijklmnop\u{FEFF}abcdefghijklmnop"));
assert!(is_str_bidi("abcdefghijklmnop\u{0590}abcdefghijklmnop"));
assert!(is_str_bidi("abcdefghijklmnop\u{08FF}abcdefghijklmnop"));
assert!(is_str_bidi("abcdefghijklmnop\u{061C}abcdefghijklmnop"));
assert!(is_str_bidi("abcdefghijklmnop\u{FB50}abcdefghijklmnop"));
assert!(is_str_bidi("abcdefghijklmnop\u{FDFF}abcdefghijklmnop"));
assert!(is_str_bidi("abcdefghijklmnop\u{FE70}abcdefghijklmnop"));
assert!(is_str_bidi("abcdefghijklmnop\u{FEFF}abcdefghijklmnop"));
assert!(is_str_bidi("abcdefghijklmnop\u{FEFE}abcdefghijklmnop"));
assert!(is_str_bidi("abcdefghijklmnop\u{200F}abcdefghijklmnop"));
assert!(is_str_bidi("abcdefghijklmnop\u{202B}abcdefghijklmnop"));
assert!(is_str_bidi("abcdefghijklmnop\u{202E}abcdefghijklmnop"));
@ -2457,6 +2490,9 @@ mod tests {
assert!(!is_utf8_bidi(
"abcdefghijklmnop\u{202C}abcdefghijklmnop".as_bytes()
));
assert!(!is_utf8_bidi(
"abcdefghijklmnop\u{FEFF}abcdefghijklmnop".as_bytes()
));
assert!(is_utf8_bidi(
"abcdefghijklmnop\u{0590}abcdefghijklmnop".as_bytes()
));
@ -2476,7 +2512,7 @@ mod tests {
"abcdefghijklmnop\u{FE70}abcdefghijklmnop".as_bytes()
));
assert!(is_utf8_bidi(
"abcdefghijklmnop\u{FEFF}abcdefghijklmnop".as_bytes()
"abcdefghijklmnop\u{FEFE}abcdefghijklmnop".as_bytes()
));
assert!(is_utf8_bidi(
"abcdefghijklmnop\u{200F}abcdefghijklmnop".as_bytes()
@ -2530,6 +2566,10 @@ mod tests {
0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x202C, 0x62, 0x63, 0x64, 0x65, 0x66,
0x67, 0x68, 0x69,
]));
assert!(!is_utf16_bidi(&[
0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFEFF, 0x62, 0x63, 0x64, 0x65, 0x66,
0x67, 0x68, 0x69,
]));
assert!(is_utf16_bidi(&[
0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0590, 0x62, 0x63, 0x64, 0x65, 0x66,
0x67, 0x68, 0x69,
@ -2542,6 +2582,10 @@ mod tests {
0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x061C, 0x62, 0x63, 0x64, 0x65, 0x66,
0x67, 0x68, 0x69,
]));
assert!(is_utf16_bidi(&[
0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFB1D, 0x62, 0x63, 0x64, 0x65, 0x66,
0x67, 0x68, 0x69,
]));
assert!(is_utf16_bidi(&[
0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFB50, 0x62, 0x63, 0x64, 0x65, 0x66,
0x67, 0x68, 0x69,
@ -2555,7 +2599,7 @@ mod tests {
0x67, 0x68, 0x69,
]));
assert!(is_utf16_bidi(&[
0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFEFF, 0x62, 0x63, 0x64, 0x65, 0x66,
0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFEFE, 0x62, 0x63, 0x64, 0x65, 0x66,
0x67, 0x68, 0x69,
]));
assert!(is_utf16_bidi(&[
@ -2623,6 +2667,10 @@ mod tests {
check_str_for_latin1_and_bidi("abcdefghijklmnop\u{202C}abcdefghijklmnop"),
Latin1Bidi::Bidi
);
assert_ne!(
check_str_for_latin1_and_bidi("abcdefghijklmnop\u{FEFF}abcdefghijklmnop"),
Latin1Bidi::Bidi
);
assert_eq!(
check_str_for_latin1_and_bidi("abcdefghijklmnop\u{0590}abcdefghijklmnop"),
Latin1Bidi::Bidi
@ -2648,7 +2696,7 @@ mod tests {
Latin1Bidi::Bidi
);
assert_eq!(
check_str_for_latin1_and_bidi("abcdefghijklmnop\u{FEFF}abcdefghijklmnop"),
check_str_for_latin1_and_bidi("abcdefghijklmnop\u{FEFE}abcdefghijklmnop"),
Latin1Bidi::Bidi
);
assert_eq!(
@ -2711,6 +2759,10 @@ mod tests {
check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{202C}abcdefghijklmnop".as_bytes()),
Latin1Bidi::Bidi
);
assert_ne!(
check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{FEFF}abcdefghijklmnop".as_bytes()),
Latin1Bidi::Bidi
);
assert_eq!(
check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{0590}abcdefghijklmnop".as_bytes()),
Latin1Bidi::Bidi
@ -2736,7 +2788,7 @@ mod tests {
Latin1Bidi::Bidi
);
assert_eq!(
check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{FEFF}abcdefghijklmnop".as_bytes()),
check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{FEFE}abcdefghijklmnop".as_bytes()),
Latin1Bidi::Bidi
);
assert_eq!(
@ -2817,6 +2869,13 @@ mod tests {
]),
Latin1Bidi::Bidi
);
assert_ne!(
check_utf16_for_latin1_and_bidi(&[
0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFEFF, 0x62, 0x63, 0x64, 0x65,
0x66, 0x67, 0x68, 0x69,
]),
Latin1Bidi::Bidi
);
assert_eq!(
check_utf16_for_latin1_and_bidi(&[
0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0590, 0x62, 0x63, 0x64, 0x65,
@ -2838,6 +2897,13 @@ mod tests {
]),
Latin1Bidi::Bidi
);
assert_eq!(
check_utf16_for_latin1_and_bidi(&[
0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFB1D, 0x62, 0x63, 0x64, 0x65,
0x66, 0x67, 0x68, 0x69,
]),
Latin1Bidi::Bidi
);
assert_eq!(
check_utf16_for_latin1_and_bidi(&[
0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFB50, 0x62, 0x63, 0x64, 0x65,
@ -2861,7 +2927,7 @@ mod tests {
);
assert_eq!(
check_utf16_for_latin1_and_bidi(&[
0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFEFF, 0x62, 0x63, 0x64, 0x65,
0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFEFE, 0x62, 0x63, 0x64, 0x65,
0x66, 0x67, 0x68, 0x69,
]),
Latin1Bidi::Bidi
@ -2936,8 +3002,8 @@ mod tests {
pub fn reference_is_char_bidi(c: char) -> bool {
match c {
'\u{0590}'...'\u{08FF}'
| '\u{FB50}'...'\u{FDFF}'
| '\u{FE70}'...'\u{FEFF}'
| '\u{FB1D}'...'\u{FDFF}'
| '\u{FE70}'...'\u{FEFE}'
| '\u{10800}'...'\u{10FFF}'
| '\u{1E800}'...'\u{1EFFF}'
| '\u{200F}'
@ -2952,8 +3018,8 @@ mod tests {
pub fn reference_is_utf16_code_unit_bidi(u: u16) -> bool {
match u {
0x0590...0x08FF
| 0xFB50...0xFDFF
| 0xFE70...0xFEFF
| 0xFB1D...0xFDFF
| 0xFE70...0xFEFE
| 0xD802
| 0xD803
| 0xD83A
@ -3049,6 +3115,19 @@ mod tests {
}
}
#[test]
fn test_is_utf16_bidi_thoroughly() {
    // Exhaustively cross-check `is_utf16_bidi` against the per-code-unit
    // reference classifier: each of the 65536 possible UTF-16 code units is
    // placed at index 15 of an otherwise zero-filled 32-unit buffer.
    let mut units = [0u16; 32];
    for value in 0..0x10000u32 {
        let code_unit = value as u16;
        units[15] = code_unit;
        let expected = reference_is_utf16_code_unit_bidi(code_unit);
        assert_eq!(is_utf16_bidi(&units[..]), expected);
    }
}
#[test]
fn test_is_utf8_bidi_edge_cases() {
assert!(!is_utf8_bidi(b"\xD5\xBF\x61"));

Просмотреть файл

@ -278,8 +278,8 @@ pub fn is_u16x8_bidi(s: u16x8) -> bool {
// Quick refutation failed. Let's do the full check.
(in_range16x8!(s, 0x0590, 0x0900)
| in_range16x8!(s, 0xFB50, 0xFE00)
| in_range16x8!(s, 0xFE70, 0xFF00)
| in_range16x8!(s, 0xFB1D, 0xFE00)
| in_range16x8!(s, 0xFE70, 0xFEFF)
| in_range16x8!(s, 0xD802, 0xD804)
| in_range16x8!(s, 0xD83A, 0xD83C)
| s.eq(u16x8::splat(0x200F))

54
third_party/rust/encoding_rs/src/utf_16.rs поставляемый
Просмотреть файл

@ -401,4 +401,58 @@ mod tests {
assert_eq!(output[0], 0xFFFD);
}
}
#[test]
fn test_utf_16le_decode_near_end() {
    // Streaming UTF-16LE decode into a four-byte UTF-8 buffer, with the
    // first code unit split across two calls.
    let mut utf8 = [0u8; 4];
    let mut decoder = UTF_16LE.new_decoder();

    // First call: a lone byte is consumed, but half a code unit cannot
    // produce any output yet.
    let first = decoder.decode_to_utf8(&[0x03], &mut utf8[..], false);
    assert_eq!(first, (CoderResult::InputEmpty, 1, 0, false));
    assert_eq!(utf8[0], 0x0);

    // Second call: 0x26 completes the little-endian code unit 0x2603, which
    // is written as the three UTF-8 bytes E2 98 83. Only that one byte is
    // read; presumably the single spare output byte cannot hold the next
    // character, hence `OutputFull`.
    let second = decoder.decode_to_utf8(&[0x26, 0x03, 0x26], &mut utf8[..], false);
    assert_eq!(second, (CoderResult::OutputFull, 1, 3, false));
    assert_eq!(utf8, [0xE2, 0x98, 0x83, 0x00]);
}
#[test]
fn test_utf_16be_decode_near_end() {
    // Streaming UTF-16BE decode into a four-byte UTF-8 buffer, with the
    // first code unit split across two calls.
    let mut utf8 = [0u8; 4];
    let mut decoder = UTF_16BE.new_decoder();

    // First call: a lone byte is consumed, but half a code unit cannot
    // produce any output yet.
    let first = decoder.decode_to_utf8(&[0x26], &mut utf8[..], false);
    assert_eq!(first, (CoderResult::InputEmpty, 1, 0, false));
    assert_eq!(utf8[0], 0x0);

    // Second call: 0x03 completes the big-endian code unit 0x2603, which is
    // written as the three UTF-8 bytes E2 98 83. Only that one byte is
    // read; presumably the single spare output byte cannot hold the next
    // character, hence `OutputFull`.
    let second = decoder.decode_to_utf8(&[0x03, 0x26, 0x03], &mut utf8[..], false);
    assert_eq!(second, (CoderResult::OutputFull, 1, 3, false));
    assert_eq!(utf8, [0xE2, 0x98, 0x83, 0x00]);
}
}