Bug 1773399 - Update idna to 0.2.3. r=emilio,supply-chain-reviewers

Differential Revision: https://phabricator.services.mozilla.com/D148737
Mike Hommey 2022-06-14 20:32:28 +00:00
Parent 3f5dc34b19
Commit 527752ca63
32 changed files: 29206 additions and 145914 deletions

Cargo.lock (generated)

@ -2621,9 +2621,9 @@ checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39"
[[package]]
name = "idna"
version = "0.2.1"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "de910d521f7cc3135c4de8db1cb910e0b5ed1dc6f57c381cd07e8e661ce10094"
checksum = "418a0a6fab821475f634efe3ccc45c013f742efe03d853e8d3355d5cb850ecf8"
dependencies = [
"matches",
"unicode-bidi",
@ -5279,6 +5279,13 @@ version = "0.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "29738eedb4388d9ea620eeab9384884fc3f06f586a2eddb56bedc5885126c7c1"
[[package]]
name = "tinyvec"
version = "1.999.999"
dependencies = [
"smallvec",
]
[[package]]
name = "to_shmem"
version = "0.0.1"
@ -5636,9 +5643,12 @@ checksum = "d22af068fba1eb5edcb4aea19d382b2a3deb4c8f9d475c589b6ada9e0fd493ee"
[[package]]
name = "unicode-normalization"
version = "0.1.7"
version = "0.1.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6a0180bc61fc5a987082bfa111f4cc95c4caff7f9799f3e46df09163a937aa25"
checksum = "d54590932941a9e9266f0832deed84ebe1bf2e4c9e4a3554d393d18f5e854bf9"
dependencies = [
"tinyvec",
]
[[package]]
name = "unicode-segmentation"


@ -132,6 +132,9 @@ nom = { path = "build/rust/nom" }
# dependencies on windows-sys.
parking_lot = { path = "build/rust/parking_lot" }
# Override tinyvec with smallvec
tinyvec = { path = "build/rust/tinyvec" }
# Patch autocfg to hide rustc output. Workaround for https://github.com/cuviper/autocfg/issues/30
autocfg = { path = "third_party/rust/autocfg" }


@ -0,0 +1,16 @@
[package]
name = "tinyvec"
version = "1.999.999"
edition = "2018"
license = "MPL-2.0"
[lib]
path = "lib.rs"
[dependencies]
smallvec = "1"
[features]
alloc = []
default = []
std = ["alloc"]


@ -0,0 +1,6 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
pub use smallvec::SmallVec as ArrayVec;
pub use smallvec::SmallVec as TinyVec;
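The shim above simply re-exports smallvec's `SmallVec` under the two tinyvec names, so the vendored unicode-normalization code that was written against tinyvec keeps compiling while actually using smallvec. A minimal sketch of the API surface this relies on (my own illustration, not part of the patch; it only uses methods `SmallVec` provides, such as `new`, `push`, `truncate`, `len`, and slice access via `Deref`):

```rust
// Illustrative only: with the shim, `tinyvec::TinyVec` resolves to
// `smallvec::SmallVec`, so all of these calls go straight to smallvec.
fn main() {
    let mut buf: tinyvec::TinyVec<[(u8, char); 4]> = tinyvec::TinyVec::new();
    buf.push((230, 'a')); // (combining class, char) pairs, as in decompose.rs
    buf.push((0, 'b'));
    buf.sort_by_key(|k| k.0); // works through DerefMut to a mutable slice
    buf.truncate(1);
    assert_eq!(buf.len(), 1);
}
```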


@ -768,7 +768,7 @@ version = "1.0.1"
criteria = "safe-to-deploy"
[[unaudited.idna]]
version = "0.2.1"
version = "0.2.3"
criteria = "safe-to-deploy"
[[unaudited.indexmap]]
@ -1552,7 +1552,7 @@ version = "1.0.0"
criteria = "safe-to-deploy"
[[unaudited.unicode-normalization]]
version = "0.1.7"
version = "0.1.19"
criteria = "safe-to-deploy"
[[unaudited.unicode-segmentation]]

third_party/rust/idna/.cargo-checksum.json (vendored)

@ -1 +1 @@
{"files":{"Cargo.toml":"6f1fd46d4d9575d5a7f46873cb40a93e973e9fb8f574b28a1b21b596df618a89","LICENSE-APACHE":"a60eea817514531668d7e00765731449fe14d059d3249e0bc93b36de45f759f2","LICENSE-MIT":"20c7855c364d57ea4c97889a5e8d98470a9952dade37bd9248b9a54431670e5e","benches/all.rs":"e734b9c9092ed66986725f86cfe90f3756cfddb058af308b796ba494f9beefc2","src/IdnaMappingTable.txt":"87d6553a4b86bc49dcade38bf26b745cd81800eb8af295dc3fb99b4729eaea38","src/lib.rs":"d61b2bfcf4265b9a41eedd1de33ab49ea615e3c06df944321b30c57950a85342","src/make_uts46_mapping_table.py":"d420883d17b44c42109317ffaf1c273e611864eaeb1c5f1b9d93634a5d586835","src/punycode.rs":"dceeb0467197f892d2c777711b3c6647238f52f3976dfca5a8f8957500fd3599","src/uts46.rs":"49aaae3c5a9503bc7ef59b1a2e76ba158154132515e7c85ab670130ed5da318f","src/uts46_mapping_table.rs":"90c4180dd865b919bf1b2f13459c9c5b9de0cbbdff6584f742a7ecc0c14d3cdd","tests/IdnaTestV2.txt":"c6f3778b0545fd150c8063286c7f5adc901e16557eddccc3751213646d07593d","tests/punycode.rs":"8efdaae0902a8ffe483ae69236c9d0a38979cfd2430e69b87f33975e6946d577","tests/punycode_tests.json":"3d4ac0cf25984c37b9ce197f5df680a0136f728fb8ec82bc76624e42139eb3a8","tests/tests.rs":"de7425a3e4e6e871255721107803704d1431246601fa9c87105224d88dfe60d6","tests/unit.rs":"9600ec4f67ae44e8457fb64a9872d190a1a4d807e32d9688c8fa3ef9135c7a5d","tests/uts46.rs":"ca91d48811d366fb9e32d7aa79cfda1261b93c271b6ed7fb5535de9a2500205b"},"package":"de910d521f7cc3135c4de8db1cb910e0b5ed1dc6f57c381cd07e8e661ce10094"}
{"files":{"Cargo.toml":"fa141dcb135262e5fda9f680671699045326d96779bb1acf38d48c70c712bcdf","LICENSE-APACHE":"a60eea817514531668d7e00765731449fe14d059d3249e0bc93b36de45f759f2","LICENSE-MIT":"20c7855c364d57ea4c97889a5e8d98470a9952dade37bd9248b9a54431670e5e","benches/all.rs":"e734b9c9092ed66986725f86cfe90f3756cfddb058af308b796ba494f9beefc2","src/IdnaMappingTable.txt":"87d6553a4b86bc49dcade38bf26b745cd81800eb8af295dc3fb99b4729eaea38","src/lib.rs":"d61b2bfcf4265b9a41eedd1de33ab49ea615e3c06df944321b30c57950a85342","src/make_uts46_mapping_table.py":"917055fa841f813de2bcf79cc79b595da3d5551559ee768db8660ab77cb26c34","src/punycode.rs":"07edf5293bc384a164eebb01bc18fe3d4b2d009b4565a36b74a3030978ea6e04","src/uts46.rs":"40521a01e5b8c38667252d5b1e0141c5a71f63aeae2f451b986792984e633b09","src/uts46_mapping_table.rs":"942fff78147c61da942f5f3a7ff4e90f9d7a00a29285733ac3fc3357eb2ed06f","tests/IdnaTestV2.txt":"c6f3778b0545fd150c8063286c7f5adc901e16557eddccc3751213646d07593d","tests/punycode.rs":"e6fb978f48445d1525a6b97351c41c5393a1612a35f85b9a7f45b8794fce9aba","tests/punycode_tests.json":"3d4ac0cf25984c37b9ce197f5df680a0136f728fb8ec82bc76624e42139eb3a8","tests/tests.rs":"de7425a3e4e6e871255721107803704d1431246601fa9c87105224d88dfe60d6","tests/unit.rs":"be025a7d9bab3bd1ce134c87f9d848269e157b31ca5ba0ea03426c1ac736b69e","tests/uts46.rs":"ca91d48811d366fb9e32d7aa79cfda1261b93c271b6ed7fb5535de9a2500205b"},"package":"418a0a6fab821475f634efe3ccc45c013f742efe03d853e8d3355d5cb850ecf8"}

third_party/rust/idna/Cargo.toml (vendored)

@ -13,7 +13,7 @@
[package]
edition = "2018"
name = "idna"
version = "0.2.1"
version = "0.2.3"
authors = ["The rust-url developers"]
autotests = false
description = "IDNA (Internationalizing Domain Names in Applications) and Punycode."
@ -40,7 +40,7 @@ version = "0.1"
version = "0.3"
[dependencies.unicode-normalization]
version = "0.1.5"
version = "0.1.17"
[dev-dependencies.assert_matches]
version = "1.3"


@ -78,6 +78,12 @@ for line in txt:
unicode_str = u''.join(char(c) for c in fields[2].strip().split(' '))
elif mapping == "Deviation":
unicode_str = u''
if len(fields) > 3:
assert fields[3].strip() in ('NV8', 'XV8'), fields[3]
assert mapping == 'Valid', mapping
mapping = 'DisallowedIdna2008'
ranges.append((first, last, mapping, unicode_str))
def mergeable_key(r):
@ -86,7 +92,7 @@ def mergeable_key(r):
# These types have associated data, so we should not merge them.
if mapping in ('Mapped', 'Deviation', 'DisallowedStd3Mapped'):
return r
assert mapping in ('Valid', 'Ignored', 'Disallowed', 'DisallowedStd3Valid')
assert mapping in ('Valid', 'Ignored', 'Disallowed', 'DisallowedStd3Valid', 'DisallowedIdna2008')
return mapping
grouped_ranges = itertools.groupby(ranges, key=mergeable_key)
@ -116,11 +122,7 @@ for (k, g) in grouped_ranges:
# Assert we're seeing the surrogate case here.
assert last_char == 0xd7ff
assert next_char == 0xe000
first = group[0][0]
last = group[-1][1]
mapping = group[0][2]
unicode_str = group[0][3]
optimized_ranges.append((first, last, mapping, unicode_str))
optimized_ranges.append((group[0][0], group[-1][1]) + group[0][2:])
def is_single_char_range(r):
(first, last, _, _) = r
@ -148,30 +150,22 @@ def merge_single_char_ranges(ranges):
optimized_ranges = list(merge_single_char_ranges(optimized_ranges))
print("static TABLE: &[Range] = &[")
for ranges in optimized_ranges:
first = ranges[0][0]
last = ranges[-1][1]
print(" Range { from: '%s', to: '%s', }," % (escape_char(char(first)),
escape_char(char(last))))
print("];\n")
print("static INDEX_TABLE: &[u16] = &[")
SINGLE_MARKER = 1 << 15
print("static TABLE: &[(char, u16)] = &[")
offset = 0
for ranges in optimized_ranges:
assert offset < SINGLE_MARKER
block_len = len(ranges)
single = SINGLE_MARKER if block_len == 1 else 0
print(" %s," % (offset | single))
index = offset | single
offset += block_len
start = escape_char(char(ranges[0][0]))
print(" ('%s', %s)," % (start, index))
print("];\n")
print("static MAPPING_TABLE: &[Mapping] = &[")

third_party/rust/idna/src/punycode.rs (vendored)

@ -78,6 +78,10 @@ impl Decoder {
),
};
if !base.is_ascii() {
return Err(());
}
let base_len = base.len();
let mut length = base_len as u32;
let mut code_point = INITIAL_N;

third_party/rust/idna/src/uts46.rs (vendored)

@ -11,7 +11,6 @@
use self::Mapping::*;
use crate::punycode;
use std::cmp::Ordering::{Equal, Greater, Less};
use std::{error::Error as StdError, fmt};
use unicode_bidi::{bidi_class, BidiClass};
use unicode_normalization::char::is_combining_mark;
@ -48,38 +47,26 @@ enum Mapping {
Disallowed,
DisallowedStd3Valid,
DisallowedStd3Mapped(StringTableSlice),
}
struct Range {
from: char,
to: char,
DisallowedIdna2008,
}
fn find_char(codepoint: char) -> &'static Mapping {
let r = TABLE.binary_search_by(|ref range| {
if codepoint > range.to {
Less
} else if codepoint < range.from {
Greater
} else {
Equal
}
});
r.ok()
.map(|i| {
const SINGLE_MARKER: u16 = 1 << 15;
let idx = match TABLE.binary_search_by_key(&codepoint, |&val| val.0) {
Ok(idx) => idx,
Err(idx) => idx - 1,
};
let x = INDEX_TABLE[i];
let single = (x & SINGLE_MARKER) != 0;
let offset = !SINGLE_MARKER & x;
const SINGLE_MARKER: u16 = 1 << 15;
if single {
&MAPPING_TABLE[offset as usize]
} else {
&MAPPING_TABLE[(offset + (codepoint as u16 - TABLE[i].from as u16)) as usize]
}
})
.unwrap()
let (base, x) = TABLE[idx];
let single = (x & SINGLE_MARKER) != 0;
let offset = !SINGLE_MARKER & x;
if single {
&MAPPING_TABLE[offset as usize]
} else {
&MAPPING_TABLE[(offset + (codepoint as u16 - base as u16)) as usize]
}
}
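For reference, a toy sketch of the lookup scheme the new `find_char` uses (hypothetical data, not the generated uts46 table): each `TABLE` entry pairs a block's first codepoint with a `u16` index whose high bit marks a block that shares a single `Mapping` entry and whose low 15 bits are the offset into `MAPPING_TABLE`.

```rust
const SINGLE_MARKER: u16 = 1 << 15;

// Made-up blocks: codepoints from 'a' map per-character starting at offset 0;
// codepoints from 'f' form one block sharing the single mapping at offset 5.
static TOY_TABLE: &[(char, u16)] = &[('a', 0), ('f', 5 | SINGLE_MARKER)];

fn toy_mapping_index(codepoint: char) -> usize {
    // Find the last block whose start is <= codepoint (assumes codepoint >= 'a').
    let idx = match TOY_TABLE.binary_search_by_key(&codepoint, |&(start, _)| start) {
        Ok(idx) => idx,
        Err(idx) => idx - 1,
    };
    let (base, packed) = TOY_TABLE[idx];
    let offset = (packed & !SINGLE_MARKER) as usize;
    if packed & SINGLE_MARKER != 0 {
        offset
    } else {
        offset + (codepoint as usize - base as usize)
    }
}
```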
struct Mapper<'a> {
@ -140,6 +127,12 @@ impl<'a> Iterator for Mapper<'a> {
self.slice = Some(decode_slice(slice).chars());
continue;
}
Mapping::DisallowedIdna2008 => {
if self.config.use_idna_2008_rules {
self.errors.disallowed_in_idna_2008 = true;
}
codepoint
}
});
}
}
@ -310,13 +303,12 @@ fn check_validity(label: &str, config: Config, errors: &mut Errors) {
// V6: Check against Mapping Table
if label.chars().any(|c| match *find_char(c) {
Mapping::Valid => false,
Mapping::Valid | Mapping::DisallowedIdna2008 => false,
Mapping::Deviation(_) => config.transitional_processing,
Mapping::DisallowedStd3Valid => config.use_std3_ascii_rules,
_ => true,
}) {
errors.invalid_mapping = true;
return;
}
// V7: ContextJ rules
@ -510,6 +502,7 @@ pub struct Config {
transitional_processing: bool,
verify_dns_length: bool,
check_hyphens: bool,
use_idna_2008_rules: bool,
}
/// The defaults are that of https://url.spec.whatwg.org/#idna
@ -524,6 +517,7 @@ impl Default for Config {
// Only use for to_ascii, not to_unicode
verify_dns_length: false,
use_idna_2008_rules: false,
}
}
}
@ -553,6 +547,12 @@ impl Config {
self
}
#[inline]
pub fn use_idna_2008_rules(mut self, value: bool) -> Self {
self.use_idna_2008_rules = value;
self
}
/// http://www.unicode.org/reports/tr46/#ToASCII
pub fn to_ascii(self, domain: &str) -> Result<String, Errors> {
let mut result = String::new();
@ -599,6 +599,7 @@ pub struct Errors {
disallowed_character: bool,
too_long_for_dns: bool,
too_short_for_dns: bool,
disallowed_in_idna_2008: bool,
}
impl Errors {
@ -615,6 +616,7 @@ impl Errors {
disallowed_character,
too_long_for_dns,
too_short_for_dns,
disallowed_in_idna_2008,
} = *self;
punycode
|| check_hyphens
@ -627,6 +629,7 @@ impl Errors {
|| disallowed_character
|| too_long_for_dns
|| too_short_for_dns
|| disallowed_in_idna_2008
}
}
@ -644,6 +647,7 @@ impl fmt::Debug for Errors {
disallowed_character,
too_long_for_dns,
too_short_for_dns,
disallowed_in_idna_2008,
} = *self;
let fields = [
@ -661,6 +665,7 @@ impl fmt::Debug for Errors {
("disallowed_character", disallowed_character),
("too_long_for_dns", too_long_for_dns),
("too_short_for_dns", too_short_for_dns),
("disallowed_in_idna_2008", disallowed_in_idna_2008),
];
let mut empty = true;

third_party/rust/idna/src/uts46_mapping_table.rs (vendored)

The diff for this file is not shown because of its size.

third_party/rust/idna/tests/punycode.rs (vendored)

@ -19,10 +19,10 @@ fn one_test(decoded: &str, encoded: &str) {
let result = result.into_iter().collect::<String>();
assert!(
result == decoded,
format!(
"Incorrect decoding of \"{}\":\n \"{}\"\n!= \"{}\"\n",
encoded, result, decoded
)
"Incorrect decoding of \"{}\":\n \"{}\"\n!= \"{}\"\n",
encoded,
result,
decoded
)
}
}
@ -31,10 +31,10 @@ fn one_test(decoded: &str, encoded: &str) {
None => panic!("Encoding {} failed.", decoded),
Some(result) => assert!(
result == encoded,
format!(
"Incorrect encoding of \"{}\":\n \"{}\"\n!= \"{}\"\n",
decoded, result, encoded
)
"Incorrect encoding of \"{}\":\n \"{}\"\n!= \"{}\"\n",
decoded,
result,
encoded
),
}
}

third_party/rust/idna/tests/unit.rs (vendored)

@ -114,3 +114,26 @@ fn test_v8_bidi_rules() {
// Bidi chars may be punycode-encoded
assert!(config.to_ascii("xn--0ca24w").is_err());
}
#[test]
fn emoji_domains() {
// HOT BEVERAGE is allowed here...
let config = idna::Config::default()
.verify_dns_length(true)
.use_std3_ascii_rules(true);
assert_eq!(config.to_ascii("☕.com").unwrap(), "xn--53h.com");
// ... but not here
let config = idna::Config::default()
.verify_dns_length(true)
.use_std3_ascii_rules(true)
.use_idna_2008_rules(true);
let error = format!("{:?}", config.to_ascii("☕.com").unwrap_err());
assert!(error.contains("disallowed_in_idna_2008"));
}
#[test]
fn unicode_before_delimiter() {
let config = idna::Config::default();
assert!(config.to_ascii("xn--f\u{34a}-PTP").is_err());
}


@ -1 +1 @@
{"files":{"COPYRIGHT":"23860c2a7b5d96b21569afedf033469bab9fe14a1b24a35068b8641c578ce24d","Cargo.toml":"de7af66ede1e1b369adcdf82174fd97782a26cf11d66deb2bdb518741675e15a","LICENSE-APACHE":"a60eea817514531668d7e00765731449fe14d059d3249e0bc93b36de45f759f2","LICENSE-MIT":"7b63ecd5f1902af1b63729947373683c32745c16a10e8e6292e2e2dcd7e90ae0","README.md":"4e01af0960f3a9abb8a06b64bc903d730a9b285098ec9a1af9bceb135d08a660","benches/bench.rs":"eb8b04b99ac55f8583893ff23385194002472e9b5182e3c74636b989caa163db","scripts/unicode.py":"06e074696ea85b24a82bcad360b7ec765e4bd9ebc574e722689ea2434e8a0548","src/decompose.rs":"7cf48297bfeace89e43e7a0402ca05f4d508c732bf7befddf63ad1e95d14c8c4","src/lib.rs":"5cb3d00fffe5c3cb8f0f1cb4317894946c93247e08f7c612275bfd8948db7a02","src/normalization_tests.rs":"de293b9aa396b1b4235b7bfb460e216e2dc874f4ee58bbf54458173e22363cb1","src/normalize.rs":"82f1a3511432349799b42a360ef4a993a4df7e492d88fdc918adf317317c0ed6","src/quick_check.rs":"73335b915e483604c7d10491bc925fda1bbd29e32ce5dd7529cbe4982034780a","src/recompose.rs":"bf04c41bbcfce4717944f1974b87b97619ba66ca7ebec86745dd53493564e170","src/stream_safe.rs":"18f48fbb6afaa6d75289fe1c473bf9e610e76b3119acf7358b1b12d77b0a85fa","src/tables.rs":"c9c0a7cbdd27c11eb444de215153ba02e08cb9cd485c09855005bf23d30f8502","src/test.rs":"5b51a97954f053c251181277faf7ca8ab8f1a7167104f535fbfad97568442571"},"package":"6a0180bc61fc5a987082bfa111f4cc95c4caff7f9799f3e46df09163a937aa25"}
{"files":{"COPYRIGHT":"23860c2a7b5d96b21569afedf033469bab9fe14a1b24a35068b8641c578ce24d","Cargo.toml":"34370ae727c107ec51fd6809e01ff76220a1bcc2b849b8d277bf9c7bf1875abd","LICENSE-APACHE":"a60eea817514531668d7e00765731449fe14d059d3249e0bc93b36de45f759f2","LICENSE-MIT":"7b63ecd5f1902af1b63729947373683c32745c16a10e8e6292e2e2dcd7e90ae0","README.md":"80e4415e2f0941aac11b7e5c1db946d00139db2f1a67774fcd0c0bfde52217fe","benches/bench.rs":"827e5343b059a732904be29717c2797203bfd0a633edf08042afea65372a3e2c","scripts/unicode.py":"c00cb48507e4564a2dcf17a95a5fb1206830f748a8444d296f95b5d2dd09b72c","src/__test_api.rs":"78e21bfa0b98894f545c8ed3e31cec20d7a48951a7f3ed69a6130c4b3d463aee","src/decompose.rs":"c0eb774843a545356e63bbcd7fb926f80d3c97ef4601ca3701fc34154f2e9905","src/lib.rs":"3eaa16b8b4d2d8e15d38b56760fb432ec7665e22360fd4c587c9b724486ba90e","src/lookups.rs":"ca7022bf19a82108df1f5bd78c7fc30806f931d932a65538be818caaa5f7049d","src/no_std_prelude.rs":"602e81e67b8952b6571826f431e3b6787be3073bc10f38a0d3374278f81a6a1f","src/normalize.rs":"de2670b4437d335d42884af844a750f70e541467ecd34077dfe032103cb9b041","src/perfect_hash.rs":"400c84e2f467f61bd55d55d08672da6a9ad7a57c938ce5d0c701a6994b1b273b","src/quick_check.rs":"9756312d75fc31b67fca954e44a4812945a7e436b03ba18b9a2441f6de570f6f","src/recompose.rs":"a6228ad7561a5c7a1ef1d510159bdde1eea8a161007c80e470432e9b844d5536","src/replace.rs":"b24c904f3e00851a78820e30ddfa4ff10c795f8925fd0ee7f5870f31fdfa770b","src/stream_safe.rs":"383d71f0da401af8e735877e43855c7e16cb06deb2263539cdec2a407dbe257d","src/tables.rs":"d24cf5a2a6d5059543b39eec6806c93fa8c314b52b251ddd354affcf91ef7f0b","src/test.rs":"0def2cb0a013fba29938262b3cd3533fbb10eacaf6bcd82eef1f91759fe0a2eb"},"package":"d54590932941a9e9266f0832deed84ebe1bf2e4c9e4a3554d393d18f5e854bf9"}


@ -3,7 +3,7 @@
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
# to registry (e.g. crates.io) dependencies
# to registry (e.g., crates.io) dependencies
#
# If you believe there's an error in this file please file an
# issue against the rust-lang/cargo repository. If you're
@ -11,14 +11,22 @@
# will likely look very different (and much more reasonable)
[package]
edition = "2018"
name = "unicode-normalization"
version = "0.1.7"
authors = ["kwantam <kwantam@gmail.com>"]
exclude = ["target/*", "Cargo.lock", "scripts/tmp", "*.txt"]
version = "0.1.19"
authors = ["kwantam <kwantam@gmail.com>", "Manish Goregaokar <manishsmail@gmail.com>"]
exclude = ["target/*", "Cargo.lock", "scripts/tmp", "*.txt", "tests/*"]
description = "This crate provides functions for normalization of\nUnicode strings, including Canonical and Compatible\nDecomposition and Recomposition, as described in\nUnicode Standard Annex #15.\n"
homepage = "https://github.com/unicode-rs/unicode-normalization"
documentation = "https://unicode-rs.github.io/unicode-normalization"
documentation = "https://docs.rs/unicode-normalization/"
readme = "README.md"
keywords = ["text", "unicode", "normalization", "decomposition", "recomposition"]
license = "MIT/Apache-2.0"
repository = "https://github.com/unicode-rs/unicode-normalization"
[dependencies.tinyvec]
version = "1"
features = ["alloc"]
[features]
default = ["std"]
std = []


@ -1,10 +1,13 @@
# unicode-normalization
[![Build Status](https://travis-ci.org/unicode-rs/unicode-normalization.svg)](https://travis-ci.org/unicode-rs/unicode-normalization)
[![Docs](https://docs.rs/unicode-normalization/badge.svg)](https://docs.rs/unicode-normalization/)
Unicode character composition and decomposition utilities
as described in
[Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/).
[![Build Status](https://travis-ci.org/unicode-rs/unicode-normalization.svg)](https://travis-ci.org/unicode-rs/unicode-normalization)
[Documentation](https://unicode-rs.github.io/unicode-normalization/unicode_normalization/index.html)
This crate requires Rust 1.36+.
```rust
extern crate unicode_normalization;
@ -21,12 +24,16 @@ fn main() {
}
```
# crates.io
## crates.io
You can use this package in your project by adding the following
to your `Cargo.toml`:
```toml
[dependencies]
unicode-normalization = "0.1.7"
unicode-normalization = "0.1.19"
```
## `no_std` + `alloc` support
This crate is completely `no_std` + `alloc` compatible. This can be enabled by disabling the `std` feature, i.e. specifying `default-features = false` for this crate on your `Cargo.toml`.
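A minimal dependency entry along those lines (illustrative; it only assumes the `std`/`alloc` features declared in this crate's Cargo.toml):

```toml
[dependencies]
unicode-normalization = { version = "0.1.19", default-features = false }
```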


@ -1,8 +1,9 @@
#![feature(test)]
#![feature(iterator_step_by)]
extern crate unicode_normalization;
extern crate test;
extern crate test;
extern crate unicode_normalization;
use std::fs;
use test::Bencher;
use unicode_normalization::UnicodeNormalization;
@ -80,6 +81,40 @@ fn bench_nfd_ascii(b: &mut Bencher) {
b.iter(|| ASCII.nfd().count());
}
#[bench]
fn bench_nfc_long(b: &mut Bencher) {
let long = fs::read_to_string("benches/long.txt").unwrap();
b.iter(|| long.nfc().count());
}
#[bench]
fn bench_nfd_long(b: &mut Bencher) {
let long = fs::read_to_string("benches/long.txt").unwrap();
b.iter(|| long.nfd().count());
}
#[bench]
fn bench_nfkc_ascii(b: &mut Bencher) {
b.iter(|| ASCII.nfkc().count());
}
#[bench]
fn bench_nfkd_ascii(b: &mut Bencher) {
b.iter(|| ASCII.nfkd().count());
}
#[bench]
fn bench_nfkc_long(b: &mut Bencher) {
let long = fs::read_to_string("benches/long.txt").unwrap();
b.iter(|| long.nfkc().count());
}
#[bench]
fn bench_nfkd_long(b: &mut Bencher) {
let long = fs::read_to_string("benches/long.txt").unwrap();
b.iter(|| long.nfkd().count());
}
#[bench]
fn bench_streamsafe_ascii(b: &mut Bencher) {
b.iter(|| ASCII.stream_safe().count());


@ -14,13 +14,14 @@
# - DerivedNormalizationProps.txt
# - NormalizationTest.txt
# - UnicodeData.txt
# - StandardizedVariants.txt
#
# Since this should not require frequent updates, we just store this
# out-of-line and check the unicode.rs file into git.
# out-of-line and check the tables.rs and normalization_tests.rs files into git.
import collections
import requests
import urllib.request
UNICODE_VERSION = "9.0.0"
UNICODE_VERSION = "13.0.0"
UCD_URL = "https://www.unicode.org/Public/%s/ucd/" % UNICODE_VERSION
PREAMBLE = """// Copyright 2012-2018 The Rust Project Developers. See the COPYRIGHT
@ -57,6 +58,11 @@ expanded_categories = {
'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'],
}
# Constants from Unicode 9.0.0 Section 3.12 Conjoining Jamo Behavior
# http://www.unicode.org/versions/Unicode9.0.0/ch03.pdf#M9.32468.Heading.310.Combining.Jamo.Behavior
S_BASE, L_COUNT, V_COUNT, T_COUNT = 0xAC00, 19, 21, 28
S_COUNT = L_COUNT * V_COUNT * T_COUNT
class UnicodeData(object):
def __init__(self):
self._load_unicode_data()
@ -66,35 +72,48 @@ class UnicodeData(object):
self.canon_comp = self._compute_canonical_comp()
self.canon_fully_decomp, self.compat_fully_decomp = self._compute_fully_decomposed()
self.cjk_compat_variants_fully_decomp = {}
self._load_cjk_compat_ideograph_variants()
def stats(name, table):
count = sum(len(v) for v in table.values())
print "%s: %d chars => %d decomposed chars" % (name, len(table), count)
print("%s: %d chars => %d decomposed chars" % (name, len(table), count))
print "Decomposition table stats:"
print("Decomposition table stats:")
stats("Canonical decomp", self.canon_decomp)
stats("Compatible decomp", self.compat_decomp)
stats("Canonical fully decomp", self.canon_fully_decomp)
stats("Compatible fully decomp", self.compat_fully_decomp)
stats("CJK Compat Variants fully decomp", self.cjk_compat_variants_fully_decomp)
self.ss_leading, self.ss_trailing = self._compute_stream_safe_tables()
def _fetch(self, filename):
resp = requests.get(UCD_URL + filename)
return resp.text
resp = urllib.request.urlopen(UCD_URL + filename)
return resp.read().decode('utf-8')
def _load_unicode_data(self):
self.name_to_char_int = {}
self.combining_classes = {}
self.compat_decomp = {}
self.canon_decomp = {}
self.general_category_mark = []
self.general_category_public_assigned = []
assigned_start = 0;
prev_char_int = -1;
prev_name = "";
for line in self._fetch("UnicodeData.txt").splitlines():
# See ftp://ftp.unicode.org/Public/3.0-Update/UnicodeData-3.0.0.html
pieces = line.split(';')
assert len(pieces) == 15
char, category, cc, decomp = pieces[0], pieces[2], pieces[3], pieces[5]
char, name, category, cc, decomp = pieces[0], pieces[1], pieces[2], pieces[3], pieces[5]
char_int = int(char, 16)
name = pieces[1].strip()
self.name_to_char_int[name] = char_int
if cc != '0':
self.combining_classes[char_int] = cc
@ -106,6 +125,51 @@ class UnicodeData(object):
if category == 'M' or 'M' in expanded_categories.get(category, []):
self.general_category_mark.append(char_int)
assert category != 'Cn', "Unexpected: Unassigned codepoint in UnicodeData.txt"
if category not in ['Co', 'Cs']:
if char_int != prev_char_int + 1 and not is_first_and_last(prev_name, name):
self.general_category_public_assigned.append((assigned_start, prev_char_int))
assigned_start = char_int
prev_char_int = char_int
prev_name = name;
self.general_category_public_assigned.append((assigned_start, prev_char_int))
def _load_cjk_compat_ideograph_variants(self):
for line in self._fetch("StandardizedVariants.txt").splitlines():
strip_comments = line.split('#', 1)[0].strip()
if not strip_comments:
continue
variation_sequence, description, differences = strip_comments.split(';')
description = description.strip()
# Don't use variations that only apply in particular shaping environments.
if differences:
continue
# Look for entries where the description field is a codepoint name.
if description not in self.name_to_char_int:
continue
# Only consider the CJK Compatibility Ideographs.
if not description.startswith('CJK COMPATIBILITY IDEOGRAPH-'):
continue
char_int = self.name_to_char_int[description]
assert not char_int in self.combining_classes, "Unexpected: CJK compat variant with a combining class"
assert not char_int in self.compat_decomp, "Unexpected: CJK compat variant and compatibility decomposition"
assert len(self.canon_decomp[char_int]) == 1, "Unexpected: CJK compat variant and non-singleton canonical decomposition"
# If we ever need to handle Hangul here, we'll need to handle it separately.
assert not (S_BASE <= char_int < S_BASE + S_COUNT)
cjk_compat_variant_parts = [int(c, 16) for c in variation_sequence.split()]
for c in cjk_compat_variant_parts:
assert not c in self.canon_decomp, "Unexpected: CJK compat variant is unnormalized (canon)"
assert not c in self.compat_decomp, "Unexpected: CJK compat variant is unnormalized (compat)"
self.cjk_compat_variants_fully_decomp[char_int] = cjk_compat_variant_parts
def _load_norm_props(self):
props = collections.defaultdict(list)
@ -178,11 +242,6 @@ class UnicodeData(object):
The upshot is that decomposition code is very simple and easy to inline
at mild code size cost.
"""
# Constants from Unicode 9.0.0 Section 3.12 Conjoining Jamo Behavior
# http://www.unicode.org/versions/Unicode9.0.0/ch03.pdf#M9.32468.Heading.310.Combining.Jamo.Behavior
S_BASE, L_COUNT, V_COUNT, T_COUNT = 0xAC00, 19, 21, 28
S_COUNT = L_COUNT * V_COUNT * T_COUNT
def _decompose(char_int, compatible):
# 7-bit ASCII never decomposes
if char_int <= 0x7f:
@ -234,7 +293,7 @@ class UnicodeData(object):
# need to store their overlap when they agree. When they don't agree,
# store the decomposition in the compatibility table since we'll check
# that first when normalizing to NFKD.
assert canon_fully_decomp <= compat_fully_decomp
assert set(canon_fully_decomp) <= set(compat_fully_decomp)
for ch in set(canon_fully_decomp) & set(compat_fully_decomp):
if canon_fully_decomp[ch] == compat_fully_decomp[ch]:
@ -284,47 +343,57 @@ class UnicodeData(object):
return leading_nonstarters, trailing_nonstarters
hexify = lambda c: hex(c)[2:].upper().rjust(4, '0')
hexify = lambda c: '{:04X}'.format(c)
# Test whether `first` and `last` are corresponding "<..., First>" and
# "<..., Last>" markers.
def is_first_and_last(first, last):
if not first.startswith('<') or not first.endswith(', First>'):
return False
if not last.startswith('<') or not last.endswith(', Last>'):
return False
return first[1:-8] == last[1:-7]
def gen_mph_data(name, d, kv_type, kv_callback):
(salt, keys) = minimal_perfect_hash(d)
out.write("pub(crate) const %s_SALT: &[u16] = &[\n" % name.upper())
for s in salt:
out.write(" 0x{:x},\n".format(s))
out.write("];\n")
out.write("pub(crate) const {}_KV: &[{}] = &[\n".format(name.upper(), kv_type))
for k in keys:
out.write(" {},\n".format(kv_callback(k)))
out.write("];\n\n")
def gen_combining_class(combining_classes, out):
out.write("#[inline]\n")
out.write("pub fn canonical_combining_class(c: char) -> u8 {\n")
out.write(" match c {\n")
for char, combining_class in sorted(combining_classes.items()):
out.write(" '\u{%s}' => %s,\n" % (hexify(char), combining_class))
out.write(" _ => 0,\n")
out.write(" }\n")
out.write("}\n")
gen_mph_data('canonical_combining_class', combining_classes, 'u32',
lambda k: "0x{:X}".format(int(combining_classes[k]) | (k << 8)))
def gen_composition_table(canon_comp, out):
out.write("#[inline]\n")
out.write("pub fn composition_table(c1: char, c2: char) -> Option<char> {\n")
out.write(" match (c1, c2) {\n")
table = {}
for (c1, c2), c3 in canon_comp.items():
if c1 < 0x10000 and c2 < 0x10000:
table[(c1 << 16) | c2] = c3
(salt, keys) = minimal_perfect_hash(table)
gen_mph_data('COMPOSITION_TABLE', table, '(u32, char)',
lambda k: "(0x%s, '\\u{%s}')" % (hexify(k), hexify(table[k])))
out.write("pub(crate) fn composition_table_astral(c1: char, c2: char) -> Option<char> {\n")
out.write(" match (c1, c2) {\n")
for (c1, c2), c3 in sorted(canon_comp.items()):
out.write(" ('\u{%s}', '\u{%s}') => Some('\u{%s}'),\n" % (hexify(c1), hexify(c2), hexify(c3)))
if c1 >= 0x10000 and c2 >= 0x10000:
out.write(" ('\\u{%s}', '\\u{%s}') => Some('\\u{%s}'),\n" % (hexify(c1), hexify(c2), hexify(c3)))
out.write(" _ => None,\n")
out.write(" }\n")
out.write("}\n")
def gen_decomposition_tables(canon_decomp, compat_decomp, out):
tables = [(canon_decomp, 'canonical'), (compat_decomp, 'compatibility')]
def gen_decomposition_tables(canon_decomp, compat_decomp, cjk_compat_variants_decomp, out):
tables = [(canon_decomp, 'canonical'), (compat_decomp, 'compatibility'), (cjk_compat_variants_decomp, 'cjk_compat_variants')]
for table, name in tables:
out.write("#[inline]\n")
out.write("pub fn %s_fully_decomposed(c: char) -> Option<&'static [char]> {\n" % name)
out.write(" match c {\n")
for char, chars in sorted(table.items()):
d = ", ".join("'\u{%s}'" % hexify(c) for c in chars)
out.write(" '\u{%s}' => Some(&[%s]),\n" % (hexify(char), d))
out.write(" _ => None,\n")
out.write(" }\n")
out.write("}\n")
out.write("\n")
gen_mph_data(name + '_decomposed', table, "(u32, &'static [char])",
lambda k: "(0x{:x}, &[{}])".format(k,
", ".join("'\\u{%s}'" % hexify(c) for c in table[k])))
def gen_qc_match(prop_table, out):
out.write(" match c {\n")
@ -343,51 +412,76 @@ def gen_qc_match(prop_table, out):
def gen_nfc_qc(prop_tables, out):
out.write("#[inline]\n")
out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
out.write("pub fn qc_nfc(c: char) -> IsNormalized {\n")
gen_qc_match(prop_tables['NFC_QC'], out)
out.write("}\n")
def gen_nfkc_qc(prop_tables, out):
out.write("#[inline]\n")
out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
out.write("pub fn qc_nfkc(c: char) -> IsNormalized {\n")
gen_qc_match(prop_tables['NFKC_QC'], out)
out.write("}\n")
def gen_nfd_qc(prop_tables, out):
out.write("#[inline]\n")
out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
out.write("pub fn qc_nfd(c: char) -> IsNormalized {\n")
gen_qc_match(prop_tables['NFD_QC'], out)
out.write("}\n")
def gen_combining_mark(general_category_mark, out):
def gen_nfkd_qc(prop_tables, out):
out.write("#[inline]\n")
out.write("pub fn is_combining_mark(c: char) -> bool {\n")
out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
out.write("pub fn qc_nfkd(c: char) -> IsNormalized {\n")
gen_qc_match(prop_tables['NFKD_QC'], out)
out.write("}\n")
def gen_combining_mark(general_category_mark, out):
gen_mph_data('combining_mark', general_category_mark, 'u32',
lambda k: '0x{:04x}'.format(k))
def gen_public_assigned(general_category_public_assigned, out):
# This could be done as a hash but the table is somewhat small.
out.write("#[inline]\n")
out.write("pub fn is_public_assigned(c: char) -> bool {\n")
out.write(" match c {\n")
for char in general_category_mark:
out.write(" '\u{%s}' => true,\n" % hexify(char))
start = True
for first, last in general_category_public_assigned:
if start:
out.write(" ")
start = False
else:
out.write(" | ")
if first == last:
out.write("'\\u{%s}'\n" % hexify(first))
else:
out.write("'\\u{%s}'..='\\u{%s}'\n" % (hexify(first), hexify(last)))
out.write(" => true,\n")
out.write(" _ => false,\n")
out.write(" }\n")
out.write("}\n")
out.write("\n")
def gen_stream_safe(leading, trailing, out):
# This could be done as a hash but the table is very small.
out.write("#[inline]\n")
out.write("pub fn stream_safe_leading_nonstarters(c: char) -> usize {\n")
out.write(" match c {\n")
for char, num_leading in leading.items():
out.write(" '\u{%s}' => %d,\n" % (hexify(char), num_leading))
for char, num_leading in sorted(leading.items()):
out.write(" '\\u{%s}' => %d,\n" % (hexify(char), num_leading))
out.write(" _ => 0,\n")
out.write(" }\n")
out.write("}\n")
out.write("\n")
out.write("#[inline]\n")
out.write("pub fn stream_safe_trailing_nonstarters(c: char) -> usize {\n")
out.write(" match c {\n")
for char, num_trailing in trailing.items():
out.write(" '\u{%s}' => %d,\n" % (hexify(char), num_trailing))
out.write(" _ => 0,\n")
out.write(" }\n")
out.write("}\n")
gen_mph_data('trailing_nonstarters', trailing, 'u32',
lambda k: "0x{:X}".format(int(trailing[k]) | (k << 8)))
def gen_tests(tests, out):
out.write("""#[derive(Debug)]
@ -402,7 +496,7 @@ pub struct NormalizationTest {
""")
out.write("pub const NORMALIZATION_TESTS: &[NormalizationTest] = &[\n")
str_literal = lambda s: '"%s"' % "".join("\u{%s}" % c for c in s)
str_literal = lambda s: '"%s"' % "".join("\\u{%s}" % c for c in s)
for test in tests:
out.write(" NormalizationTest {\n")
@ -415,17 +509,73 @@ pub struct NormalizationTest {
out.write("];\n")
# Guaranteed to be less than n.
def my_hash(x, salt, n):
# This is hash based on the theory that multiplication is efficient
mask_32 = 0xffffffff
y = ((x + salt) * 2654435769) & mask_32
y ^= (x * 0x31415926) & mask_32
return (y * n) >> 32
# Compute minimal perfect hash function, d can be either a dict or list of keys.
def minimal_perfect_hash(d):
n = len(d)
buckets = dict((h, []) for h in range(n))
for key in d:
h = my_hash(key, 0, n)
buckets[h].append(key)
bsorted = [(len(buckets[h]), h) for h in range(n)]
bsorted.sort(reverse = True)
claimed = [False] * n
salts = [0] * n
keys = [0] * n
for (bucket_size, h) in bsorted:
# Note: the traditional perfect hashing approach would also special-case
# bucket_size == 1 here and assign any empty slot, rather than iterating
# until rehash finds an empty slot. But we're not doing that so we can
# avoid the branch.
if bucket_size == 0:
break
else:
for salt in range(1, 32768):
rehashes = [my_hash(key, salt, n) for key in buckets[h]]
# Make sure there are no rehash collisions within this bucket.
if all(not claimed[hash] for hash in rehashes):
if len(set(rehashes)) < bucket_size:
continue
salts[h] = salt
for key in buckets[h]:
rehash = my_hash(key, salt, n)
claimed[rehash] = True
keys[rehash] = key
break
if salts[h] == 0:
print("minimal perfect hashing failed")
# Note: if this happens (because of unfortunate data), then there are
# a few things that could be done. First, the hash function could be
# tweaked. Second, the bucket order could be scrambled (especially the
# singletons). Right now, the buckets are sorted, which has the advantage
# of being deterministic.
#
# As a more extreme approach, the singleton bucket optimization could be
# applied (give the direct address for singleton buckets, rather than
# relying on a rehash). That is definitely the more standard approach in
# the minimal perfect hashing literature, but in testing the branch was a
# significant slowdown.
exit(1)
return (salts, keys)
if __name__ == '__main__':
data = UnicodeData()
with open("tables.rs", "w") as out:
with open("tables.rs", "w", newline = "\n") as out:
out.write(PREAMBLE)
out.write("use quick_check::IsNormalized;\n")
out.write("use quick_check::IsNormalized::*;\n")
out.write("use crate::quick_check::IsNormalized;\n")
out.write("use crate::quick_check::IsNormalized::*;\n")
out.write("\n")
version = "(%s, %s, %s)" % tuple(UNICODE_VERSION.split("."))
out.write("#[allow(unused)]\n")
out.write("pub const UNICODE_VERSION: (u64, u64, u64) = %s;\n\n" % version)
out.write("pub const UNICODE_VERSION: (u8, u8, u8) = %s;\n\n" % version)
gen_combining_class(data.combining_classes, out)
out.write("\n")
@ -433,20 +583,29 @@ if __name__ == '__main__':
gen_composition_table(data.canon_comp, out)
out.write("\n")
gen_decomposition_tables(data.canon_fully_decomp, data.compat_fully_decomp, out)
gen_decomposition_tables(data.canon_fully_decomp, data.compat_fully_decomp, data.cjk_compat_variants_fully_decomp, out)
gen_combining_mark(data.general_category_mark, out)
out.write("\n")
gen_public_assigned(data.general_category_public_assigned, out)
out.write("\n")
gen_nfc_qc(data.norm_props, out)
out.write("\n")
gen_nfkc_qc(data.norm_props, out)
out.write("\n")
gen_nfd_qc(data.norm_props, out)
out.write("\n")
gen_nfkd_qc(data.norm_props, out)
out.write("\n")
gen_stream_safe(data.ss_leading, data.ss_trailing, out)
out.write("\n")
with open("normalization_tests.rs", "w") as out:
with open("normalization_tests.rs", "w", newline = "\n") as out:
out.write(PREAMBLE)
gen_tests(data.norm_tests, out)

third_party/rust/unicode-normalization/src/__test_api.rs (vendored, new file)

@ -0,0 +1,18 @@
// This crate comprises hacks and glue required to test private functions from tests/
//
// Keep this as slim as possible.
//
// If you're caught using this outside this crates tests/, you get to clean up the mess.
#[cfg(not(feature = "std"))]
use crate::no_std_prelude::*;
use crate::stream_safe::StreamSafe;
pub fn stream_safe(s: &str) -> String {
StreamSafe::new(s.chars()).collect()
}
pub mod quick_check {
pub use crate::quick_check::*;
}


@ -7,51 +7,52 @@
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use std::fmt::{self, Write};
use core::fmt::{self, Write};
use core::iter::Fuse;
use core::ops::Range;
use tinyvec::TinyVec;
#[derive(Clone)]
enum DecompositionType {
Canonical,
Compatible
Compatible,
}
/// External iterator for a string decomposition's characters.
#[derive(Clone)]
pub struct Decompositions<I> {
kind: DecompositionType,
iter: I,
done: bool,
iter: Fuse<I>,
// This buffer stores pairs of (canonical combining class, character),
// pushed onto the end in text order.
//
// It's split into two contiguous regions by the `ready` offset. The first
// `ready` pairs are sorted and ready to emit on demand. The "pending"
// suffix afterwards still needs more characters for us to be able to sort
// in canonical order and is not safe to emit.
buffer: Vec<(u8, char)>,
ready: usize,
// It's divided into up to three sections:
// 1) A prefix that is free space;
// 2) "Ready" characters which are sorted and ready to emit on demand;
// 3) A "pending" block which stills needs more characters for us to be able
// to sort in canonical order and is not safe to emit.
buffer: TinyVec<[(u8, char); 4]>,
ready: Range<usize>,
}
#[inline]
pub fn new_canonical<I: Iterator<Item=char>>(iter: I) -> Decompositions<I> {
pub fn new_canonical<I: Iterator<Item = char>>(iter: I) -> Decompositions<I> {
Decompositions {
kind: self::DecompositionType::Canonical,
iter: iter,
done: false,
buffer: Vec::new(),
ready: 0,
iter: iter.fuse(),
buffer: TinyVec::new(),
ready: 0..0,
}
}
#[inline]
pub fn new_compatible<I: Iterator<Item=char>>(iter: I) -> Decompositions<I> {
pub fn new_compatible<I: Iterator<Item = char>>(iter: I) -> Decompositions<I> {
Decompositions {
kind: self::DecompositionType::Compatible,
iter: iter,
done: false,
buffer: Vec::new(),
ready: 0,
iter: iter.fuse(),
buffer: TinyVec::new(),
ready: 0..0,
}
}
@ -59,55 +60,89 @@ impl<I> Decompositions<I> {
#[inline]
fn push_back(&mut self, ch: char) {
let class = super::char::canonical_combining_class(ch);
if class == 0 {
self.sort_pending();
self.buffer.push((class, ch));
self.ready.end = self.buffer.len();
} else {
self.buffer.push((class, ch));
}
self.buffer.push((class, ch));
}
#[inline]
fn sort_pending(&mut self) {
if self.ready == 0 && self.buffer.is_empty() {
return;
}
// NB: `sort_by_key` is stable, so it will preserve the original text's
// order within a combining class.
self.buffer[self.ready..].sort_by_key(|k| k.0);
self.ready = self.buffer.len();
self.buffer[self.ready.end..].sort_by_key(|k| k.0);
}
#[inline]
fn pop_front(&mut self) -> Option<char> {
if self.ready == 0 {
None
fn reset_buffer(&mut self) {
// Equivalent to `self.buffer.drain(0..self.ready.end)`
// but faster than drain() if the buffer is a SmallVec or TinyVec
let pending = self.buffer.len() - self.ready.end;
for i in 0..pending {
self.buffer[i] = self.buffer[i + self.ready.end];
}
self.buffer.truncate(pending);
self.ready = 0..0;
}
#[inline]
fn increment_next_ready(&mut self) {
let next = self.ready.start + 1;
if next == self.ready.end {
self.reset_buffer();
} else {
self.ready -= 1;
Some(self.buffer.remove(0).1)
self.ready.start = next;
}
}
}
impl<I: Iterator<Item=char>> Iterator for Decompositions<I> {
impl<I: Iterator<Item = char>> Iterator for Decompositions<I> {
type Item = char;
#[inline]
fn next(&mut self) -> Option<char> {
while self.ready == 0 && !self.done {
while self.ready.end == 0 {
match (self.iter.next(), &self.kind) {
(Some(ch), &DecompositionType::Canonical) => {
super::char::decompose_canonical(ch, |d| self.push_back(d));
},
}
(Some(ch), &DecompositionType::Compatible) => {
super::char::decompose_compatible(ch, |d| self.push_back(d));
},
}
(None, _) => {
self.sort_pending();
self.done = true;
},
if self.buffer.is_empty() {
return None;
} else {
self.sort_pending();
self.ready.end = self.buffer.len();
// This implementation means that we can call `next`
// on an exhausted iterator; the last outer `next` call
// will result in an inner `next` call. To make this
// safe, we use `fuse`.
break;
}
}
}
}
self.pop_front()
// We can assume here that, if `self.ready.end` is greater than zero,
// it's also greater than `self.ready.start`. That's because we only
// increment `self.ready.start` inside `increment_next_ready`, and
// whenever it reaches equality with `self.ready.end`, we reset both
// to zero, maintaining the invariant that:
// self.ready.start < self.ready.end || self.ready.end == self.ready.start == 0
//
// This less-than-obviously-safe implementation is chosen for performance,
// minimizing the number & complexity of branches in `next` in the common
// case of buffering then unbuffering a single character with each call.
let (_, ch) = self.buffer[self.ready.start];
self.increment_next_ready();
Some(ch)
}
fn size_hint(&self) -> (usize, Option<usize>) {
@ -116,7 +151,7 @@ impl<I: Iterator<Item=char>> Iterator for Decompositions<I> {
}
}
impl<I: Iterator<Item=char> + Clone> fmt::Display for Decompositions<I> {
impl<I: Iterator<Item = char> + Clone> fmt::Display for Decompositions<I> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
for c in self.clone() {
f.write_char(c)?;


@ -34,81 +34,103 @@
//!
//! ```toml
//! [dependencies]
//! unicode-normalization = "0.1.7"
//! unicode-normalization = "0.1.19"
//! ```
#![deny(missing_docs, unsafe_code)]
#![doc(html_logo_url = "https://unicode-rs.github.io/unicode-rs_sm.png",
html_favicon_url = "https://unicode-rs.github.io/unicode-rs_sm.png")]
#![doc(
html_logo_url = "https://unicode-rs.github.io/unicode-rs_sm.png",
html_favicon_url = "https://unicode-rs.github.io/unicode-rs_sm.png"
)]
#![cfg_attr(not(feature = "std"), no_std)]
pub use tables::UNICODE_VERSION;
pub use decompose::Decompositions;
pub use quick_check::{
#[cfg(not(feature = "std"))]
extern crate alloc;
#[cfg(feature = "std")]
extern crate core;
extern crate tinyvec;
pub use crate::decompose::Decompositions;
pub use crate::quick_check::{
is_nfc, is_nfc_quick, is_nfc_stream_safe, is_nfc_stream_safe_quick, is_nfd, is_nfd_quick,
is_nfd_stream_safe, is_nfd_stream_safe_quick, is_nfkc, is_nfkc_quick, is_nfkd, is_nfkd_quick,
IsNormalized,
is_nfc,
is_nfc_quick,
is_nfc_stream_safe,
is_nfc_stream_safe_quick,
is_nfd,
is_nfd_quick,
is_nfd_stream_safe,
is_nfd_stream_safe_quick,
};
pub use recompose::Recompositions;
pub use stream_safe::StreamSafe;
use std::str::Chars;
pub use crate::recompose::Recompositions;
pub use crate::replace::Replacements;
pub use crate::stream_safe::StreamSafe;
pub use crate::tables::UNICODE_VERSION;
use core::str::Chars;
mod no_std_prelude;
mod decompose;
mod lookups;
mod normalize;
mod recompose;
mod perfect_hash;
mod quick_check;
mod recompose;
mod replace;
mod stream_safe;
#[rustfmt::skip]
mod tables;
#[doc(hidden)]
pub mod __test_api;
#[cfg(test)]
mod test;
#[cfg(test)]
mod normalization_tests;
/// Methods for composing and decomposing characters.
pub mod char {
pub use normalize::{decompose_canonical, decompose_compatible, compose};
pub use crate::normalize::{
compose, decompose_canonical, decompose_cjk_compat_variants, decompose_compatible,
};
/// Look up the canonical combining class of a character.
pub use tables::canonical_combining_class;
pub use crate::lookups::{canonical_combining_class, is_combining_mark};
/// Return whether the given character is a combining mark (`General_Category=Mark`)
pub use tables::is_combining_mark;
/// Return whether the given character is assigned (`General_Category` != `Unassigned`)
/// and not Private-Use (`General_Category` != `Private_Use`), in the supported version
/// of Unicode.
pub use crate::tables::is_public_assigned;
}
/// Methods for iterating over strings while applying Unicode normalizations
/// as described in
/// [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/).
pub trait UnicodeNormalization<I: Iterator<Item=char>> {
pub trait UnicodeNormalization<I: Iterator<Item = char>> {
/// Returns an iterator over the string in Unicode Normalization Form D
/// (canonical decomposition).
#[inline]
fn nfd(self) -> Decompositions<I>;
/// Returns an iterator over the string in Unicode Normalization Form KD
/// (compatibility decomposition).
#[inline]
fn nfkd(self) -> Decompositions<I>;
/// An Iterator over the string in Unicode Normalization Form C
/// (canonical decomposition followed by canonical composition).
#[inline]
fn nfc(self) -> Recompositions<I>;
/// An Iterator over the string in Unicode Normalization Form KC
/// (compatibility decomposition followed by canonical composition).
#[inline]
fn nfkc(self) -> Recompositions<I>;
/// A transformation which replaces CJK Compatibility Ideograph codepoints
/// with normal forms using Standardized Variation Sequences. This is not
/// part of the canonical or compatibility decomposition algorithms, but
/// performing it before those algorithms produces normalized output which
/// better preserves the intent of the original text.
///
/// Note that many systems today ignore variation selectors, so these
/// may not immediately help text display as intended, but they at
/// least preserve the information in a standardized form, giving
/// implementations the option to recognize them.
fn cjk_compat_variants(self) -> Replacements<I>;
/// An Iterator over the string with Conjoining Grapheme Joiner characters
/// inserted according to the Stream-Safe Text Process (UAX15-D4)
#[inline]
fn stream_safe(self) -> StreamSafe<I>;
}
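A hedged usage sketch of `cjk_compat_variants` (my own example, not from this diff; it assumes `Replacements` is a `char` iterator like the other adaptors): run it before composing so CJK Compatibility Ideographs survive as a base character plus variation selector rather than being collapsed by normalization.

```rust
use unicode_normalization::UnicodeNormalization;

fn normalize_preserving_cjk_variants(s: &str) -> String {
    // Rewrite compatibility ideographs to standardized variation sequences
    // first, then apply the usual canonical composition.
    s.chars().cjk_compat_variants().nfc().collect()
}
```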
@ -133,13 +155,18 @@ impl<'a> UnicodeNormalization<Chars<'a>> for &'a str {
recompose::new_compatible(self.chars())
}
#[inline]
fn cjk_compat_variants(self) -> Replacements<Chars<'a>> {
replace::new_cjk_compat_variants(self.chars())
}
#[inline]
fn stream_safe(self) -> StreamSafe<Chars<'a>> {
StreamSafe::new(self.chars())
}
}
impl<I: Iterator<Item=char>> UnicodeNormalization<I> for I {
impl<I: Iterator<Item = char>> UnicodeNormalization<I> for I {
#[inline]
fn nfd(self) -> Decompositions<I> {
decompose::new_canonical(self)
@ -160,6 +187,11 @@ impl<I: Iterator<Item=char>> UnicodeNormalization<I> for I {
recompose::new_compatible(self)
}
#[inline]
fn cjk_compat_variants(self) -> Replacements<I> {
replace::new_cjk_compat_variants(self)
}
#[inline]
fn stream_safe(self) -> StreamSafe<I> {
StreamSafe::new(self)

third_party/rust/unicode-normalization/src/lookups.rs (vendored, new file)

@ -0,0 +1,135 @@
// Copyright 2019 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
//! Lookups of unicode properties using minimal perfect hashing.
use crate::perfect_hash::mph_lookup;
use crate::tables::*;
/// Look up the canonical combining class for a codepoint.
///
/// The value returned is as defined in the Unicode Character Database.
pub fn canonical_combining_class(c: char) -> u8 {
mph_lookup(
c.into(),
CANONICAL_COMBINING_CLASS_SALT,
CANONICAL_COMBINING_CLASS_KV,
u8_lookup_fk,
u8_lookup_fv,
0,
)
}
pub(crate) fn composition_table(c1: char, c2: char) -> Option<char> {
if c1 < '\u{10000}' && c2 < '\u{10000}' {
mph_lookup(
(c1 as u32) << 16 | (c2 as u32),
COMPOSITION_TABLE_SALT,
COMPOSITION_TABLE_KV,
pair_lookup_fk,
pair_lookup_fv_opt,
None,
)
} else {
composition_table_astral(c1, c2)
}
}
pub(crate) fn canonical_fully_decomposed(c: char) -> Option<&'static [char]> {
mph_lookup(
c.into(),
CANONICAL_DECOMPOSED_SALT,
CANONICAL_DECOMPOSED_KV,
pair_lookup_fk,
pair_lookup_fv_opt,
None,
)
}
pub(crate) fn compatibility_fully_decomposed(c: char) -> Option<&'static [char]> {
mph_lookup(
c.into(),
COMPATIBILITY_DECOMPOSED_SALT,
COMPATIBILITY_DECOMPOSED_KV,
pair_lookup_fk,
pair_lookup_fv_opt,
None,
)
}
pub(crate) fn cjk_compat_variants_fully_decomposed(c: char) -> Option<&'static [char]> {
mph_lookup(
c.into(),
CJK_COMPAT_VARIANTS_DECOMPOSED_SALT,
CJK_COMPAT_VARIANTS_DECOMPOSED_KV,
pair_lookup_fk,
pair_lookup_fv_opt,
None,
)
}
/// Return whether the given character is a combining mark (`General_Category=Mark`)
pub fn is_combining_mark(c: char) -> bool {
mph_lookup(
c.into(),
COMBINING_MARK_SALT,
COMBINING_MARK_KV,
bool_lookup_fk,
bool_lookup_fv,
false,
)
}
pub fn stream_safe_trailing_nonstarters(c: char) -> usize {
mph_lookup(
c.into(),
TRAILING_NONSTARTERS_SALT,
TRAILING_NONSTARTERS_KV,
u8_lookup_fk,
u8_lookup_fv,
0,
) as usize
}
/// Extract the key in a 24 bit key and 8 bit value packed in a u32.
#[inline]
fn u8_lookup_fk(kv: u32) -> u32 {
kv >> 8
}
/// Extract the value in a 24 bit key and 8 bit value packed in a u32.
#[inline]
fn u8_lookup_fv(kv: u32) -> u8 {
(kv & 0xff) as u8
}
/// Extract the key for a boolean lookup.
#[inline]
fn bool_lookup_fk(kv: u32) -> u32 {
kv
}
/// Extract the value for a boolean lookup.
#[inline]
fn bool_lookup_fv(_kv: u32) -> bool {
true
}
/// Extract the key in a pair.
#[inline]
fn pair_lookup_fk<T>(kv: (u32, T)) -> u32 {
kv.0
}
/// Extract the value in a pair, returning an option.
#[inline]
fn pair_lookup_fv_opt<T>(kv: (u32, T)) -> Option<T> {
Some(kv.1)
}

third_party/rust/unicode-normalization/src/no_std_prelude.rs (vendored, new file)

@ -0,0 +1,6 @@
#[cfg(not(feature = "std"))]
pub use alloc::{
str::Chars,
string::{String, ToString},
vec::Vec,
};

The diff for this file is not shown because of its size.


@ -9,16 +9,22 @@
// except according to those terms.
//! Functions for computing canonical and compatible decompositions for Unicode characters.
use std::char;
use std::ops::FnMut;
use tables;
use crate::lookups::{
canonical_fully_decomposed, cjk_compat_variants_fully_decomposed,
compatibility_fully_decomposed, composition_table,
};
use core::{char, ops::FnMut};
/// Compute canonical Unicode decomposition for character.
/// See [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/)
/// for more information.
#[inline]
pub fn decompose_canonical<F>(c: char, emit_char: F) where F: FnMut(char) {
decompose(c, tables::canonical_fully_decomposed, emit_char)
pub fn decompose_canonical<F>(c: char, emit_char: F)
where
F: FnMut(char),
{
decompose(c, canonical_fully_decomposed, emit_char)
}
/// Compute canonical or compatible Unicode decomposition for character.
@ -26,14 +32,49 @@ pub fn decompose_canonical<F>(c: char, emit_char: F) where F: FnMut(char) {
/// for more information.
#[inline]
pub fn decompose_compatible<F: FnMut(char)>(c: char, emit_char: F) {
let decompose_char = |c| tables::compatibility_fully_decomposed(c)
.or_else(|| tables::canonical_fully_decomposed(c));
let decompose_char =
|c| compatibility_fully_decomposed(c).or_else(|| canonical_fully_decomposed(c));
decompose(c, decompose_char, emit_char)
}
/// Compute standard-variation decomposition for character.
///
/// [Standardized Variation Sequences] are used instead of the standard canonical
/// decompositions, notably for CJK codepoints with singleton canonical decompositions,
/// to avoid losing information. See the
/// [Unicode Variation Sequence FAQ](http://unicode.org/faq/vs.html) and the
/// "Other Enhancements" section of the
/// [Unicode 6.3 Release Summary](https://www.unicode.org/versions/Unicode6.3.0/#Summary)
/// for more information.
#[inline]
pub fn decompose_cjk_compat_variants<F>(c: char, mut emit_char: F)
where
F: FnMut(char),
{
// 7-bit ASCII never decomposes
if c <= '\x7f' {
emit_char(c);
return;
}
// Don't perform decomposition for Hangul
if let Some(decomposed) = cjk_compat_variants_fully_decomposed(c) {
for &d in decomposed {
emit_char(d);
}
return;
}
// Finally bottom out.
emit_char(c);
}
#[inline]
fn decompose<D, F>(c: char, decompose_char: D, mut emit_char: F)
where D: Fn(char) -> Option<&'static [char]>, F: FnMut(char)
where
D: Fn(char) -> Option<&'static [char]>,
F: FnMut(char),
{
// 7-bit ASCII never decomposes
if c <= '\x7f' {
@ -62,7 +103,7 @@ fn decompose<D, F>(c: char, decompose_char: D, mut emit_char: F)
/// See [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/)
/// for more information.
pub fn compose(a: char, b: char) -> Option<char> {
compose_hangul(a, b).or_else(|| tables::composition_table(a, b))
compose_hangul(a, b).or_else(|| composition_table(a, b))
}
// Constants from Unicode 9.0.0 Section 3.12 Conjoining Jamo Behavior
@ -74,8 +115,8 @@ const T_BASE: u32 = 0x11A7;
const L_COUNT: u32 = 19;
const V_COUNT: u32 = 21;
const T_COUNT: u32 = 28;
const N_COUNT: u32 = (V_COUNT * T_COUNT);
const S_COUNT: u32 = (L_COUNT * N_COUNT);
const N_COUNT: u32 = V_COUNT * T_COUNT;
const S_COUNT: u32 = L_COUNT * N_COUNT;
const S_LAST: u32 = S_BASE + S_COUNT - 1;
const L_LAST: u32 = L_BASE + L_COUNT - 1;
@ -93,7 +134,10 @@ pub(crate) fn is_hangul_syllable(c: char) -> bool {
// Decompose a precomposed Hangul syllable
#[allow(unsafe_code)]
#[inline(always)]
fn decompose_hangul<F>(s: char, mut emit_char: F) where F: FnMut(char) {
fn decompose_hangul<F>(s: char, mut emit_char: F)
where
F: FnMut(char),
{
let s_index = s as u32 - S_BASE;
let l_index = s_index / N_COUNT;
unsafe {
@ -113,27 +157,32 @@ fn decompose_hangul<F>(s: char, mut emit_char: F) where F: FnMut(char) {
pub(crate) fn hangul_decomposition_length(s: char) -> usize {
let si = s as u32 - S_BASE;
let ti = si % T_COUNT;
if ti > 0 { 3 } else { 2 }
if ti > 0 {
3
} else {
2
}
}
// Compose a pair of Hangul Jamo
#[allow(unsafe_code)]
#[inline(always)]
#[allow(ellipsis_inclusive_range_patterns)]
fn compose_hangul(a: char, b: char) -> Option<char> {
let (a, b) = (a as u32, b as u32);
match (a, b) {
// Compose a leading consonant and a vowel together into an LV_Syllable
(L_BASE ... L_LAST, V_BASE ... V_LAST) => {
(L_BASE...L_LAST, V_BASE...V_LAST) => {
let l_index = a - L_BASE;
let v_index = b - V_BASE;
let lv_index = l_index * N_COUNT + v_index * T_COUNT;
let s = S_BASE + lv_index;
Some(unsafe {char::from_u32_unchecked(s)})
},
Some(unsafe { char::from_u32_unchecked(s) })
}
// Compose an LV_Syllable and a trailing consonant into an LVT_Syllable
(S_BASE ... S_LAST, T_FIRST ... T_LAST) if (a - S_BASE) % T_COUNT == 0 => {
Some(unsafe {char::from_u32_unchecked(a + (b - T_BASE))})
},
(S_BASE...S_LAST, T_FIRST...T_LAST) if (a - S_BASE) % T_COUNT == 0 => {
Some(unsafe { char::from_u32_unchecked(a + (b - T_BASE)) })
}
_ => None,
}
}

third_party/rust/unicode-normalization/src/perfect_hash.rs (vendored, new file)

@ -0,0 +1,50 @@
// Copyright 2019 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
//! Support for lookups based on minimal perfect hashing.
// This function is based on multiplication being fast and is "good enough". Also
// it can share some work between the unsalted and salted versions.
#[inline]
fn my_hash(key: u32, salt: u32, n: usize) -> usize {
let y = key.wrapping_add(salt).wrapping_mul(2654435769);
let y = y ^ key.wrapping_mul(0x31415926);
(((y as u64) * (n as u64)) >> 32) as usize
}
/// Do a lookup using minimal perfect hashing.
///
/// The table is stored as a sequence of "salt" values, then a sequence of
/// values that contain packed key/value pairs. The strategy is to hash twice.
/// The first hash retrieves a salt value that makes the second hash unique.
/// The hash function doesn't have to be very good, just good enough that the
/// resulting map is unique.
#[inline]
pub(crate) fn mph_lookup<KV, V, FK, FV>(
x: u32,
salt: &[u16],
kv: &[KV],
fk: FK,
fv: FV,
default: V,
) -> V
where
KV: Copy,
FK: Fn(KV) -> u32,
FV: Fn(KV) -> V,
{
let s = salt[my_hash(x, 0, salt.len())] as u32;
let key_val = kv[my_hash(x, s, salt.len())];
if x == fk(key_val) {
fv(key_val)
} else {
default
}
}
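mph_lookup is crate-private, so the sketch below copies my_hash and mph_lookup from above and drives them with a hypothetical one-entry table (a single-slot table is trivially a minimal perfect hash, since both hashes can only return index 0). The (key << 8) | value packing and the fk/fv splitters mirror how the generated tables appear to pack small values, but they are assumptions here, not the crate's actual tables:

fn my_hash(key: u32, salt: u32, n: usize) -> usize {
    let y = key.wrapping_add(salt).wrapping_mul(2654435769);
    let y = y ^ key.wrapping_mul(0x31415926);
    (((y as u64) * (n as u64)) >> 32) as usize
}

fn mph_lookup<KV: Copy, V>(
    x: u32,
    salt: &[u16],
    kv: &[KV],
    fk: impl Fn(KV) -> u32,
    fv: impl Fn(KV) -> V,
    default: V,
) -> V {
    let s = salt[my_hash(x, 0, salt.len())] as u32;
    let key_val = kv[my_hash(x, s, salt.len())];
    if x == fk(key_val) {
        fv(key_val)
    } else {
        default
    }
}

fn main() {
    // One packed entry: U+0301 COMBINING ACUTE ACCENT with combining class 230.
    let salt = [0u16];
    let kv = [(0x0301u32 << 8) | 230];
    let ccc = |c: u32| mph_lookup(c, &salt, &kv, |kv| kv >> 8, |kv| (kv & 0xff) as u8, 0);
    assert_eq!(ccc(0x0301), 230); // key matches, value is unpacked
    assert_eq!(ccc(0x0041), 0); // 'A' is absent, so the default is returned
}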

third_party/rust/unicode-normalization/src/quick_check.rs

@ -1,6 +1,7 @@
use UnicodeNormalization;
use stream_safe;
use tables;
use crate::lookups::canonical_combining_class;
use crate::stream_safe;
use crate::tables;
use crate::UnicodeNormalization;
/// The QuickCheck algorithm can quickly determine if a text is or isn't
/// normalized without any allocations in many cases, but it has to be able to
@ -18,7 +19,9 @@ pub enum IsNormalized {
// https://unicode.org/reports/tr15/#Detecting_Normalization_Forms
#[inline]
fn quick_check<F, I>(s: I, is_allowed: F, stream_safe: bool) -> IsNormalized
where I: Iterator<Item=char>, F: Fn(char) -> IsNormalized
where
I: Iterator<Item = char>,
F: Fn(char) -> IsNormalized,
{
let mut last_cc = 0u8;
let mut nonstarter_count = 0;
@ -32,7 +35,7 @@ fn quick_check<F, I>(s: I, is_allowed: F, stream_safe: bool) -> IsNormalized
}
// Otherwise, lookup the combining class and QC property
let cc = tables::canonical_combining_class(ch);
let cc = canonical_combining_class(ch);
if last_cc > cc && cc != 0 {
return IsNormalized::No;
}
@ -41,7 +44,7 @@ fn quick_check<F, I>(s: I, is_allowed: F, stream_safe: bool) -> IsNormalized
IsNormalized::No => return IsNormalized::No,
IsNormalized::Maybe => {
result = IsNormalized::Maybe;
},
}
}
if stream_safe {
let decomp = stream_safe::classify_nonstarters(ch);
@ -66,25 +69,37 @@ fn quick_check<F, I>(s: I, is_allowed: F, stream_safe: bool) -> IsNormalized
/// `IsNormalized::Maybe` if further checks are necessary. In this case a check
/// like `s.chars().nfc().eq(s.chars())` should suffice.
#[inline]
pub fn is_nfc_quick<I: Iterator<Item=char>>(s: I) -> IsNormalized {
pub fn is_nfc_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized {
quick_check(s, tables::qc_nfc, false)
}
/// Quickly check if a string is in NFKC.
#[inline]
pub fn is_nfkc_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized {
quick_check(s, tables::qc_nfkc, false)
}
/// Quickly check if a string is in NFD.
#[inline]
pub fn is_nfd_quick<I: Iterator<Item=char>>(s: I) -> IsNormalized {
pub fn is_nfd_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized {
quick_check(s, tables::qc_nfd, false)
}
/// Quickly check if a string is in NFKD.
#[inline]
pub fn is_nfkd_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized {
quick_check(s, tables::qc_nfkd, false)
}
/// Quickly check if a string is Stream-Safe NFC.
#[inline]
pub fn is_nfc_stream_safe_quick<I: Iterator<Item=char>>(s: I) -> IsNormalized {
pub fn is_nfc_stream_safe_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized {
quick_check(s, tables::qc_nfc, true)
}
/// Quickly check if a string is Stream-Safe NFD.
#[inline]
pub fn is_nfd_stream_safe_quick<I: Iterator<Item=char>>(s: I) -> IsNormalized {
pub fn is_nfd_stream_safe_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized {
quick_check(s, tables::qc_nfd, true)
}
@ -98,6 +113,16 @@ pub fn is_nfc(s: &str) -> bool {
}
}
/// Authoritatively check if a string is in NFKC.
#[inline]
pub fn is_nfkc(s: &str) -> bool {
match is_nfkc_quick(s.chars()) {
IsNormalized::Yes => true,
IsNormalized::No => false,
IsNormalized::Maybe => s.chars().eq(s.chars().nfkc()),
}
}
/// Authoritatively check if a string is in NFD.
#[inline]
pub fn is_nfd(s: &str) -> bool {
@ -108,6 +133,16 @@ pub fn is_nfd(s: &str) -> bool {
}
}
/// Authoritatively check if a string is in NFKD.
#[inline]
pub fn is_nfkd(s: &str) -> bool {
match is_nfkd_quick(s.chars()) {
IsNormalized::Yes => true,
IsNormalized::No => false,
IsNormalized::Maybe => s.chars().eq(s.chars().nfkd()),
}
}
/// Authoritatively check if a string is Stream-Safe NFC.
#[inline]
pub fn is_nfc_stream_safe(s: &str) -> bool {
@ -130,11 +165,7 @@ pub fn is_nfd_stream_safe(s: &str) -> bool {
#[cfg(test)]
mod tests {
use super::{
IsNormalized,
is_nfc_stream_safe_quick,
is_nfd_stream_safe_quick,
};
use super::{is_nfc_stream_safe_quick, is_nfd_stream_safe_quick, IsNormalized};
#[test]
fn test_stream_safe_nfd() {
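A sketch of how a caller combines the quick check with the authoritative fallback suggested above, assuming is_nfc_quick, IsNormalized and the UnicodeNormalization trait are re-exported at the crate root as in upstream unicode-normalization:

use unicode_normalization::{is_nfc_quick, IsNormalized, UnicodeNormalization};

fn is_nfc_checked(s: &str) -> bool {
    match is_nfc_quick(s.chars()) {
        IsNormalized::Yes => true,
        IsNormalized::No => false,
        // Maybe: fall back to the full comparison against the NFC form.
        IsNormalized::Maybe => s.chars().eq(s.chars().nfc()),
    }
}

fn main() {
    assert!(is_nfc_checked("abc"));
    // 'A' + COMBINING RING ABOVE is not NFC; it composes to U+00C5 'Å'.
    assert!(!is_nfc_checked("A\u{30a}"));
}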

third_party/rust/unicode-normalization/src/recompose.rs

@ -8,15 +8,15 @@
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use std::collections::VecDeque;
use std::fmt::{self, Write};
use decompose::Decompositions;
use crate::decompose::Decompositions;
use core::fmt::{self, Write};
use tinyvec::TinyVec;
#[derive(Clone)]
enum RecompositionState {
Composing,
Purging,
Finished
Purging(usize),
Finished(usize),
}
/// External iterator for a string recomposition's characters.
@ -24,34 +24,34 @@ enum RecompositionState {
pub struct Recompositions<I> {
iter: Decompositions<I>,
state: RecompositionState,
buffer: VecDeque<char>,
buffer: TinyVec<[char; 4]>,
composee: Option<char>,
last_ccc: Option<u8>
last_ccc: Option<u8>,
}
#[inline]
pub fn new_canonical<I: Iterator<Item=char>>(iter: I) -> Recompositions<I> {
pub fn new_canonical<I: Iterator<Item = char>>(iter: I) -> Recompositions<I> {
Recompositions {
iter: super::decompose::new_canonical(iter),
state: self::RecompositionState::Composing,
buffer: VecDeque::new(),
buffer: TinyVec::new(),
composee: None,
last_ccc: None,
}
}
#[inline]
pub fn new_compatible<I: Iterator<Item=char>>(iter: I) -> Recompositions<I> {
pub fn new_compatible<I: Iterator<Item = char>>(iter: I) -> Recompositions<I> {
Recompositions {
iter: super::decompose::new_compatible(iter),
state: self::RecompositionState::Composing,
buffer: VecDeque::new(),
buffer: TinyVec::new(),
composee: None,
last_ccc: None,
}
}
impl<I: Iterator<Item=char>> Iterator for Recompositions<I> {
impl<I: Iterator<Item = char>> Iterator for Recompositions<I> {
type Item = char;
#[inline]
@ -70,36 +70,34 @@ impl<I: Iterator<Item=char>> Iterator for Recompositions<I> {
}
self.composee = Some(ch);
continue;
},
}
Some(k) => k,
};
match self.last_ccc {
None => {
match super::char::compose(k, ch) {
Some(r) => {
self.composee = Some(r);
continue;
}
None => {
if ch_class == 0 {
self.composee = Some(ch);
return Some(k);
}
self.buffer.push_back(ch);
self.last_ccc = Some(ch_class);
}
None => match super::char::compose(k, ch) {
Some(r) => {
self.composee = Some(r);
continue;
}
}
None => {
if ch_class == 0 {
self.composee = Some(ch);
return Some(k);
}
self.buffer.push(ch);
self.last_ccc = Some(ch_class);
}
},
Some(l_class) => {
if l_class >= ch_class {
// `ch` is blocked from `composee`
if ch_class == 0 {
self.composee = Some(ch);
self.last_ccc = None;
self.state = Purging;
self.state = Purging(0);
return Some(k);
}
self.buffer.push_back(ch);
self.buffer.push(ch);
self.last_ccc = Some(ch_class);
continue;
}
@ -109,36 +107,44 @@ impl<I: Iterator<Item=char>> Iterator for Recompositions<I> {
continue;
}
None => {
self.buffer.push_back(ch);
self.buffer.push(ch);
self.last_ccc = Some(ch_class);
}
}
}
}
}
self.state = Finished;
self.state = Finished(0);
if self.composee.is_some() {
return self.composee.take();
}
}
Purging => {
match self.buffer.pop_front() {
None => self.state = Composing,
s => return s
Purging(next) => match self.buffer.get(next).cloned() {
None => {
self.buffer.clear();
self.state = Composing;
}
}
Finished => {
match self.buffer.pop_front() {
None => return self.composee.take(),
s => return s
s => {
self.state = Purging(next + 1);
return s;
}
}
},
Finished(next) => match self.buffer.get(next).cloned() {
None => {
self.buffer.clear();
return self.composee.take();
}
s => {
self.state = Finished(next + 1);
return s;
}
},
}
}
}
}
impl<I: Iterator<Item=char> + Clone> fmt::Display for Recompositions<I> {
impl<I: Iterator<Item = char> + Clone> fmt::Display for Recompositions<I> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
for c in self.clone() {
f.write_char(c)?;
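The recomposition state machine above is what backs the nfc()/nfkc() adapters. A small sketch of the observable behavior, assuming the UnicodeNormalization trait from upstream unicode-normalization is in scope:

use unicode_normalization::UnicodeNormalization;

fn main() {
    // 'e' + COMBINING ACUTE ACCENT recomposes to the precomposed 'é'.
    assert_eq!("e\u{301}".nfc().collect::<String>(), "\u{e9}");
    // A second acute cannot compose with 'é', so it is buffered and
    // emitted after the composee.
    assert_eq!("e\u{301}\u{301}".nfc().collect::<String>(), "\u{e9}\u{301}");
}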

third_party/rust/unicode-normalization/src/replace.rs (vendored, new file, 61 lines)

@ -0,0 +1,61 @@
// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use core::fmt::{self, Write};
use tinyvec::ArrayVec;
/// External iterator for replacements for a string's characters.
#[derive(Clone)]
pub struct Replacements<I> {
iter: I,
// At this time, the longest replacement sequence has length 2, so we just
// need buffer space for 1 codepoint.
buffer: Option<char>,
}
#[inline]
pub fn new_cjk_compat_variants<I: Iterator<Item = char>>(iter: I) -> Replacements<I> {
Replacements { iter, buffer: None }
}
impl<I: Iterator<Item = char>> Iterator for Replacements<I> {
type Item = char;
#[inline]
fn next(&mut self) -> Option<char> {
if let Some(c) = self.buffer.take() {
return Some(c);
}
match self.iter.next() {
Some(ch) => {
// At this time, the longest replacement sequence has length 2.
let mut buffer = ArrayVec::<[char; 2]>::new();
super::char::decompose_cjk_compat_variants(ch, |d| buffer.push(d));
self.buffer = buffer.get(1).copied();
Some(buffer[0])
}
None => None,
}
}
fn size_hint(&self) -> (usize, Option<usize>) {
let (lower, _) = self.iter.size_hint();
(lower, None)
}
}
impl<I: Iterator<Item = char> + Clone> fmt::Display for Replacements<I> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
for c in self.clone() {
f.write_char(c)?;
}
Ok(())
}
}

third_party/rust/unicode-normalization/src/stream_safe.rs

@ -1,8 +1,9 @@
use normalize::{
hangul_decomposition_length,
is_hangul_syllable,
use crate::lookups::{
canonical_combining_class, canonical_fully_decomposed, compatibility_fully_decomposed,
stream_safe_trailing_nonstarters,
};
use tables;
use crate::normalize::{hangul_decomposition_length, is_hangul_syllable};
use crate::tables::stream_safe_leading_nonstarters;
pub(crate) const MAX_NONSTARTERS: usize = 30;
const COMBINING_GRAPHEME_JOINER: char = '\u{034F}';
@ -18,34 +19,39 @@ pub struct StreamSafe<I> {
impl<I> StreamSafe<I> {
pub(crate) fn new(iter: I) -> Self {
Self { iter, nonstarter_count: 0, buffer: None }
Self {
iter,
nonstarter_count: 0,
buffer: None,
}
}
}
impl<I: Iterator<Item=char>> Iterator for StreamSafe<I> {
impl<I: Iterator<Item = char>> Iterator for StreamSafe<I> {
type Item = char;
#[inline]
fn next(&mut self) -> Option<char> {
if let Some(ch) = self.buffer.take() {
return Some(ch);
}
let next_ch = match self.iter.next() {
let next_ch = match self.buffer.take().or_else(|| self.iter.next()) {
None => return None,
Some(c) => c,
};
let d = classify_nonstarters(next_ch);
if self.nonstarter_count + d.leading_nonstarters > MAX_NONSTARTERS {
self.buffer = Some(next_ch);
// Since we're emitting a CGJ, the suffix of the emitted string in NFKD has no trailing
// nonstarters, so we can reset the counter to zero. Put `next_ch` back into the
// iterator (via `self.buffer`), and we'll reclassify it next iteration.
self.nonstarter_count = 0;
self.buffer = Some(next_ch);
return Some(COMBINING_GRAPHEME_JOINER);
}
// No starters in the decomposition, so keep accumulating
// Is the character all nonstarters in NFKD? If so, increment our counter of contiguous
// nonstarters in NFKD.
if d.leading_nonstarters == d.decomposition_len {
self.nonstarter_count += d.decomposition_len;
}
// Otherwise, restart the nonstarter counter.
// Otherwise, reset the counter to the decomposition's number of trailing nonstarters.
else {
self.nonstarter_count = d.trailing_nonstarters;
}
@ -68,7 +74,7 @@ pub(crate) fn classify_nonstarters(c: char) -> Decomposition {
leading_nonstarters: 0,
trailing_nonstarters: 0,
decomposition_len: 1,
}
};
}
// Next, special case Hangul, since it's not handled by our tables.
if is_hangul_syllable(c) {
@ -78,18 +84,15 @@ pub(crate) fn classify_nonstarters(c: char) -> Decomposition {
decomposition_len: hangul_decomposition_length(c),
};
}
let decomp = tables::compatibility_fully_decomposed(c)
.or_else(|| tables::canonical_fully_decomposed(c));
let decomp = compatibility_fully_decomposed(c).or_else(|| canonical_fully_decomposed(c));
match decomp {
Some(decomp) => {
Decomposition {
leading_nonstarters: tables::stream_safe_leading_nonstarters(c),
trailing_nonstarters: tables::stream_safe_trailing_nonstarters(c),
decomposition_len: decomp.len(),
}
Some(decomp) => Decomposition {
leading_nonstarters: stream_safe_leading_nonstarters(c),
trailing_nonstarters: stream_safe_trailing_nonstarters(c),
decomposition_len: decomp.len(),
},
None => {
let is_nonstarter = tables::canonical_combining_class(c) != 0;
let is_nonstarter = canonical_combining_class(c) != 0;
let nonstarter = if is_nonstarter { 1 } else { 0 };
Decomposition {
leading_nonstarters: nonstarter,
@ -102,35 +105,38 @@ pub(crate) fn classify_nonstarters(c: char) -> Decomposition {
#[cfg(test)]
mod tests {
use super::{
StreamSafe,
classify_nonstarters,
};
use std::char;
use normalization_tests::NORMALIZATION_TESTS;
use normalize::decompose_compatible;
use tables;
use super::{classify_nonstarters, StreamSafe};
use crate::lookups::canonical_combining_class;
use crate::normalize::decompose_compatible;
#[cfg(not(feature = "std"))]
use crate::no_std_prelude::*;
use core::char;
fn stream_safe(s: &str) -> String {
StreamSafe::new(s.chars()).collect()
}
#[test]
fn test_normalization_tests_unaffected() {
for test in NORMALIZATION_TESTS {
for &s in &[test.source, test.nfc, test.nfd, test.nfkc, test.nfkd] {
assert_eq!(stream_safe(s), s);
}
}
}
#[test]
fn test_simple() {
let technically_okay = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}ngerzone";
assert_eq!(stream_safe(technically_okay), technically_okay);
let too_much = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{032e}ngerzone";
assert_ne!(stream_safe(too_much), too_much);
let fixed_it = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{034f}\u{032e}ngerzone";
assert_eq!(stream_safe(too_much), fixed_it);
let woah_nelly = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{032e}\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{032e}ngerzone";
let its_cool = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{034f}\u{032e}\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{034f}\u{031d}\u{032e}ngerzone";
assert_eq!(stream_safe(woah_nelly), its_cool);
}
#[test]
fn test_all_nonstarters() {
let s = "\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}";
let expected = "\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{034F}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}";
assert_eq!(stream_safe(s), expected);
}
#[test]
@ -142,19 +148,19 @@ mod tests {
None => continue,
};
let c = classify_nonstarters(ch);
let mut s = vec![];
let mut s = Vec::new();
decompose_compatible(ch, |c| s.push(c));
assert_eq!(s.len(), c.decomposition_len);
let num_leading = s
.iter()
.take_while(|&c| tables::canonical_combining_class(*c) != 0)
.take_while(|&c| canonical_combining_class(*c) != 0)
.count();
let num_trailing = s
.iter()
.rev()
.take_while(|&c| tables::canonical_combining_class(*c) != 0)
.take_while(|&c| canonical_combining_class(*c) != 0)
.count();
assert_eq!(num_leading, c.leading_nonstarters);
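A sketch of the stream-safe transformation from the caller's side, assuming the stream_safe() adapter on the UnicodeNormalization trait as in upstream unicode-normalization:

use unicode_normalization::UnicodeNormalization;

fn main() {
    // 31 consecutive nonstarters (COMBINING GRAVE ACCENT, combining class 230).
    let input: String = std::iter::repeat('\u{300}').take(31).collect();
    let output: String = input.chars().stream_safe().collect();
    // Once 30 nonstarters have accumulated, a CGJ (U+034F) is inserted,
    // so the 31st mark ends up after it.
    assert_eq!(output.chars().count(), 32);
    assert_eq!(output.chars().nth(30), Some('\u{034F}'));
}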

The diff for this file is not shown because of its large size.

third_party/rust/unicode-normalization/src/test.rs

@ -8,11 +8,12 @@
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use std::char;
use super::UnicodeNormalization;
use super::char::is_combining_mark;
use super::UnicodeNormalization;
use core::char;
#[cfg(not(feature = "std"))]
use crate::no_std_prelude::*;
#[test]
fn test_nfd() {
@ -21,8 +22,11 @@ fn test_nfd() {
assert_eq!($input.nfd().to_string(), $expected);
// A dummy iterator that is not std::str::Chars directly;
// note that `id_func` is used to ensure `Clone` implementation
assert_eq!($input.chars().map(|c| c).nfd().collect::<String>(), $expected);
}
assert_eq!(
$input.chars().map(|c| c).nfd().collect::<String>(),
$expected
);
};
}
t!("abc", "abc");
t!("\u{1e0b}\u{1c4}", "d\u{307}\u{1c4}");
@ -41,7 +45,7 @@ fn test_nfkd() {
macro_rules! t {
($input: expr, $expected: expr) => {
assert_eq!($input.nfkd().to_string(), $expected);
}
};
}
t!("abc", "abc");
t!("\u{1e0b}\u{1c4}", "d\u{307}DZ\u{30c}");
@ -60,7 +64,7 @@ fn test_nfc() {
macro_rules! t {
($input: expr, $expected: expr) => {
assert_eq!($input.nfc().to_string(), $expected);
}
};
}
t!("abc", "abc");
t!("\u{1e0b}\u{1c4}", "\u{1e0b}\u{1c4}");
@ -72,7 +76,10 @@ fn test_nfc() {
t!("\u{301}a", "\u{301}a");
t!("\u{d4db}", "\u{d4db}");
t!("\u{ac1c}", "\u{ac1c}");
t!("a\u{300}\u{305}\u{315}\u{5ae}b", "\u{e0}\u{5ae}\u{305}\u{315}b");
t!(
"a\u{300}\u{305}\u{315}\u{5ae}b",
"\u{e0}\u{5ae}\u{305}\u{315}b"
);
}
#[test]
@ -80,7 +87,7 @@ fn test_nfkc() {
macro_rules! t {
($input: expr, $expected: expr) => {
assert_eq!($input.nfkc().to_string(), $expected);
}
};
}
t!("abc", "abc");
t!("\u{1e0b}\u{1c4}", "\u{1e0b}D\u{17d}");
@ -92,85 +99,10 @@ fn test_nfkc() {
t!("\u{301}a", "\u{301}a");
t!("\u{d4db}", "\u{d4db}");
t!("\u{ac1c}", "\u{ac1c}");
t!("a\u{300}\u{305}\u{315}\u{5ae}b", "\u{e0}\u{5ae}\u{305}\u{315}b");
}
#[test]
fn test_official() {
use normalization_tests::NORMALIZATION_TESTS;
macro_rules! normString {
($method: ident, $input: expr) => { $input.$method().collect::<String>() }
}
for test in NORMALIZATION_TESTS {
// these invariants come from the CONFORMANCE section of
// http://www.unicode.org/Public/UNIDATA/NormalizationTest.txt
{
let r1 = normString!(nfc, test.source);
let r2 = normString!(nfc, test.nfc);
let r3 = normString!(nfc, test.nfd);
let r4 = normString!(nfc, test.nfkc);
let r5 = normString!(nfc, test.nfkd);
assert_eq!(test.nfc, &r1[..]);
assert_eq!(test.nfc, &r2[..]);
assert_eq!(test.nfc, &r3[..]);
assert_eq!(test.nfkc, &r4[..]);
assert_eq!(test.nfkc, &r5[..]);
}
{
let r1 = normString!(nfd, test.source);
let r2 = normString!(nfd, test.nfc);
let r3 = normString!(nfd, test.nfd);
let r4 = normString!(nfd, test.nfkc);
let r5 = normString!(nfd, test.nfkd);
assert_eq!(test.nfd, &r1[..]);
assert_eq!(test.nfd, &r2[..]);
assert_eq!(test.nfd, &r3[..]);
assert_eq!(test.nfkd, &r4[..]);
assert_eq!(test.nfkd, &r5[..]);
}
{
let r1 = normString!(nfkc, test.source);
let r2 = normString!(nfkc, test.nfc);
let r3 = normString!(nfkc, test.nfd);
let r4 = normString!(nfkc, test.nfkc);
let r5 = normString!(nfkc, test.nfkd);
assert_eq!(test.nfkc, &r1[..]);
assert_eq!(test.nfkc, &r2[..]);
assert_eq!(test.nfkc, &r3[..]);
assert_eq!(test.nfkc, &r4[..]);
assert_eq!(test.nfkc, &r5[..]);
}
{
let r1 = normString!(nfkd, test.source);
let r2 = normString!(nfkd, test.nfc);
let r3 = normString!(nfkd, test.nfd);
let r4 = normString!(nfkd, test.nfkc);
let r5 = normString!(nfkd, test.nfkd);
assert_eq!(test.nfkd, &r1[..]);
assert_eq!(test.nfkd, &r2[..]);
assert_eq!(test.nfkd, &r3[..]);
assert_eq!(test.nfkd, &r4[..]);
assert_eq!(test.nfkd, &r5[..]);
}
}
}
#[test]
fn test_quick_check() {
use normalization_tests::NORMALIZATION_TESTS;
use quick_check;
for test in NORMALIZATION_TESTS {
assert!(quick_check::is_nfc(test.nfc));
assert!(quick_check::is_nfd(test.nfd));
if test.nfc != test.nfd {
assert!(!quick_check::is_nfc(test.nfd));
assert!(!quick_check::is_nfd(test.nfc));
}
}
t!(
"a\u{300}\u{305}\u{315}\u{5ae}b",
"\u{e0}\u{5ae}\u{305}\u{315}b"
);
}
#[test]