Bug 1889536 - Vendor idna 1.0.2 and icu_normalizer by updating the url crate. r=glandium,supply-chain-reviewers

Differential Revision: https://phabricator.services.mozilla.com/D206578
Henri Sivonen 2024-07-09 09:50:37 +00:00
Parent 1ac67d31a5
Commit ce58d7f51e
125 changed files with 18659 additions and 71100 deletions

Cargo.lock (generated)

@@ -2864,6 +2864,30 @@ version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fdc8ff3388f852bede6b579ad4e978ab004f139284d7b28715f773507b946f6e"

+[[package]]
+name = "icu_normalizer"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "19ce3e0da2ec68599d193c93d088142efd7f9c5d6fc9b803774855747dc6a84f"
+dependencies = [
+ "displaydoc",
+ "icu_collections",
+ "icu_normalizer_data",
+ "icu_properties",
+ "icu_provider",
+ "smallvec",
+ "utf16_iter",
+ "utf8_iter",
+ "write16",
+ "zerovec",
+]
+
+[[package]]
+name = "icu_normalizer_data"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f8cafbf7aa791e9b22bec55a167906f9e1215fd475cd22adfcf660e03e989516"
+
[[package]]
name = "icu_properties"
version = "1.5.0"
@@ -2961,12 +2985,14 @@ checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39"

[[package]]
name = "idna"
-version = "0.5.0"
+version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "634d9b1461af396cad843f47fdba5597a4f9e6ddd4bfb6ff5d85028c25cb12f6"
+checksum = "bd69211b9b519e98303c015e21a007e293db403b6c85b9b124e133d25e242cdd"
dependencies = [
- "unicode-bidi",
- "unicode-normalization",
+ "icu_normalizer",
+ "icu_properties",
+ "smallvec",
+ "utf8_iter",
]

[[package]]
@@ -5857,13 +5883,6 @@ dependencies = [
 "zerovec",
]

-[[package]]
-name = "tinyvec"
-version = "1.999.999"
-dependencies = [
- "smallvec",
-]
-
[[package]]
name = "to_shmem"
version = "0.0.1"
@@ -6115,15 +6134,6 @@ version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "84a22b9f218b40614adcb3f4ff08b703773ad44fa9423e4e0d346d5db86e4ebc"

-[[package]]
-name = "unicode-normalization"
-version = "0.1.22"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5c5713f0fc4b5db668a2ac63cdb7bb4469d8c9fed047b1d0292cc7b0ce2ba921"
-dependencies = [
- "tinyvec",
-]
-
[[package]]
name = "unicode-width"
version = "0.1.10"
@@ -6371,9 +6381,9 @@ checksum = "2ace0b4755d0a2959962769239d56267f8a024fef2d9b32666b3dcd0946b0906"

[[package]]
name = "url"
-version = "2.5.0"
+version = "2.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "31e6302e3bb753d46e83516cae55ae196fc0c309407cf11ab35cc51a4c2a4633"
+checksum = "f7c25da092f0a868cdf09e8674cd3b7ef3a7d92a24253e663a2fb85e2496de56"
dependencies = [
 "form_urlencoded",
 "idna",
@@ -6382,10 +6392,16 @@ dependencies = [
]

[[package]]
-name = "utf8_iter"
-version = "1.0.3"
+name = "utf16_iter"
+version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "64a8922555b9500e3d865caed19330172cd67cbf82203f1a3311d8c305cc9f33"
+checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246"
+
+[[package]]
+name = "utf8_iter"
+version = "1.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be"

[[package]]
name = "uuid"
@@ -6959,6 +6975,12 @@ dependencies = [
 "euclid",
]

+[[package]]
+name = "write16"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936"
+
[[package]]
name = "writeable"
version = "0.5.5"

Cargo.toml

@@ -136,9 +136,6 @@ redox_users = { path = "build/rust/redox_users" }
# Patch redox_syscall to an empty crate
redox_syscall = { path = "build/rust/redox_syscall" }
-# Override tinyvec with smallvec
-tinyvec = { path = "build/rust/tinyvec" }
# Patch base64 0.13 to 0.21
base64 = { path = "build/rust/base64" }

build/rust/tinyvec/Cargo.toml (deleted)

@@ -1,16 +0,0 @@
[package]
name = "tinyvec"
version = "1.999.999"
edition = "2018"
license = "MPL-2.0"
[lib]
path = "lib.rs"
[dependencies]
smallvec = "1"
[features]
alloc = []
default = []
std = ["alloc"]

build/rust/tinyvec/lib.rs (deleted)

@@ -1,6 +0,0 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
pub use smallvec::SmallVec as ArrayVec;
pub use smallvec::SmallVec as TinyVec;

supply-chain/audits.toml

@@ -2462,6 +2462,17 @@ who = "Makoto Kato <m_kato@ga2.so-net.ne.jp>"
criteria = "safe-to-deploy"
delta = "1.4.0 -> 1.5.0"

+[[audits.icu_normalizer]]
+who = "Henri Sivonen <hsivonen@hsivonen.fi>"
+criteria = "safe-to-deploy"
+version = "1.5.0"
+notes = "I, Henri Sivonen, am the principal author of this crate."
+
+[[audits.icu_normalizer_data]]
+who = "Henri Sivonen <hsivonen@hsivonen.fi>"
+criteria = "safe-to-deploy"
+version = "1.5.0"
+
[[audits.icu_properties]]
who = "Jonathan Kew <jkew@mozilla.com>"
criteria = "safe-to-deploy"
@@ -2575,6 +2586,12 @@ who = "Valentin Gosu <valentin.gosu@gmail.com>"
criteria = "safe-to-deploy"
delta = "0.4.0 -> 0.5.0"

+[[audits.idna]]
+who = "Henri Sivonen <hsivonen@hsivonen.fi>"
+criteria = "safe-to-deploy"
+delta = "0.5.0 -> 1.0.2"
+notes = "In the 0.5.0 to 1.0.2 delta, I, Henri Sivonen, rewrote the non-Punycode internals of the crate and made the changes to the Punycode code."
+
[[audits.indexmap]]
who = "Mike Hommey <mh+mozilla@glandium.org>"
criteria = "safe-to-deploy"
@@ -4758,6 +4775,17 @@ who = "Valentin Gosu <valentin.gosu@gmail.com>"
criteria = "safe-to-deploy"
delta = "2.4.1 -> 2.5.0"

+[[audits.url]]
+who = "Henri Sivonen <hsivonen@hsivonen.fi>"
+criteria = "safe-to-deploy"
+delta = "2.5.0 -> 2.5.1"
+
+[[audits.utf16_iter]]
+who = "Henri Sivonen <hsivonen@hsivonen.fi>"
+criteria = "safe-to-deploy"
+version = "1.0.5"
+notes = "I, Henri Sivonen, wrote this crate."
+
[[audits.uuid]]
who = "Gabriele Svelto <gsvelto@mozilla.com>"
criteria = "safe-to-deploy"
@@ -5149,6 +5177,12 @@ criteria = "safe-to-deploy"
version = "0.1.0"
notes = "Written and maintained by Gfx team at Mozilla."

+[[audits.write16]]
+who = "Henri Sivonen <hsivonen@hsivonen.fi>"
+criteria = "safe-to-deploy"
+version = "1.0.0"
+notes = "I, Henri Sivonen, wrote this (safe-code-only) crate."
+
[[audits.writeable]]
who = "Makoto Kato <m_kato@ga2.so-net.ne.jp>"
criteria = "safe-to-deploy"

supply-chain/config.toml

@@ -809,6 +809,13 @@ user-id = 4484
user-login = "hsivonen"
user-name = "Henri Sivonen"

+[[publisher.utf8_iter]]
+version = "1.0.4"
+when = "2023-12-01"
+user-id = 4484
+user-login = "hsivonen"
+user-name = "Henri Sivonen"
+
[[publisher.walkdir]]
version = "2.3.2"
when = "2021-03-22"

third_party/rust/icu_normalizer/.cargo-checksum.json (vendored, new file)

@@ -0,0 +1 @@
{"files":{"Cargo.toml":"83b30ee0282024b826e1ef2d28519c230f663d2d882e64017bbe692d62c58741","LICENSE":"f367c1b8e1aa262435251e442901da4607b4650e0e63a026f5044473ecfb90f2","README.md":"7c238039ae55d7dc74c6fe4d0db071db103c2740a1637943ded2a8c504c58b86","benches/bench.rs":"9cd781e3d0e8d772860cd332b4f403910f3ca52fd69a459f5ac95d28f0e25ac2","benches/canonical_composition.rs":"78c6a077a26efd61586386e4eb39b3fc5b1875c73fab26e86292bf2eeaa93709","benches/canonical_decomposition.rs":"c57ab476622ec5e42b65556fc76313b6755714e539847012074eaad79bc72794","benches/composing_normalizer_nfc.rs":"8c80e55ebbab2c93f4c01140de69eba94ab25777401fd68e69e45638268ffd23","benches/composing_normalizer_nfkc.rs":"64244b5e94adb859469311c4cfc72835aafd7c58cf0aee319aeee47220dd0c63","benches/data/README.md":"fa79b84815a228c3fbfa5d4c6d12885036994ca8ad61e683b2113cf2b428bb85","benches/data/TestNames_Japanese_h.txt":"6522f8ed794ad348c904079082ec3aa303ae7acf3f68bbc49fa0ee90eebf31e0","benches/data/TestNames_Japanese_k.txt":"e4e18804fe742ecd27ae48bc3564c6bc653180a3c649d43a2ab4d8b7f2607627","benches/data/TestNames_Korean.txt":"9cbf54d5ee16726c0fc9477366e273ba1b82e651c9e88e6c7532df5344f03920","benches/data/TestNames_Latin.txt":"3a30d450d259a6be4a6aee8eeef08d3767d11fcc047b8f58060c542efe1182d1","benches/data/TestNames_Thai.txt":"28d76ddb62d6f47646232860fce7440544f402158443889393fd7e8bf10e9c3d","benches/data/TestRandomWordsUDHR_ar.txt":"02a775153e9746ae938a9db0b60244f2c00d911bb72b611a3593b0991fd95723","benches/data/TestRandomWordsUDHR_de.txt":"100b9502e7ddcb2fcbd055cb7ec9113245105bd1c606cace5e5bc147cc18727b","benches/data/TestRandomWordsUDHR_el.txt":"d1a2f0f9efc9ce663026ca7c285177391937c90008479a8c5b909c300dc86972","benches/data/TestRandomWordsUDHR_es.txt":"deeebda09e0ce0f80dd805317e96d1a630908601ff2a4d1ccb0021b00b55814b","benches/data/TestRandomWordsUDHR_fr.txt":"5931edc9f1af2c27a0b35c9624732e70b87b0fd72ab486710f3aa6367c7ad35f","benches/data/TestRandomWordsUDHR_he.txt":"dc77a89ffb9803e5c574d87f4789cb17624df73e40a8a92961df8ea8be103425","benches/data/TestRandomWordsUDHR_pl.txt":"26c378295ee2ef75ccacea691df0456394184a9a5c9ce48b2bada169b2402bbb","benches/data/TestRandomWordsUDHR_ru.txt":"a1c339f6d7b69cf9154e855c290ab09eeaf167ebcdf6d4bcb917de039fba10ee","benches/data/TestRandomWordsUDHR_th.txt":"3ba518be9863c85c3ac80cbb12299e3594e6f5afed3406d910d948007adaaf4e","benches/data/TestRandomWordsUDHR_tr.txt":"815c7babbc7228ef89b56f29638aeb63013aeca0003a49e58994e26b41cba01c","benches/data/wotw.txt":"8f28e68041ce75bbf75e72e186a6145e4c2de9e7e62b9b86ce0621c527a23669","benches/decomposing_normalizer_nfd.rs":"9caf896987e509af1e37488592022a62e8960692909745c4d08a539e7f283146","benches/decomposing_normalizer_nfkd.rs":"ce1c64b789baa9b4c5fb6511a187014f913e99126f1c932a4a12dc9a29367508","src/error.rs":"c1d7089ec5b1d124e701dd41704a0b7c685a1d7f8ed7d9eed0faaf093d2485f2","src/lib.rs":"cf85240373c7bc9896444c599450bba7351339748b0349fccfdec6a65cce8c30","src/properties.rs":"e3314a9801cc64f64c79740faed8495cb828bfcf4b1e34c9f8251ea7ecebd4e5","src/provider.rs":"4fdca8144102c7775debead6b50e758bf9382743630bf14de8a9b12a79fc6fed","src/uts46.rs":"d2f3d36ea5cd365631cfbe83b855bfc533d14e17d50c1e1b33da4df1de25563e","tests/data/NormalizationTest.txt":"1b04c22b82064adf871e76fd2148cd749129163f7d05bd7ace923516a65afe02","tests/data/README.md":"521fcd44a1f10f21629df88113fa29ca9f4e1dfbeea79fda19a7dc8ba435e24b","tests/tests.rs":"ba36cb3e89d2ea5c0312ab7a8d46c8c36ea9f01d35f2842e4778c2dee30cba54"},"package":"19ce3e0da2ec68599d193c93d088142efd7f9c5d6fc9b803774855747dc6a84f"}

third_party/rust/icu_normalizer/Cargo.toml (vendored, new file)

@@ -0,0 +1,144 @@
# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
#
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
# to registry (e.g., crates.io) dependencies.
#
# If you are reading this file be aware that the original Cargo.toml
# will likely look very different (and much more reasonable).
# See Cargo.toml.orig for the original contents.
[package]
edition = "2021"
rust-version = "1.67"
name = "icu_normalizer"
version = "1.5.0"
authors = ["The ICU4X Project Developers"]
include = [
"data/**/*",
"src/**/*",
"examples/**/*",
"benches/**/*",
"tests/**/*",
"Cargo.toml",
"LICENSE",
"README.md",
]
description = "API for normalizing text into Unicode Normalization Forms"
homepage = "https://icu4x.unicode.org"
readme = "README.md"
categories = ["internationalization"]
license = "Unicode-3.0"
repository = "https://github.com/unicode-org/icu4x"
[package.metadata.docs.rs]
all-features = true
[[bench]]
name = "bench"
harness = false
[dependencies.databake]
version = "0.1.8"
features = ["derive"]
optional = true
default-features = false
[dependencies.displaydoc]
version = "0.2.3"
default-features = false
[dependencies.icu_collections]
version = "~1.5.0"
default-features = false
[dependencies.icu_normalizer_data]
version = "~1.5.0"
optional = true
default-features = false
[dependencies.icu_properties]
version = "~1.5.0"
default-features = false
[dependencies.icu_provider]
version = "~1.5.0"
features = ["macros"]
default-features = false
[dependencies.serde]
version = "1.0.110"
features = [
"derive",
"alloc",
]
optional = true
default-features = false
[dependencies.smallvec]
version = "1.10.0"
default-features = false
[dependencies.utf16_iter]
version = "1.0.2"
default-features = false
[dependencies.utf8_iter]
version = "1.0.2"
default-features = false
[dependencies.write16]
version = "1.0.0"
features = ["alloc"]
default-features = false
[dependencies.zerovec]
version = "0.10.2"
default-features = false
[dev-dependencies.arraystring]
version = "0.3.0"
[dev-dependencies.arrayvec]
version = "0.7.2"
[dev-dependencies.atoi]
version = "1.0.0"
[dev-dependencies.detone]
version = "1.0.0"
[dev-dependencies.write16]
version = "1.0.0"
features = ["arrayvec"]
default-features = false
[features]
compiled_data = [
"dep:icu_normalizer_data",
"icu_properties/compiled_data",
]
datagen = [
"serde",
"dep:databake",
"icu_collections/databake",
"zerovec/databake",
"icu_properties/datagen",
]
default = ["compiled_data"]
experimental = []
serde = [
"dep:serde",
"icu_collections/serde",
"zerovec/serde",
"icu_properties/serde",
]
std = [
"icu_collections/std",
"icu_properties/std",
"icu_provider/std",
]
[target."cfg(not(target_arch = \"wasm32\"))".dev-dependencies.criterion]
version = "0.5.0"

third_party/rust/icu_normalizer/LICENSE (vendored, new file)

@@ -0,0 +1,46 @@
UNICODE LICENSE V3
COPYRIGHT AND PERMISSION NOTICE
Copyright © 2020-2024 Unicode, Inc.
NOTICE TO USER: Carefully read the following legal agreement. BY
DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING DATA FILES, AND/OR
SOFTWARE, YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
TERMS AND CONDITIONS OF THIS AGREEMENT. IF YOU DO NOT AGREE, DO NOT
DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE THE DATA FILES OR SOFTWARE.
Permission is hereby granted, free of charge, to any person obtaining a
copy of data files and any associated documentation (the "Data Files") or
software and any associated documentation (the "Software") to deal in the
Data Files or Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, and/or sell
copies of the Data Files or Software, and to permit persons to whom the
Data Files or Software are furnished to do so, provided that either (a)
this copyright and permission notice appear with all copies of the Data
Files or Software, or (b) this copyright and permission notice appear in
associated Documentation.
THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
THIRD PARTY RIGHTS.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE
BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES,
OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE DATA
FILES OR SOFTWARE.
Except as contained in this notice, the name of a copyright holder shall
not be used in advertising or otherwise to promote the sale, use or other
dealings in these Data Files or Software without prior written
authorization of the copyright holder.
SPDX-License-Identifier: Unicode-3.0
Portions of ICU4X may have been adapted from ICU4C and/or ICU4J.
ICU 1.8.1 to ICU 57.1 © 1995-2016 International Business Machines Corporation and others.

third_party/rust/icu_normalizer/README.md (vendored, new file)

@@ -0,0 +1,56 @@
# icu_normalizer [![crates.io](https://img.shields.io/crates/v/icu_normalizer)](https://crates.io/crates/icu_normalizer)

<!-- cargo-rdme start -->

Normalizing text into Unicode Normalization Forms.

This module is published as its own crate ([`icu_normalizer`](https://docs.rs/icu_normalizer/latest/icu_normalizer/))
and as part of the [`icu`](https://docs.rs/icu/latest/icu/) crate. See the latter for more details on the ICU4X project.

## Implementation notes

The normalizer operates on a lazy iterator over Unicode scalar values (Rust `char`) internally,
and iterating over guaranteed-valid UTF-8, potentially-invalid UTF-8, and potentially-invalid
UTF-16 is a step that doesn’t leak into the normalizer internals. Ill-formed byte sequences are
treated as U+FFFD.

The normalizer data layout is not based on the ICU4C design at all. Instead, the normalization
data layout is a clean-slate design optimized for the concept of fusing the NFD decomposition
into the collator. That is, the decomposing normalizer is a by-product of the collator-motivated
data layout.

Notably, the decomposition data structure is optimized for a starter decomposing to itself,
which is the most common case, and for a starter decomposing to a starter and a non-starter
on the Basic Multilingual Plane. Notably, in this case, the collator makes use of the
knowledge that the second character of such a decomposition is a non-starter. Therefore,
decomposition into two starters is handled by a generic fallback path that looks the
decomposition up from an array by offset and length instead of baking a BMP starter pair directly
into a trie value.

The decompositions into non-starters are hard-coded. At present in Unicode, these appear
to be special cases falling into three categories:

1. Deprecated combining marks.
2. Particular Tibetan vowel signs.
3. NFKD only: half-width kana voicing marks.

Hopefully Unicode never adds more decompositions into non-starters (other than a character
decomposing to itself), but if it does, a code update is needed instead of a mere data update.

The composing normalizer builds on the decomposing normalizer by performing the canonical
composition post-processing per spec. As an optimization, though, the composing normalizer
attempts to pass through already-normalized text consisting of starters that never combine
backwards and that map to themselves if followed by a character whose decomposition starts
with a starter that never combines backwards.

As a difference with ICU4C, the composing normalizer has only the simplest possible
passthrough (only one inversion list lookup per character in the best case) and the full
decompose-then-canonically-compose behavior, whereas ICU4C has other paths between these
extremes. The ICU4X collator doesn't make use of the FCD concept at all in order to avoid
doing the work of checking whether the FCD condition holds.

<!-- cargo-rdme end -->

## More Information

For more information on development, authorship, contributing etc. please visit [`ICU4X home page`](https://github.com/unicode-org/icu4x).
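
As a quick orientation to the API that the vendored benches below exercise, here is a minimal usage sketch (a sketch only, assuming icu_normalizer 1.5 with its default `compiled_data` feature; the sample strings are ours, not from the crate):

```rust
// Sketch: NFC composition and NFD decomposition round-trip, using the same
// constructors the benches call (ComposingNormalizer / DecomposingNormalizer).
use icu_normalizer::{ComposingNormalizer, DecomposingNormalizer};

fn main() {
    let nfc = ComposingNormalizer::new_nfc();
    let nfd = DecomposingNormalizer::new_nfd();

    // U+0065 U+0301 ("e" + combining acute) composes to U+00E9 under NFC...
    assert_eq!(nfc.normalize("e\u{301}"), "\u{e9}");
    // ...and U+00E9 decomposes back to the pair under NFD.
    assert_eq!(nfd.normalize("\u{e9}"), "e\u{301}");

    // Potentially-invalid UTF-16 input is handled directly,
    // as in the *_utf16 benches below.
    let composed: Vec<u16> = nfc.normalize_utf16(&[0x0065, 0x0301]);
    assert_eq!(composed, [0x00E9]);
}
```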

third_party/rust/icu_normalizer/benches/bench.rs (vendored, new file)

@@ -0,0 +1,24 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use criterion::{criterion_group, criterion_main};
mod canonical_composition;
mod canonical_decomposition;
mod composing_normalizer_nfc;
mod composing_normalizer_nfkc;
mod decomposing_normalizer_nfd;
mod decomposing_normalizer_nfkd;
criterion_group!(
benches,
canonical_composition::criterion_benchmark,
canonical_decomposition::criterion_benchmark,
composing_normalizer_nfc::criterion_benchmark,
composing_normalizer_nfkc::criterion_benchmark,
decomposing_normalizer_nfd::criterion_benchmark,
decomposing_normalizer_nfkd::criterion_benchmark,
);
criterion_main!(benches);

third_party/rust/icu_normalizer/benches/canonical_composition.rs (vendored, new file)

@@ -0,0 +1,186 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use criterion::{black_box, BenchmarkId, Criterion};
use detone::IterDecomposeVietnamese;
use icu_normalizer::properties::{CanonicalComposition, CanonicalDecomposition, Decomposed};
use icu_normalizer::ComposingNormalizer;
struct BenchDataContent {
pub file_name: String,
pub pairs: Vec<(char, char)>,
}
fn strip_headers(content: &str) -> String {
content
.lines()
.filter(|&s| !s.starts_with('#'))
.map(|s| s.to_owned())
.collect::<Vec<String>>()
.join("\n")
}
fn normalizer_bench_data() -> [BenchDataContent; 16] {
let nfc_normalizer: ComposingNormalizer = ComposingNormalizer::new_nfc();
return [
BenchDataContent {
file_name: "TestNames_Latin".to_owned(),
pairs: decompose_data(
&nfc_normalizer
.normalize(&strip_headers(include_str!("./data/TestNames_Latin.txt"))),
),
},
BenchDataContent {
file_name: "TestNames_Japanese_h".to_owned(),
pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!(
"./data/TestNames_Japanese_h.txt"
)))),
},
BenchDataContent {
file_name: "TestNames_Japanese_k".to_owned(),
pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!(
"./data/TestNames_Japanese_k.txt"
)))),
},
BenchDataContent {
file_name: "TestNames_Korean".to_owned(),
pairs: decompose_data(
&nfc_normalizer
.normalize(&strip_headers(include_str!("./data/TestNames_Korean.txt"))),
),
},
BenchDataContent {
file_name: "TestRandomWordsUDHR_ar".to_owned(),
#[cfg(debug_assertions)]
pairs: Vec::new(),
#[cfg(not(debug_assertions))]
pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!(
"./data/TestRandomWordsUDHR_ar.txt"
)))),
},
BenchDataContent {
file_name: "TestRandomWordsUDHR_de".to_owned(),
pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!(
"./data/TestRandomWordsUDHR_de.txt"
)))),
},
BenchDataContent {
file_name: "TestRandomWordsUDHR_el".to_owned(),
pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!(
"./data/TestRandomWordsUDHR_el.txt"
)))),
},
BenchDataContent {
file_name: "TestRandomWordsUDHR_es".to_owned(),
pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!(
"./data/TestRandomWordsUDHR_es.txt"
)))),
},
BenchDataContent {
file_name: "TestRandomWordsUDHR_fr".to_owned(),
pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!(
"./data/TestRandomWordsUDHR_fr.txt"
)))),
},
BenchDataContent {
file_name: "TestRandomWordsUDHR_he".to_owned(),
pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!(
"./data/TestRandomWordsUDHR_he.txt"
)))),
},
BenchDataContent {
file_name: "TestRandomWordsUDHR_pl".to_owned(),
pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!(
"./data/TestRandomWordsUDHR_pl.txt"
)))),
},
BenchDataContent {
file_name: "TestRandomWordsUDHR_ru".to_owned(),
pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!(
"./data/TestRandomWordsUDHR_ru.txt"
)))),
},
BenchDataContent {
file_name: "TestRandomWordsUDHR_th".to_owned(),
#[cfg(debug_assertions)]
pairs: Vec::new(),
#[cfg(not(debug_assertions))]
pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!(
"./data/TestRandomWordsUDHR_th.txt"
)))),
},
BenchDataContent {
file_name: "TestRandomWordsUDHR_tr".to_owned(),
pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!(
"./data/TestRandomWordsUDHR_tr.txt"
)))),
},
BenchDataContent {
file_name: "udhr_vie".to_owned(),
pairs: decompose_data(
&nfc_normalizer.normalize(&strip_headers(include_str!("data/wotw.txt"))),
),
},
BenchDataContent {
file_name: "udhr_vie_detone".to_owned(),
pairs: {
let result: Vec<(char, char)> = nfc_normalizer
.normalize(&strip_headers(include_str!("data/wotw.txt")))
.chars()
.filter_map(|c| {
let mut iter = std::iter::once(c).decompose_vietnamese_tones(true);
if let Some(base) = iter.next() {
iter.next().map(|tone| (base, tone))
} else {
None
}
})
.collect();
assert!(!result.is_empty());
result
},
},
];
}
fn function_under_bench(
canonical_composer: &CanonicalComposition,
composable_points: &[(char, char)],
) {
for pair in composable_points.iter() {
canonical_composer.compose(pair.0, pair.1);
}
}
pub fn criterion_benchmark(criterion: &mut Criterion) {
let group_name = "canonical_composition";
let mut group = criterion.benchmark_group(group_name);
let composer = CanonicalComposition::new();
for bench_data_content in black_box(normalizer_bench_data()) {
group.bench_function(
BenchmarkId::from_parameter(format!("from_nfc_{}", bench_data_content.file_name)),
|bencher| bencher.iter(|| function_under_bench(&composer, &bench_data_content.pairs)),
);
}
group.finish();
}
fn decompose_data(nfc: &str) -> Vec<(char, char)> {
let decomposer = CanonicalDecomposition::new();
nfc.chars()
.map(|c| decomposer.decompose(c))
.filter_map(|decomposed| {
if let Decomposed::Expansion(a, b) = decomposed {
Some((a, b))
} else {
None
}
})
.collect()
}

third_party/rust/icu_normalizer/benches/canonical_decomposition.rs (vendored, new file)

@@ -0,0 +1,159 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use criterion::{black_box, BenchmarkId, Criterion};
use icu_normalizer::properties::CanonicalDecomposition;
use icu_normalizer::{ComposingNormalizer, DecomposingNormalizer};
struct BenchDataContent {
pub file_name: String,
pub nfc: String,
pub nfd: String,
pub nfkc: String,
pub nfkd: String,
}
fn strip_headers(content: &str) -> String {
content
.lines()
.filter(|&s| !s.starts_with('#'))
.map(|s| s.to_owned())
.collect::<Vec<String>>()
.join("\n")
}
fn normalizer_bench_data() -> [BenchDataContent; 15] {
let nfc_normalizer: ComposingNormalizer = ComposingNormalizer::new_nfc();
let nfd_normalizer: DecomposingNormalizer = DecomposingNormalizer::new_nfd();
let nfkc_normalizer: ComposingNormalizer = ComposingNormalizer::new_nfkc();
let nfkd_normalizer: DecomposingNormalizer = DecomposingNormalizer::new_nfkd();
let content_latin: (&str, &str) = (
"TestNames_Latin",
&strip_headers(include_str!("./data/TestNames_Latin.txt")),
);
let content_jp_h: (&str, &str) = (
"TestNames_Japanese_h",
&strip_headers(include_str!("./data/TestNames_Japanese_h.txt")),
);
let content_jp_k: (&str, &str) = (
"TestNames_Japanese_k",
&strip_headers(include_str!("./data/TestNames_Japanese_k.txt")),
);
let content_korean: (&str, &str) = (
"TestNames_Korean",
&strip_headers(include_str!("./data/TestNames_Korean.txt")),
);
let content_random_words_ar: (&str, &str) = (
"TestRandomWordsUDHR_ar",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_ar.txt")),
);
let content_random_words_de: (&str, &str) = (
"TestRandomWordsUDHR_de",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_de.txt")),
);
let content_random_words_el: (&str, &str) = (
"TestRandomWordsUDHR_el",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_el.txt")),
);
let content_random_words_es: (&str, &str) = (
"TestRandomWordsUDHR_es",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_es.txt")),
);
let content_random_words_fr: (&str, &str) = (
"TestRandomWordsUDHR_fr",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_fr.txt")),
);
let content_random_words_he: (&str, &str) = (
"TestRandomWordsUDHR_he",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_he.txt")),
);
let content_random_words_pl: (&str, &str) = (
"TestRandomWordsUDHR_pl",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_pl.txt")),
);
let content_random_words_ru: (&str, &str) = (
"TestRandomWordsUDHR_ru",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_ru.txt")),
);
let content_random_words_th: (&str, &str) = (
"TestRandomWordsUDHR_th",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_th.txt")),
);
let content_random_words_tr: (&str, &str) = (
"TestRandomWordsUDHR_tr",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_tr.txt")),
);
let content_viet: (&str, &str) = ("udhr_vie", &strip_headers(include_str!("data/wotw.txt")));
[
content_latin,
content_viet,
content_jp_k,
content_jp_h,
content_korean,
content_random_words_ru,
content_random_words_ar,
content_random_words_el,
content_random_words_es,
content_random_words_fr,
content_random_words_tr,
content_random_words_th,
content_random_words_pl,
content_random_words_he,
content_random_words_de,
]
.map(|(file_name, raw_content)| BenchDataContent {
file_name: file_name.to_owned(),
nfc: nfc_normalizer.normalize(raw_content),
nfd: nfd_normalizer.normalize(raw_content),
nfkc: nfkc_normalizer.normalize(raw_content),
nfkd: nfkd_normalizer.normalize(raw_content),
})
}
#[cfg(debug_assertions)]
fn function_under_bench(
_canonical_decomposer: &CanonicalDecomposition,
_decomposable_points: &str,
) {
// The real body fails some tests under debug assertions, so this stub keeps
// "cargo test --bench bench" passing; "cargo bench" works as expected,
// because the bench profile doesn't include debug assertions.
}
#[cfg(not(debug_assertions))]
fn function_under_bench(canonical_decomposer: &CanonicalDecomposition, decomposable_points: &str) {
decomposable_points.chars().for_each(|point| {
canonical_decomposer.decompose(point);
});
}
pub fn criterion_benchmark(criterion: &mut Criterion) {
let group_name = "canonical_decomposition";
let mut group = criterion.benchmark_group(group_name);
let decomposer = CanonicalDecomposition::new();
for bench_data_content in black_box(normalizer_bench_data()) {
group.bench_function(
BenchmarkId::from_parameter(format!("from_nfc_{}", bench_data_content.file_name)),
|bencher| bencher.iter(|| function_under_bench(&decomposer, &bench_data_content.nfc)),
);
group.bench_function(
BenchmarkId::from_parameter(format!("from_nfd_{}", bench_data_content.file_name)),
|bencher| bencher.iter(|| function_under_bench(&decomposer, &bench_data_content.nfd)),
);
group.bench_function(
BenchmarkId::from_parameter(format!("from_nfkc_{}", bench_data_content.file_name)),
|bencher| bencher.iter(|| function_under_bench(&decomposer, &bench_data_content.nfkc)),
);
group.bench_function(
BenchmarkId::from_parameter(format!("from_nfkd_{}", bench_data_content.file_name)),
|bencher| bencher.iter(|| function_under_bench(&decomposer, &bench_data_content.nfkd)),
);
}
group.finish();
}

third_party/rust/icu_normalizer/benches/composing_normalizer_nfc.rs (vendored, new file)

@@ -0,0 +1,230 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use criterion::{black_box, BenchmarkId, Criterion};
use icu_normalizer::{ComposingNormalizer, DecomposingNormalizer};
struct BenchDataContent {
pub file_name: String,
pub nfc: String,
pub nfd: String,
pub nfkc: String,
pub nfkd: String,
pub nfc_u16: Vec<u16>,
pub nfd_u16: Vec<u16>,
pub nfkc_u16: Vec<u16>,
pub nfkd_u16: Vec<u16>,
}
fn strip_headers(content: &str) -> String {
content
.lines()
.filter(|&s| !s.starts_with('#'))
.map(|s| s.to_owned())
.collect::<Vec<String>>()
.join("\n")
}
fn normalizer_bench_data() -> [BenchDataContent; 15] {
let nfc_normalizer: ComposingNormalizer = ComposingNormalizer::new_nfc();
let nfd_normalizer: DecomposingNormalizer = DecomposingNormalizer::new_nfd();
let nfkc_normalizer: ComposingNormalizer = ComposingNormalizer::new_nfkc();
let nfkd_normalizer: DecomposingNormalizer = DecomposingNormalizer::new_nfkd();
let content_latin: (&str, &str) = (
"TestNames_Latin",
&strip_headers(include_str!("./data/TestNames_Latin.txt")),
);
let content_jp_h: (&str, &str) = (
"TestNames_Japanese_h",
&strip_headers(include_str!("./data/TestNames_Japanese_h.txt")),
);
let content_jp_k: (&str, &str) = (
"TestNames_Japanese_k",
&strip_headers(include_str!("./data/TestNames_Japanese_k.txt")),
);
let content_korean: (&str, &str) = (
"TestNames_Korean",
&strip_headers(include_str!("./data/TestNames_Korean.txt")),
);
let content_random_words_ar: (&str, &str) = (
"TestRandomWordsUDHR_ar",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_ar.txt")),
);
let content_random_words_de: (&str, &str) = (
"TestRandomWordsUDHR_de",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_de.txt")),
);
let content_random_words_el: (&str, &str) = (
"TestRandomWordsUDHR_el",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_el.txt")),
);
let content_random_words_es: (&str, &str) = (
"TestRandomWordsUDHR_es",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_es.txt")),
);
let content_random_words_fr: (&str, &str) = (
"TestRandomWordsUDHR_fr",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_fr.txt")),
);
let content_random_words_he: (&str, &str) = (
"TestRandomWordsUDHR_he",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_he.txt")),
);
let content_random_words_pl: (&str, &str) = (
"TestRandomWordsUDHR_pl",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_pl.txt")),
);
let content_random_words_ru: (&str, &str) = (
"TestRandomWordsUDHR_ru",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_ru.txt")),
);
let content_random_words_th: (&str, &str) = (
"TestRandomWordsUDHR_th",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_th.txt")),
);
let content_random_words_tr: (&str, &str) = (
"TestRandomWordsUDHR_tr",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_tr.txt")),
);
let content_viet: (&str, &str) = ("wotw", &strip_headers(include_str!("./data/wotw.txt")));
[
content_latin,
content_viet,
content_jp_k,
content_jp_h,
content_korean,
content_random_words_ru,
content_random_words_ar,
content_random_words_el,
content_random_words_es,
content_random_words_fr,
content_random_words_tr,
content_random_words_th,
content_random_words_pl,
content_random_words_he,
content_random_words_de,
]
.map(|(file_name, raw_content)| {
let nfc = &nfc_normalizer.normalize(raw_content);
let nfd = &nfd_normalizer.normalize(raw_content);
let nfkc = &nfkc_normalizer.normalize(raw_content);
let nfkd = &nfkd_normalizer.normalize(raw_content);
BenchDataContent {
file_name: file_name.to_owned(),
nfc: nfc.to_owned(),
nfd: nfd.to_owned(),
nfkc: nfkc.to_owned(),
nfkd: nfkd.to_owned(),
nfc_u16: nfc.encode_utf16().collect(),
nfd_u16: nfd.encode_utf16().collect(),
nfkc_u16: nfkc.encode_utf16().collect(),
nfkd_u16: nfkd.encode_utf16().collect(),
}
})
}
fn function_under_bench(normalizer: &ComposingNormalizer, text: &str) {
normalizer.normalize(text);
}
fn function_under_bench_utf16(normalizer: &ComposingNormalizer, text: &[u16]) {
normalizer.normalize_utf16(text);
}
pub fn criterion_benchmark(criterion: &mut Criterion) {
let group_name = "composing_normalizer_nfc";
let normalizer_under_bench: ComposingNormalizer = ComposingNormalizer::new_nfc();
let mut group = criterion.benchmark_group(group_name);
for bench_data_content in black_box(normalizer_bench_data()) {
group.bench_function(
BenchmarkId::from_parameter(format!("from_nfc_{}", bench_data_content.file_name)),
|bencher| {
bencher
.iter(|| function_under_bench(&normalizer_under_bench, &bench_data_content.nfc))
},
);
group.bench_function(
BenchmarkId::from_parameter(format!("from_nfd_{}", bench_data_content.file_name)),
|bencher| {
bencher
.iter(|| function_under_bench(&normalizer_under_bench, &bench_data_content.nfd))
},
);
group.bench_function(
BenchmarkId::from_parameter(format!("from_nfkc_{}", bench_data_content.file_name)),
|bencher| {
bencher.iter(|| {
function_under_bench(&normalizer_under_bench, &bench_data_content.nfkc)
})
},
);
group.bench_function(
BenchmarkId::from_parameter(format!("from_nfkd_{}", bench_data_content.file_name)),
|bencher| {
bencher.iter(|| {
function_under_bench(&normalizer_under_bench, &bench_data_content.nfkd)
})
},
);
// UTF_16
group.bench_function(
BenchmarkId::from_parameter(format!(
"from_nfc_{}_utf_16",
bench_data_content.file_name
)),
|bencher| {
bencher.iter(|| {
function_under_bench_utf16(&normalizer_under_bench, &bench_data_content.nfc_u16)
})
},
);
group.bench_function(
BenchmarkId::from_parameter(format!(
"from_nfd_{}_utf_16",
bench_data_content.file_name
)),
|bencher| {
bencher.iter(|| {
function_under_bench_utf16(&normalizer_under_bench, &bench_data_content.nfd_u16)
})
},
);
group.bench_function(
BenchmarkId::from_parameter(format!(
"from_nfkc_{}_utf_16",
bench_data_content.file_name
)),
|bencher| {
bencher.iter(|| {
function_under_bench_utf16(
&normalizer_under_bench,
&bench_data_content.nfkc_u16,
)
})
},
);
group.bench_function(
BenchmarkId::from_parameter(format!(
"from_nfkd_{}_utf_16",
bench_data_content.file_name
)),
|bencher| {
bencher.iter(|| {
function_under_bench_utf16(
&normalizer_under_bench,
&bench_data_content.nfkd_u16,
)
})
},
);
}
group.finish();
}

third_party/rust/icu_normalizer/benches/composing_normalizer_nfkc.rs (vendored, new file)

@@ -0,0 +1,211 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use criterion::{black_box, BenchmarkId, Criterion};
use icu_normalizer::{ComposingNormalizer, DecomposingNormalizer};
struct BenchDataContent {
pub file_name: String,
pub nfc: String,
pub nfd: String,
pub nfkc: String,
pub nfkd: String,
pub nfc_u16: Vec<u16>,
pub nfd_u16: Vec<u16>,
pub nfkc_u16: Vec<u16>,
pub nfkd_u16: Vec<u16>,
}
fn strip_headers(content: &str) -> String {
content
.lines()
.filter(|&s| !s.starts_with('#'))
.map(|s| s.to_owned())
.collect::<Vec<String>>()
.join("\n")
}
fn normalizer_bench_data() -> [BenchDataContent; 15] {
let nfc_normalizer: ComposingNormalizer = ComposingNormalizer::new_nfc();
let nfd_normalizer: DecomposingNormalizer = DecomposingNormalizer::new_nfd();
let nfkc_normalizer: ComposingNormalizer = ComposingNormalizer::new_nfkc();
let nfkd_normalizer: DecomposingNormalizer = DecomposingNormalizer::new_nfkd();
let content_latin: (&str, &str) = (
"TestNames_Latin",
&strip_headers(include_str!("./data/TestNames_Latin.txt")),
);
let content_jp_h: (&str, &str) = (
"TestNames_Japanese_h",
&strip_headers(include_str!("./data/TestNames_Japanese_h.txt")),
);
let content_jp_k: (&str, &str) = (
"TestNames_Japanese_k",
&strip_headers(include_str!("./data/TestNames_Japanese_k.txt")),
);
let content_korean: (&str, &str) = (
"TestNames_Korean",
&strip_headers(include_str!("./data/TestNames_Korean.txt")),
);
let content_random_words_ar: (&str, &str) = (
"TestRandomWordsUDHR_ar",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_ar.txt")),
);
let content_random_words_de: (&str, &str) = (
"TestRandomWordsUDHR_de",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_de.txt")),
);
let content_random_words_el: (&str, &str) = (
"TestRandomWordsUDHR_el",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_el.txt")),
);
let content_random_words_es: (&str, &str) = (
"TestRandomWordsUDHR_es",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_es.txt")),
);
let content_random_words_fr: (&str, &str) = (
"TestRandomWordsUDHR_fr",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_fr.txt")),
);
let content_random_words_he: (&str, &str) = (
"TestRandomWordsUDHR_he",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_he.txt")),
);
let content_random_words_pl: (&str, &str) = (
"TestRandomWordsUDHR_pl",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_pl.txt")),
);
let content_random_words_ru: (&str, &str) = (
"TestRandomWordsUDHR_ru",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_ru.txt")),
);
let content_random_words_th: (&str, &str) = (
"TestRandomWordsUDHR_th",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_th.txt")),
);
let content_random_words_tr: (&str, &str) = (
"TestRandomWordsUDHR_tr",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_tr.txt")),
);
let content_viet: (&str, &str) = ("wotw", &strip_headers(include_str!("./data/wotw.txt")));
[
content_latin,
content_viet,
content_jp_k,
content_jp_h,
content_korean,
content_random_words_ru,
content_random_words_ar,
content_random_words_el,
content_random_words_es,
content_random_words_fr,
content_random_words_tr,
content_random_words_th,
content_random_words_pl,
content_random_words_he,
content_random_words_de,
]
.map(|(file_name, raw_content)| {
let nfc = &nfc_normalizer.normalize(raw_content);
let nfd = &nfd_normalizer.normalize(raw_content);
let nfkc = &nfkc_normalizer.normalize(raw_content);
let nfkd = &nfkd_normalizer.normalize(raw_content);
BenchDataContent {
file_name: file_name.to_owned(),
nfc: nfc.to_owned(),
nfd: nfd.to_owned(),
nfkc: nfkc.to_owned(),
nfkd: nfkd.to_owned(),
nfc_u16: nfc.encode_utf16().collect(),
nfd_u16: nfd.encode_utf16().collect(),
nfkc_u16: nfkc.encode_utf16().collect(),
nfkd_u16: nfkd.encode_utf16().collect(),
}
})
}
fn function_under_bench(normalizer: &ComposingNormalizer, text: &str) {
normalizer.normalize(text);
}
fn function_under_bench_u16(normalizer: &ComposingNormalizer, text: &[u16]) {
normalizer.normalize_utf16(text);
}
pub fn criterion_benchmark(criterion: &mut Criterion) {
let group_name = "composing_normalizer_nfkc";
let normalizer_under_bench: ComposingNormalizer = ComposingNormalizer::new_nfkc();
let mut group = criterion.benchmark_group(group_name);
for bench_data_content in black_box(normalizer_bench_data()) {
group.bench_function(
BenchmarkId::from_parameter(format!("from_nfc_{}", bench_data_content.file_name)),
|bencher| {
bencher
.iter(|| function_under_bench(&normalizer_under_bench, &bench_data_content.nfc))
},
);
group.bench_function(
BenchmarkId::from_parameter(format!("from_nfd_{}", bench_data_content.file_name)),
|bencher| {
bencher
.iter(|| function_under_bench(&normalizer_under_bench, &bench_data_content.nfd))
},
);
group.bench_function(
BenchmarkId::from_parameter(format!("from_nfkc_{}", bench_data_content.file_name)),
|bencher| {
bencher.iter(|| {
function_under_bench(&normalizer_under_bench, &bench_data_content.nfkc)
})
},
);
group.bench_function(
BenchmarkId::from_parameter(format!("from_nfkd_{}", bench_data_content.file_name)),
|bencher| {
bencher.iter(|| {
function_under_bench(&normalizer_under_bench, &bench_data_content.nfkd)
})
},
);
// UTF 16
group.bench_function(
BenchmarkId::from_parameter(format!("from_nfc_{}_u16", bench_data_content.file_name)),
|bencher| {
bencher.iter(|| {
function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfc_u16)
})
},
);
group.bench_function(
BenchmarkId::from_parameter(format!("from_nfd_{}_u16", bench_data_content.file_name)),
|bencher| {
bencher.iter(|| {
function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfd_u16)
})
},
);
group.bench_function(
BenchmarkId::from_parameter(format!("from_nfkc_{}_u16", bench_data_content.file_name)),
|bencher| {
bencher.iter(|| {
function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfkc_u16)
})
},
);
group.bench_function(
BenchmarkId::from_parameter(format!("from_nfkd_{}_u16", bench_data_content.file_name)),
|bencher| {
bencher.iter(|| {
function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfkd_u16)
})
},
);
}
group.finish();
}

third_party/rust/icu_normalizer/benches/data/README.md (vendored, new file)

@@ -0,0 +1,25 @@
# Generating microbench data
The full versions of these files are located
[in another part of the repository](https://github.com/unicode-org/icu/tree/main/icu4j/perf-tests/data).
## Sanitizing the file
```shell
sed -i '/^#/d' ${filename}
sed -i '/^$/d' ${filename}
```
## Shuffling the file
```shell
shuf -n 20 ${filename} -o ${filename}
```
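Both steps can also be combined into a single hypothetical one-liner (assuming GNU sed and shuf; `$f` is a placeholder for the file being processed):
```shell
# Sketch: sanitize (drop comment and blank lines) and shuffle in one pass.
f=TestNames_Latin.txt   # placeholder filename
sed -i '/^#/d;/^$/d' "$f" && shuf -n 20 "$f" -o "$f"
```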
## Add back the header (if you plan on submitting the files)
```
# This file is part of ICU4X. For terms of use, please see the file
# called LICENSE at the top level of the ICU4X source tree
# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
```

third_party/rust/icu_normalizer/benches/data/TestNames_Japanese_h.txt (vendored, new file)

@@ -0,0 +1,54 @@
# This file is part of ICU4X. For terms of use, please see the file
# called LICENSE at the top level of the ICU4X source tree
# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
かげやま,みのる
むらかみ,とおる
つじさわ,けい
やすい,たかゆき
むらさき,としお
はせがわ,ひであき
うるしばら,よしひこ
ままだ,ひろし
おおぼら,えいじろう
おおば,まさひで
きたばたけ,たかひこ
はまさき,あつし
ほりい,つねお
もり,だいいち
いとう,しんいち
くにもと,じゅんじ
おか,のりひと
たに,よしあき
しらがき,ひろあき
しらはま,たけひろ
むらかみ,やすひろ
うめはら,たかし
いわた,ひろし
すぎえ,かつとし
てらにし,ひろみつ
まつおか,だいすけ
もろほし,すすむ
いしはら,たかし
おしま,ひろお
なかお,ゆうじ
いかり,はるお
きまち,まさき
ふるかわ,みちお
かねこ,しゅうへい
なかがわ,ともみ
ささき,しんご
うちだ,たくじ
うめだ,さかえ
しばた,いくこ
まきした,けいこ
まつもと,しんいちろう
たかの,かずよし
いしわた,なおひさ
いうち,まこと
いまい,りほ
みずた,のりあき
かくたに,まなぶ
わだ,ほまれ
わかまつ,かずき
かわぐち,ひろき

third_party/rust/icu_normalizer/benches/data/TestNames_Japanese_k.txt (vendored, new file)

@@ -0,0 +1,54 @@
# This file is part of ICU4X. For terms of use, please see the file
# called LICENSE at the top level of the ICU4X source tree
# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
ホリモト,ユウジ
ハナミ,ヤスヒデ
イシザカ,タカユキ
ゼンケ,トシオ
ハトリ,ユウコ
ナガオカ,トモユキ
コウダ,ケンイチ
イシダ,ヒロシ
ミワ,シゲユキ
イシカワ,ヒロシ
スズキ,ユウスケ
オクダ,ヨシノリ
シムラ,サカエ
エビシマ,ヤスユキ
イブカ,ヨシテル
タノ,マコト
ドウゾノ,セイヤ
ヤマナカ,サツミ
トミイエ,ハヤト
アザミ,ツトム
タナカ,キョウコ
コジマ,アツシ
フミハラ,カオリ
スズキ,マサユキ
ナトリ,ケンヤ
スズキ,ユウコ
スズキ,ヒサエ
ナカガワ,カツヨシ
スズキ,マサフミ
マツヤマ,トシオ
ヨシナガ,チカエ
キタムラ,リカコ
アオキ,タクオ
ヤマグチ,ヤスヒロ
スギムラ,シゲオ
ウエスギ,マサミ
マツムラ,シンイチ
クバ,タカシ
スドウ,タカトシ
フジモト,ヒロシ
イトウ,シュウイチ
コバヤシ,カズミ
タナカ,ヒロカツ
イシダ,ツカサ
ヤマダ,マサコ
カミヤ,トミエ
タケモト,ユウジ
スミノ,コウジ
ヒロハタ,タクヤ
ミヒラ,リョウヘイ

third_party/rust/icu_normalizer/benches/data/TestNames_Korean.txt (vendored, new file)

@@ -0,0 +1,54 @@
# This file is part of ICU4X. For terms of use, please see the file
# called LICENSE at the top level of the ICU4X source tree
# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
김명희
홍차수
허순재
강영휘
김운주
이종환
이은국
강태호
강일래
김동현
곽기자
차재수
표봉기
문대원
이형기
최교표
박식현
홍종립
서창수
김쌍건
서말도
이병훈
김희수
박학태
강태종
조문란
신범균
백두진
이철정
김태중
이성현
김주조
김강행
이정길
김완일
권수자
이춘철
김판근
김곡리
이경형
이운만
손상철
유기숙
박정한
조윤래
유신호
이두수
김재률
김성홍
김혜경

third_party/rust/icu_normalizer/benches/data/TestNames_Latin.txt (vendored, new file)

@@ -0,0 +1,54 @@
# This file is part of ICU4X. For terms of use, please see the file
# called LICENSE at the top level of the ICU4X source tree
# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
González, Joan
Reinders, Jim
Applebroog, Ida
Kidd, Joseph Bartholomew
Gulácsy, Lajos
Letendre, Rita
Zuccaro, Federico
Apt the Elder, Ulrich
Drummond, Arthur
Manley, Thomas
Broc, Jean
Ramunno, Tony
Simone dei Crocifissi
Lane, Theodore
Symonds, William Robert
Johnson, Frank Tenney
Cox, Gardner
Bunbury, Charles
Pedro de la Cuadra
Payne, William
Lucas, John Seymour
Holsman, Elizabeth T.
de Vries, Auke
Laszlo, Philip Alexius de
Shigemasa
Wolfe, Ruth Mitchell
Buck, John
Baselitz, Georg
Hook, Walter
Segall, Lasar
Brush, George deForest
Master of Jánosrét
Sutherland, Elizabeth Leveson-Gower, Countess of
Tuckerman, Jane
Varley, F.H.
Fosso, Samuel
Gardner, Daniel
Sadler, Walter Dendy
Clausen, Franciska
Coman, Charlotte Buell
Wakelin, Roland
Payne, Jon, CML
Campagna, Girolamo
Wiener, Phyllis
Sallee, Charles
Fitzgerald, John Anster
Gribbroek, Robert
Laporte, John
Lévy-Dhurmer, Lucien
Young, Stephen Scott

third_party/rust/icu_normalizer/benches/data/TestNames_Thai.txt (vendored, new file)

@@ -0,0 +1,54 @@
# This file is part of ICU4X. For terms of use, please see the file
# called LICENSE at the top level of the ICU4X source tree
# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
ณรงค์ โต๊ะเงิน
กิตติ บุญวันต์
สมหมาย ดาบทองดี
ธวัชชัย อิสระนิมิตร
วรรณา โสภณนรินทร์
วินัย หมู่มิ่ง
พัชรี ชูจิรวงศ์
สมปอง จิวไพโรจน์กิจ
บุญส่ง กวยรักษา
นิพนธ์ นิ่มใหม่
พัชรี สุวพรศิลป์
เจริญ นววัฒนทรัพย์
อรพินท์ แซ่เจี่ย
ชัยพร สมใจนึก
ประนอม โคศิลา
ฉวีวรรณ ศรสังข์ทอง
วัชรา เจริญรัตนพร
สุภัท นกศิริ
อู๋ มาลาเล็ก
ประยูร ไชโย
ละออ อยู่ยืนยง
สมใจ วิวัฒน์วานิช
จุมพล จันทรศรีเกษร
พุฒ ดอกไม้จีน
บุญชัย วรกิจพรสิน
สมาน ธูปเทียน
พงศ์ศักดิ์ แซ่แต้
อำนาจ ไวจงเจริญ
พรทิพย์ แซ่ลี้
อุไรวรรณ สาครสินธุ์
อำพล วีระตะนนท์
สมจิตร ใจวังโลก
สุเทพ ตันวินิจ
สวาท ทรัพย์มาก
สมศักดิ์ เจือจันทร์
ดัสซันซิงห์ กุลาตี
ธีร ศรแก้ว
พรรณยุพา ฮ่อสกุล
สำราญ จันทร์เอี่ยม
พจน์ มั่นกันนาน
สุธี บุณยเกียรติ
บุญโชติ ทิพย์ประเสริฐสิน
ประดิษฐ์ ทองพสิฐสมบัติ
จำเนียร เพ็งเจริญ
สมศักดิ์ อรุณรัตน์
อนุชา จารุหิรัญสกุล
พิกุล มโนภิญโญภิญญะ
ผ่องศรี นกแก้ว
อารี วิไลวรรณ
ณรงค์วิทย์ วิทสัทธาวรกุล

third_party/rust/icu_normalizer/benches/data/TestRandomWordsUDHR_ar.txt (vendored, new file)

@@ -0,0 +1,54 @@
# This file is part of ICU4X. For terms of use, please see the file
# called LICENSE at the top level of the ICU4X source tree
# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
ممارسة مراعاة
العنصرية
حدود والشيخوخة
بالحكم كهذا ينتفع
البلاد
تربية
الغير التقدم والعدل
نحو بالتعليم والحرية
تأمين متساو
للتعليم فيها
آذت اعتداء للتعليم
ليس المتأصلة
والمساهمة الضروري تتناقض
وتأسيس
رضى
شرعي الطبية
لكيلا الجمعية والحرية
للرجال التزوج
بالكرامة
حرية بين
هذه العيش تنظر
قيد
يقررها والصداقة
اعتُمد وينبغي اجتماعي
حرمان
للإدراك بأجر إنتاجه
التربية القانون
لإنصافه وتأسيس وسمعته
أساسه للرجال
كافة
المجهود دولي أينما
وإلى
بنشاط تجري
والأمم مثل لحقوق
الإنسان بشروط بحماية
شرفه
كما الوظائف
حياته ديسمبر
ولما
هذه
غاية جديد إنسان
حرية
متهم الوطنية قدمًا
التملك وضع
شرعية ويعبر تأدية
بنظام عمل والأخلاق
التملك لشخصيته يلجأ
بحال يضطر ولا
الانضمام بالكرامة
عضوا

third_party/rust/icu_normalizer/benches/data/TestRandomWordsUDHR_de.txt (vendored, new file)

@@ -0,0 +1,54 @@
# This file is part of ICU4X. For terms of use, please see the file
# called LICENSE at the top level of the ICU4X source tree
# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
Herrschaft Freiheiten Not
Gewalt
stets anderer begründet
erhobenen innerstaatliche
Heiratsfähige freie
offenstehen Begrenzung grausamer
Maßnahmen höchste
unentbehrlich privat
erniedrigender
Verachtung freie
innezuhaben innerstaatlichen
kommen
werden gleichgültig
Würde überall höchste
Schutzmaßnahmen den Pflichten
Wille Bestimmung
Leibeigenschaft einschließlich für
gleiche bekräftigt Gewissens
Wohles
Generalversammlung
Volkes
Völkern gegenwärtig Zusammenarbeit
Heiratsfähige sowie Jeder
Stellung
Lebensstandard
seinem
Rede strafbaren Sicherheit
mit
Kulthandlungen Grund
ärztlicher
Auflösung Anforderungen anzugehören
Furcht
keine Geburt
Wohles Furcht genügen
befriedigende Medien
anzugehören Urlaub Vereinigungen
hinzuwirken verboten Resolution
kommen
sozialer vor irgendein
Bestimmung Bestimmung
Fall natürliche kein
Geschlecht Aufhetzung eigenen
seinen
über
Unterlassung Berücksichtigung
war
Rufes stets
Volkes anderer Beschränkungen
Handlungen dessen
Die

third_party/rust/icu_normalizer/benches/data/TestRandomWordsUDHR_el.txt (vendored, new file)

@@ -0,0 +1,54 @@
# This file is part of ICU4X. For terms of use, please see the file
# called LICENSE at the top level of the ICU4X source tree
# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
προάγει αλληλογραφία
λογική έχει
ιδρύει ζωή τεχνική
δυνατότητες
περιορισμό συνόλου
ασκεί παραγνώριση συναφθεί
αναγνωρίζουν ποινικής εκδηλώνει
κοινότητας διακυβέρνηση στα
απέναντι υψηλή
περιστάσεων αξιόποινη
σεβασμό
συντήρησής κατά εξασφαλίσουν
παραβιάζουν συμπληρώνεται νόμο
άμεσα
σημαίνει καθεστώς
ΑΝΘΡΩΠΙΝΑ θέλησης ανθρωπίνων
ΔΙΑΚΗΡΥΞΗ αθλιότητα ασφάλιση
μέσο
ίση Εχει
ειρήνης Κάθε
μέλη μορφή
όσο
κρατείται Στο Διακηρύσσει
οικονομικών έκφρασης εξασφαλίζεται
κάθε
περίπτωση απολαμβάνουν
ποινικό γεροντική
είναι μαζί δικαστήρια
μαζί προοπτική
δική
βαρβαρότητας
οικονομικών εξασφαλίσει
υποχρεώσεις οδήγησαν
Οικουμενική Διακήρυξης γονείς
στις μυστική αντιπροσώπους
Διακήρυξης άδειες βιοτικό
αναπηρία ομάδα
πραγματικό
καλύτερες
ανάπαυση
δίκαιες ένα δικαίου
μετέχει στους
θρησκευτικών ποινικής
Κανείς ίσα
πεποιθήσεις
πολιτικές ανάλογα δουλεία
πολιτικές ιατρική ωσότου
ηθικής χωρίς
ανδρών ικανό
καθώς

third_party/rust/icu_normalizer/benches/data/TestRandomWordsUDHR_es.txt (vendored, new file)

@@ -0,0 +1,54 @@
# This file is part of ICU4X. For terms of use, please see the file
# called LICENSE at the top level of the ICU4X source tree
# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
duración común
delito reconocimiento alimentación
inalienables
entre seguridad escogidos
comportarse dignidad
autónomo gobierno tiempo
omisiones
comisión
Derechos territorios
debe
han
regresar inalienables
regresar
desempleo científico
arbitrariamente proclamada
están contraerse esposos
cualesquiera
salir carácter desarrollo
solamente justas
personalidad una
cuanto
garantice resolución
concepción
tomar impondrá
cualquier reconocimiento
obligatoria obligatoria satisfactoria
acusación sin
artísticas penal culturales
pagadas examen
Además Organización dignidad
opresión esposos ejercidos
barbarie están mientras
por
idioma
recursos pagadas
materia Nada ella
con injerencias
inspirándose
organización
gozar jurisdicción
que
asegurar
humana libertad
nadie equivalente
escoger remuneración
torturas
individuos poder
disfruten seres Preámbulo
desempleo
liberados

third_party/rust/icu_normalizer/benches/data/TestRandomWordsUDHR_fr.txt (vendored, new file)

@@ -0,0 +1,54 @@
# This file is part of ICU4X. For terms of use, please see the file
# called LICENSE at the top level of the ICU4X source tree
# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
conforme êtres fonctions
non tout généralisé
premier lui
faire hommes d’égalité
peuple volonté bénéficier
générale nationales
cruels plus
d’encourager opinions
genre l’esprit
d’origine effectif
exigences auront
résultent situation recevoir
peuples Chacun
sont d’égalité
jouissent
auront l’esprit
pays telle
publiquement
mariage foi
travail démocratique religieux
rémunération
omissions telles
L’éducation
raison complétée donner
invoqué auront arbitraires
l’amitié suffisant affaires
travaille l’accomplissement l’intermédiaire
race
opinions celles
assurer par privée
valeur
violant traite premier
inhérente
bienfaits l’avènement
Unies s’il actions
inquiété l’esclavage
inquiété
esclaves lieu
salaire
par
toute
innocente procédure membres
arts l’idéal envers
suffrage territoires inhumains
d’immixtions l’organisation progrès
comme égalité Unies
maternité
violerait suprême sécurité
impliquant eux loisirs
nationalité

third_party/rust/icu_normalizer/benches/data/TestRandomWordsUDHR_he.txt (vendored, new file)

@@ -0,0 +1,54 @@
# This file is part of ICU4X. For terms of use, please see the file
# called LICENSE at the top level of the ICU4X source tree
# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
זקנה משפילים
ינתן חברתי עניניו
הפוב
ולהיות זכויות הישגים
יאסרו מטעמי וללא
ספרותית השלם
למנוחה חינם
וההתאגדות
לטפח
באלה במלואן
יהנו
ולרווחתם לגבר האדם
בכבודו שבארצות כבוד
ובינלאומיים
בכך לתנאי אישי
שאינן
שרירותי
במשפט
ולעקרונותיהן מטעם
שרירותית האשמה יהיה
החינוך ולבטחון
סובלנות אשמתו במגילה
המאוחדות חיוני
חשוב במקרה
כלתי העולם
שמקורה כציבור
לשויון
לתקנה
תלוי ההתאספות
הדיבור שהוא
והבלתי והבסיסית
ולעקרונותיהן יהא וישאף
ביתנ הבינלאומי
והזלזול להקנות
בגלל כולם שיושלם
לחיים
בדבר
לשירות
זכויות
לפני
אדם ולא מזזמנות
קנינו שהיה ההתאספות
בינלאומי חיוניות לבקש
תהיינה
ובזכות בכורה מהגנה
מתוך
ובמצפון מזומנות לאגד
והחמריים סוציאלי
אנושיים ובהצבעה
פראיים

third_party/rust/icu_normalizer/benches/data/TestRandomWordsUDHR_pl.txt (vendored, new file)

@@ -0,0 +1,54 @@
# This file is part of ICU4X. For terms of use, please see the file
# called LICENSE at the top level of the ICU4X source tree
# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
uciskowi posiadania prawo
społecznego największych skazany
czy
potrzeby samodzielnie przystępowania
Krzewi też dokonania
pełną prawo
buntu
moralności
zapewnienia znaczenie
nieludzki wypadek Nikt
zasadności jakikolwiek Każdy
samowolnie krajem
międzynarodowego
członek wielu
rozwój wynikających obalenia
rasy
grudnia która
jedynie urlopu ani
małżeńskie stanowi ustaniu
człowieka postępowych
prześladowania
politycznej które zawarcia
Deklaracja
ingerować wyłącznie
studia Nikt
innego uprawianie zrozumienie
wybranych swobodę wyznania
wolni osobowości
ograniczenie Nie
równej społecznego uciekać
będącą POWSZECHNA
niezdolności poszukiwania międzynarodowej
konieczne potrzeby posiada
opinii wychowywania 1948
międzynarodowej zatrzymać
przedstawicieli
przeciw
wynikających organy pracę
człowiek grupami
niezbędnych
wolności podstawowym
opinii małżonków wolność
postępować zdecydowanie komórką
odniesieniu
pokoju azyl
zawodowych powrócić człowiek
konstytucję
takiej postaciach powszechnego
wygnać wygnać
wspólny poszanowania

third_party/rust/icu_normalizer/benches/data/TestRandomWordsUDHR_ru.txt (vendored, new file)

@@ -0,0 +1,54 @@
# This file is part of ICU4X. For terms of use, please see the file
# called LICENSE at the top level of the ICU4X source tree
# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
областях
будут должен
обеспечиваются нежели
котором Уставе
социального моральных
совершеннолетия предоставление
том независимо
существование
вмешательства какому ограниченной
распространять
находить помощь
искусством
унижающим положения искать
изгнанию член совершеннолетия
обществом имуществом государственной
идеи братства
наслаждаться значение социальной
осуществления юрисдикцией наказанию
достойное свою III
жизнь расторжения инвалидности
терпимости этого
целях равны
обеспечиваются законным
принуждаем правосубъектности
пыткам доступа неприкосновенность
Брак против
прибегать независимой
человека человеческой
быть независимо религии
публичным
членам против
разумом результатом семью
Принята участие
беспристрастным тем
частным основной
правового
страной обслуживание
было свободу полное
рабочего свободны
состоянии помощь религиозными
полное
владеть власти морали
меньшей
братства социальному убежища
государств
равны который дети
терпимости
получать бесплатным полного
богослужении
отдельным

third_party/rust/icu_normalizer/benches/data/TestRandomWordsUDHR_th.txt (vendored, new file)

@@ -0,0 +1,54 @@
# This file is part of ICU4X. For terms of use, please see the file
# called LICENSE at the top level of the ICU4X source tree
# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
คิด ใตัอำ เคลื่อนไหว
บังคับ บาก
สิ่ง สิ้น
วัตถุ
ชาย อาศัย เท่านั้น
สิน
เกา
ดูแล พิธีกรรม
ภายใน
เพศ
หนัก ประสงค์
เหตุ
งาน รักษา
เพศ ภาษา
นี้
คู่ สัญชาติ ต้องการ
วิธี ระหว่าง ตกลง
ทำนอง
สืบ กับ ศิลปกรรม
เหนือ วรรณกรรม
คิด การก หน้าที่
ชาติ ศิลปกรรม แต่
สามัญ สอด
เหยียด วิธี จุด
หน้า ถ้า เบื้อง
ประชุม
ศิลปกรรม
เสรีภาพ โหด ก่อ
เกียรติศักดิ์ ป่วย เอกราช
ประหัต มโนธรรม การ
แทน
ขัดขืน เวลา เสียง
กฎบัตร พยายาม
สิน หน้า
จำเป็น
ประชาธิปไตย หน่วย
กรณี จริงจัง
ทำนอง
ทาษ
เพิ่ม
บรรดา ขวาง
กักขัง
มนุษย์
ชาย ประกัน มนุษยธรรม
จะบัน มูลฐาน เถื่อน
พฤติ
มิได้
หญิง คู่
สมา ปฏิบัติ อนึ่ง
สิ่ง ทาษ


@@ -0,0 +1,54 @@
# This file is part of ICU4X. For terms of use, please see the file
# called LICENSE at the top level of the ICU4X source tree
# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
mecburidir ilim
isnadın sınırları suç
tutuklanamaz diğer
memleket korunmasi kullanılamaz
İnsanlık ilerlemeyi
bir mülk menfaatlerinin
usul zümreye herhangi
mahkeme vicdana ilerleyişe
zulüm zalimane
ilim öncelikle çocuk
mevzubahis ancak
muamelesi dinlenmeye
eşitlikle prensiplerine ülkenin
öğretim bulunmalarına yardım
memleketler amacıyla
birbirlerine
olmalıdır
bırakılamaz serbestisine
hürriyetin iyi
hükmü işbu zalimane
evlenme memleketi tedbirlerle
evlenmek ahalisi işini
hürriyetler
belirlenmiş kere
elde cürüme
tanınan dünyaca yüksek
müddetinin ailesine
vicdan kırıcı itibariyle
geniş inanma
kendi görevleri Teşkilatı
yaymak
öğretim vesayet
renk kişiliğinin
tamamlanan
haklara bulunma
hükmü uygulanabilecek
etmiş geliştirilmesini hoşgörü
sahiptir temel
giyim
Bundan temeli
icaplarını
mülk karışma tekmil
vicdana hürriyetine işini
Herkesin vahşiliklere
dolaşma dünyanın
davasının Uluslararasında idamesi
eşittir
haklardan hakkı
kovuşturmalar hürriyetlerden gözönünde
Evrensel fiilli beyannamesi

58
third_party/rust/icu_normalizer/benches/data/wotw.txt vendored Normal file

@@ -0,0 +1,58 @@
# This file is part of ICU4X. For terms of use, please see the file
# called LICENSE at the top level of the ICU4X source tree
# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
# The contents of this file have been translated by "Google Translate".
Vào những năm cuối của thế kỷ 19, không ai có thể tin rằng thế giới này
đang được theo dõi một cách sâu sắc và chặt chẽ bởi những trí thông minh
lớn hơn con người nhưng cũng nguy hiểm như chính con người; rằng khi con
người bận rộn với những mối quan tâm khác nhau của họ, họ bị xem xét và
nghiên cứu kỹ lưỡng, có lẽ gần như một người đàn ông với kính hiển vi có thể
xem xét kỹ lưỡng những sinh vật nhất thời tụ tập và sinh sôi nảy nở trong
một giọt nước. Với sự tự mãn vô hạn, con người đi đi lại lại khắp thế giới
này chỉ vì những công việc nhỏ nhặt của họ, thanh thản với niềm tin chắc
chắn về đế chế của họ đối với vật chất. Có thể là infusoria dưới kính hiển
vi cũng làm như vậy. Không ai coi các thế giới cũ hơn trong không gian là
nguồn gây nguy hiểm cho con người, hoặc nghĩ về chúng chỉ để bác bỏ ý
tưởng về sự sống đối với chúng là không thể hoặc không thể xảy ra.
Thật tò mò khi nhớ lại một số thói quen tinh thần của những ngày đã
qua. Hầu hết những người trên trái đất đều tưởng tượng rằng có thể có
những người khác trên sao Hỏa, có lẽ thấp kém hơn họ và sẵn sàng chào
đón một doanh nghiệp truyền giáo. Tuy nhiên, bên kia vịnh không gian,
những bộ óc đối với tâm trí của chúng ta cũng như tâm trí của chúng ta đối
với những con thú bị diệt vong, những bộ óc rộng lớn, lạnh lùng và vô cảm,
nhìn trái đất này với con mắt ghen tị, và dần dần và chắc chắn vạch ra
những kế hoạch chống lại chúng ta. Và đầu thế kỷ 20 đã xảy ra sự vỡ mộng
lớn. Hành tinh sao Hỏa, tôi không cần nhắc độc giả, quay xung quanh mặt
trời ở khoảng cách trung bình 140.000.000 dặm, và ánh sáng và nhiệt mà
nó nhận được từ mặt trời chỉ bằng một nửa so với thế giới này nhận được.
Nếu giả thuyết về tinh vân có bất kỳ sự thật nào, nó phải tồn tại lâu
đời hơn thế giới của chúng ta; và rất lâu trước khi trái đất này ngừng
nóng chảy, sự sống trên bề mặt của nó hẳn đã bắt đầu quá trình của nó.
Thực tế là nó chỉ chiếm một phần bảy thể tích của trái đất đã làm tăng
tốc độ nguội đi của nó đến nhiệt độ mà sự sống có thể bắt đầu. Nó có
không khí và nước và tất cả những gì cần thiết để hỗ trợ sự tồn tại
sinh động. Tuy nhiên, con người quá hão huyền và bị mù quáng bởi sự phù
phiếm của mình, đến nỗi cho đến tận cuối thế kỷ 19, không có nhà văn nào
bày tỏ bất kỳ ý tưởng nào rằng sự sống thông minh có thể đã phát triển ở đó xa,
hoặc thực sự là ở tất cả, vượt ra ngoài mức độ trần gian của nó. Người ta
cũng không hiểu một cách tổng quát rằng vì sao Hỏa già hơn trái đất của chúng
ta, chỉ bằng một phần tư diện tích bề mặt và ở xa mặt trời hơn, nên điều tất
yếu dẫn đến là nó không chỉ xa hơn so với thời điểm bắt đầu mà còn gần ngày kết
thúc hơn. Sự nguội lạnh thế tục mà một ngày nào đó phải vượt qua hành tinh của chúng
ta đã thực sự đi xa với người hàng xóm của chúng ta. Tình trạng vật lý của nó phần lớn
vẫn còn là một bí ẩn, nhưng giờ đây chúng ta biết rằng ngay cả ở vùng xích đạo của nó,
nhiệt độ giữa trưa hầu như không bằng nhiệt độ của mùa đông lạnh nhất của chúng ta.
Không khí của nó loãng hơn nhiều so với không khí của chúng ta, các đại dương của nó đã
thu hẹp lại cho đến khi chỉ bao phủ một phần ba bề mặt của nó, và khi các mùa chậm chạp
của nó thay đổi, các chỏm tuyết khổng lồ tụ lại và tan chảy ở hai cực và định kỳ làm ngập các vùng ôn đới của nó.
Giai đoạn cuối cùng của sự kiệt sức, mà đối với chúng ta vẫn còn quá xa vời, đã trở thành
một vấn đề ngày nay đối với các cư dân trên sao Hỏa. Áp lực trước mắt của sự cần
thiết đã làm sáng tỏ trí tuệ của họ, mở rộng sức mạnh của họ và làm chai đá trái
tim họ. Và nhìn xuyên qua không gian với các công cụ, và trí thông minh như chúng
ta hiếm khi mơ tới, họ thấy, ở khoảng cách gần nhất chỉ cách họ 35.000.000 dặm
về phía mặt trời, một ngôi sao buổi sáng của hy vọng, hành tinh ấm áp hơn của chúng
ta, màu xanh lục của thảm thực vật và màu xám của nước , với bầu không khí nhiều
mây hùng hồn của sự màu mỡ, với những cái nhìn thoáng qua qua những đám mây
trôi dạt của nó là những dải đất rộng lớn đông dân và những vùng biển chật hẹp đông đúc hải quân.


@@ -0,0 +1,213 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use criterion::{black_box, BenchmarkId, Criterion};
use icu_normalizer::{ComposingNormalizer, DecomposingNormalizer};
struct BenchDataContent {
pub file_name: String,
pub nfc: String,
pub nfd: String,
pub nfkc: String,
pub nfkd: String,
pub nfc_u16: Vec<u16>,
pub nfd_u16: Vec<u16>,
pub nfkc_u16: Vec<u16>,
pub nfkd_u16: Vec<u16>,
}
fn strip_headers(content: &str) -> String {
content
.lines()
.filter(|&s| !s.starts_with('#'))
.map(|s| s.to_owned())
.collect::<Vec<String>>()
.join("\n")
}
fn normalizer_bench_data() -> [BenchDataContent; 15] {
let nfc_normalizer: ComposingNormalizer = ComposingNormalizer::new_nfc();
let nfd_normalizer: DecomposingNormalizer = DecomposingNormalizer::new_nfd();
let nfkc_normalizer: ComposingNormalizer = ComposingNormalizer::new_nfkc();
let nfkd_normalizer: DecomposingNormalizer = DecomposingNormalizer::new_nfkd();
let content_latin: (&str, &str) = (
"TestNames_Latin",
&strip_headers(include_str!("./data/TestNames_Latin.txt")),
);
let content_jp_h: (&str, &str) = (
"TestNames_Japanese_h",
&strip_headers(include_str!("./data/TestNames_Japanese_h.txt")),
);
let content_jp_k: (&str, &str) = (
"TestNames_Japanese_k",
&strip_headers(include_str!("./data/TestNames_Japanese_k.txt")),
);
let content_korean: (&str, &str) = (
"TestNames_Korean",
&strip_headers(include_str!("./data/TestNames_Korean.txt")),
);
let content_random_words_ar: (&str, &str) = (
"TestRandomWordsUDHR_ar",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_ar.txt")),
);
let content_random_words_de: (&str, &str) = (
"TestRandomWordsUDHR_de",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_de.txt")),
);
let content_random_words_el: (&str, &str) = (
"TestRandomWordsUDHR_el",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_el.txt")),
);
let content_random_words_es: (&str, &str) = (
"TestRandomWordsUDHR_es",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_es.txt")),
);
let content_random_words_fr: (&str, &str) = (
"TestRandomWordsUDHR_fr",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_fr.txt")),
);
let content_random_words_he: (&str, &str) = (
"TestRandomWordsUDHR_he",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_he.txt")),
);
let content_random_words_pl: (&str, &str) = (
"TestRandomWordsUDHR_pl",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_pl.txt")),
);
let content_random_words_ru: (&str, &str) = (
"TestRandomWordsUDHR_ru",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_ru.txt")),
);
let content_random_words_th: (&str, &str) = (
"TestRandomWordsUDHR_th",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_th.txt")),
);
let content_random_words_tr: (&str, &str) = (
"TestRandomWordsUDHR_tr",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_tr.txt")),
);
let content_viet: (&str, &str) = ("wotw", &strip_headers(include_str!("./data/wotw.txt")));
[
content_latin,
content_viet,
content_jp_k,
content_jp_h,
content_korean,
content_random_words_ru,
content_random_words_ar,
content_random_words_el,
content_random_words_es,
content_random_words_fr,
content_random_words_tr,
content_random_words_th,
content_random_words_pl,
content_random_words_he,
content_random_words_de,
]
.map(|(file_name, raw_content)| {
let nfc = &nfc_normalizer.normalize(raw_content);
let nfd = &nfd_normalizer.normalize(raw_content);
let nfkc = &nfkc_normalizer.normalize(raw_content);
let nfkd = &nfkd_normalizer.normalize(raw_content);
BenchDataContent {
file_name: file_name.to_owned(),
nfc: nfc.to_owned(),
nfd: nfd.to_owned(),
nfkc: nfkc.to_owned(),
nfkd: nfkd.to_owned(),
nfc_u16: nfc.encode_utf16().collect(),
nfd_u16: nfd.encode_utf16().collect(),
nfkc_u16: nfkc.encode_utf16().collect(),
nfkd_u16: nfkd.encode_utf16().collect(),
}
})
}
fn function_under_bench(normalizer: &DecomposingNormalizer, text: &str) {
normalizer.normalize(text);
}
fn function_under_bench_u16(normalizer: &DecomposingNormalizer, text: &[u16]) {
normalizer.normalize_utf16(text);
}
pub fn criterion_benchmark(criterion: &mut Criterion) {
let group_name = "decomposing_normalizer_nfd";
let normalizer_under_bench: DecomposingNormalizer = DecomposingNormalizer::new_nfd();
let mut group = criterion.benchmark_group(group_name);
for bench_data_content in black_box(normalizer_bench_data()) {
group.bench_function(
BenchmarkId::from_parameter(format!("from_nfc_{}", bench_data_content.file_name)),
|bencher| {
bencher
.iter(|| function_under_bench(&normalizer_under_bench, &bench_data_content.nfc))
},
);
group.bench_function(
BenchmarkId::from_parameter(format!("from_nfd_{}", bench_data_content.file_name)),
|bencher| {
bencher
.iter(|| function_under_bench(&normalizer_under_bench, &bench_data_content.nfd))
},
);
group.bench_function(
BenchmarkId::from_parameter(format!("from_nfkc_{}", bench_data_content.file_name)),
|bencher| {
bencher.iter(|| {
function_under_bench(&normalizer_under_bench, &bench_data_content.nfkc)
})
},
);
group.bench_function(
BenchmarkId::from_parameter(format!("from_nfkd_{}", bench_data_content.file_name)),
|bencher| {
bencher.iter(|| {
function_under_bench(&normalizer_under_bench, &bench_data_content.nfkd)
})
},
);
// UTF-16
group.bench_function(
BenchmarkId::from_parameter(format!("from_nfc_{}_u16", bench_data_content.file_name)),
|bencher| {
bencher.iter(|| {
function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfc_u16)
})
},
);
group.bench_function(
BenchmarkId::from_parameter(format!("from_nfd_{}_u16", bench_data_content.file_name)),
|bencher| {
bencher.iter(|| {
function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfd_u16)
})
},
);
group.bench_function(
BenchmarkId::from_parameter(format!("from_nfkc_{}_u16", bench_data_content.file_name)),
|bencher| {
bencher.iter(|| {
function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfkc_u16)
})
},
);
group.bench_function(
BenchmarkId::from_parameter(format!("from_nfkd_{}_u16", bench_data_content.file_name)),
|bencher| {
bencher.iter(|| {
function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfkd_u16)
})
},
);
}
group.finish();
}
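The listing above defines `criterion_benchmark` for the `decomposing_normalizer_nfd` group, but the extracted diff stops before the harness registration. Hooking the function into Criterion would conventionally look like the sketch below (an editor's illustration, not text from the diff); the same wiring applies to the NFKD twin of this file that follows.

use criterion::{criterion_group, criterion_main};

criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);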


@@ -0,0 +1,211 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
use criterion::{black_box, BenchmarkId, Criterion};
use icu_normalizer::{ComposingNormalizer, DecomposingNormalizer};
struct BenchDataContent {
pub file_name: String,
pub nfc: String,
pub nfd: String,
pub nfkc: String,
pub nfkd: String,
pub nfc_u16: Vec<u16>,
pub nfd_u16: Vec<u16>,
pub nfkc_u16: Vec<u16>,
pub nfkd_u16: Vec<u16>,
}
fn strip_headers(content: &str) -> String {
content
.lines()
.filter(|&s| !s.starts_with('#'))
.map(|s| s.to_owned())
.collect::<Vec<String>>()
.join("\n")
}
fn normalizer_bench_data() -> [BenchDataContent; 15] {
let nfc_normalizer: ComposingNormalizer = ComposingNormalizer::new_nfc();
let nfd_normalizer: DecomposingNormalizer = DecomposingNormalizer::new_nfd();
let nfkc_normalizer: ComposingNormalizer = ComposingNormalizer::new_nfkc();
let nfkd_normalizer: DecomposingNormalizer = DecomposingNormalizer::new_nfkd();
let content_latin: (&str, &str) = (
"TestNames_Latin",
&strip_headers(include_str!("./data/TestNames_Latin.txt")),
);
let content_jp_h: (&str, &str) = (
"TestNames_Japanese_h",
&strip_headers(include_str!("./data/TestNames_Japanese_h.txt")),
);
let content_jp_k: (&str, &str) = (
"TestNames_Japanese_k",
&strip_headers(include_str!("./data/TestNames_Japanese_k.txt")),
);
let content_korean: (&str, &str) = (
"TestNames_Korean",
&strip_headers(include_str!("./data/TestNames_Korean.txt")),
);
let content_random_words_ar: (&str, &str) = (
"TestRandomWordsUDHR_ar",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_ar.txt")),
);
let content_random_words_de: (&str, &str) = (
"TestRandomWordsUDHR_de",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_de.txt")),
);
let content_random_words_el: (&str, &str) = (
"TestRandomWordsUDHR_el",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_el.txt")),
);
let content_random_words_es: (&str, &str) = (
"TestRandomWordsUDHR_es",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_es.txt")),
);
let content_random_words_fr: (&str, &str) = (
"TestRandomWordsUDHR_fr",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_fr.txt")),
);
let content_random_words_he: (&str, &str) = (
"TestRandomWordsUDHR_he",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_he.txt")),
);
let content_random_words_pl: (&str, &str) = (
"TestRandomWordsUDHR_pl",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_pl.txt")),
);
let content_random_words_ru: (&str, &str) = (
"TestRandomWordsUDHR_ru",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_ru.txt")),
);
let content_random_words_th: (&str, &str) = (
"TestRandomWordsUDHR_th",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_th.txt")),
);
let content_random_words_tr: (&str, &str) = (
"TestRandomWordsUDHR_tr",
&strip_headers(include_str!("./data/TestRandomWordsUDHR_tr.txt")),
);
let content_viet: (&str, &str) = ("wotw", &strip_headers(include_str!("./data/wotw.txt")));
[
content_latin,
content_viet,
content_jp_k,
content_jp_h,
content_korean,
content_random_words_ru,
content_random_words_ar,
content_random_words_el,
content_random_words_es,
content_random_words_fr,
content_random_words_tr,
content_random_words_th,
content_random_words_pl,
content_random_words_he,
content_random_words_de,
]
.map(|(file_name, raw_content)| {
let nfc = &nfc_normalizer.normalize(raw_content);
let nfd = &nfd_normalizer.normalize(raw_content);
let nfkc = &nfkc_normalizer.normalize(raw_content);
let nfkd = &nfkd_normalizer.normalize(raw_content);
BenchDataContent {
file_name: file_name.to_owned(),
nfc: nfc.to_owned(),
nfd: nfd.to_owned(),
nfkc: nfkc.to_owned(),
nfkd: nfkd.to_owned(),
nfc_u16: nfc.encode_utf16().collect(),
nfd_u16: nfd.encode_utf16().collect(),
nfkc_u16: nfkc.encode_utf16().collect(),
nfkd_u16: nfkd.encode_utf16().collect(),
}
})
}
fn function_under_bench(normalizer: &DecomposingNormalizer, text: &str) {
normalizer.normalize(text);
}
fn function_under_bench_u16(normalizer: &DecomposingNormalizer, text: &[u16]) {
normalizer.normalize_utf16(text);
}
pub fn criterion_benchmark(criterion: &mut Criterion) {
let group_name = "decomposing_normalizer_nfkd";
let normalizer_under_bench: DecomposingNormalizer = DecomposingNormalizer::new_nfkd();
let mut group = criterion.benchmark_group(group_name);
for bench_data_content in black_box(normalizer_bench_data()) {
group.bench_function(
BenchmarkId::from_parameter(format!("from_nfc_{}", bench_data_content.file_name)),
|bencher| {
bencher
.iter(|| function_under_bench(&normalizer_under_bench, &bench_data_content.nfc))
},
);
group.bench_function(
BenchmarkId::from_parameter(format!("from_nfd_{}", bench_data_content.file_name)),
|bencher| {
bencher
.iter(|| function_under_bench(&normalizer_under_bench, &bench_data_content.nfd))
},
);
group.bench_function(
BenchmarkId::from_parameter(format!("from_nfkc_{}", bench_data_content.file_name)),
|bencher| {
bencher.iter(|| {
function_under_bench(&normalizer_under_bench, &bench_data_content.nfkc)
})
},
);
group.bench_function(
BenchmarkId::from_parameter(format!("from_nfkd_{}", bench_data_content.file_name)),
|bencher| {
bencher.iter(|| {
function_under_bench(&normalizer_under_bench, &bench_data_content.nfkd)
})
},
);
// UTF-16
group.bench_function(
BenchmarkId::from_parameter(format!("from_nfc_{}_u16", bench_data_content.file_name)),
|bencher| {
bencher.iter(|| {
function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfc_u16)
})
},
);
group.bench_function(
BenchmarkId::from_parameter(format!("from_nfd_{}_u16", bench_data_content.file_name)),
|bencher| {
bencher.iter(|| {
function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfd_u16)
})
},
);
group.bench_function(
BenchmarkId::from_parameter(format!("from_nfkc_{}_u16", bench_data_content.file_name)),
|bencher| {
bencher.iter(|| {
function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfkc_u16)
})
},
);
group.bench_function(
BenchmarkId::from_parameter(format!("from_nfkd_{}_u16", bench_data_content.file_name)),
|bencher| {
bencher.iter(|| {
function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfkd_u16)
})
},
);
}
group.finish();
}

42
third_party/rust/icu_normalizer/src/error.rs vendored Normal file

@@ -0,0 +1,42 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! Normalizer-specific error
use displaydoc::Display;
use icu_properties::PropertiesError;
use icu_provider::DataError;
/// A list of error outcomes for various operations in this module.
///
/// Re-exported as [`Error`](crate::Error).
#[derive(Display, Debug)]
#[non_exhaustive]
pub enum NormalizerError {
/// Error coming from the data provider
#[displaydoc("{0}")]
Data(DataError),
/// The data uses a planned but unsupported feature.
FutureExtension,
/// Data failed manual validation
ValidationError,
}
#[cfg(feature = "std")]
impl std::error::Error for NormalizerError {}
impl From<DataError> for NormalizerError {
fn from(e: DataError) -> Self {
NormalizerError::Data(e)
}
}
impl From<PropertiesError> for NormalizerError {
fn from(e: PropertiesError) -> Self {
match e {
PropertiesError::PropDataLoad(d) => NormalizerError::Data(d),
_ => unreachable!("Shouldn't have non-Data PropertiesError"),
}
}
}
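Because `NormalizerError` is `#[non_exhaustive]`, matches outside the defining crate need a wildcard arm. A minimal consumption sketch (editor's addition; it assumes the type is reachable at the crate root, per the re-export note above):

use icu_normalizer::NormalizerError;

fn describe(e: &NormalizerError) -> String {
    match e {
        NormalizerError::Data(inner) => format!("data provider failure: {inner}"),
        NormalizerError::FutureExtension => "data uses a planned but unsupported feature".to_owned(),
        NormalizerError::ValidationError => "data failed manual validation".to_owned(),
        // #[non_exhaustive]: variants added in future releases land here.
        _ => "unrecognized normalizer error".to_owned(),
    }
}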

2779
third_party/rust/icu_normalizer/src/lib.rs vendored Normal file

Diff not shown because of its large size.

520
third_party/rust/icu_normalizer/src/properties.rs vendored Normal file

@@ -0,0 +1,520 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! Access to the Unicode properties or property-based operations that
//! are required for NFC and NFD.
//!
//! Applications should generally use the full normalizers that are
//! provided at the top level of this crate. However, the APIs in this
//! module are provided for callers such as HarfBuzz that specifically
//! want access to the raw canonical composition operation e.g. for use in a
//! glyph-availability-guided custom normalizer.
use crate::char_from_u16;
use crate::error::NormalizerError;
use crate::in_inclusive_range;
use crate::provider::CanonicalCompositionsV1Marker;
use crate::provider::CanonicalDecompositionDataV1Marker;
use crate::provider::CanonicalDecompositionTablesV1Marker;
use crate::provider::NonRecursiveDecompositionSupplementV1Marker;
use crate::trie_value_has_ccc;
use crate::trie_value_indicates_special_non_starter_decomposition;
use crate::BACKWARD_COMBINING_STARTER_MARKER;
use crate::FDFA_MARKER;
use crate::HANGUL_L_BASE;
use crate::HANGUL_N_COUNT;
use crate::HANGUL_S_BASE;
use crate::HANGUL_S_COUNT;
use crate::HANGUL_T_BASE;
use crate::HANGUL_T_COUNT;
use crate::HANGUL_V_BASE;
use crate::NON_ROUND_TRIP_MARKER;
use crate::SPECIAL_NON_STARTER_DECOMPOSITION_MARKER_U16;
use icu_properties::CanonicalCombiningClass;
use icu_provider::prelude::*;
/// The raw canonical composition operation.
///
/// Callers should generally use `ComposingNormalizer` instead of this API.
/// However, this API is provided for callers such as HarfBuzz that specifically
/// want access to the raw canonical composition operation e.g. for use in a
/// glyph-availability-guided custom normalizer.
#[derive(Debug)]
pub struct CanonicalComposition {
canonical_compositions: DataPayload<CanonicalCompositionsV1Marker>,
}
#[cfg(feature = "compiled_data")]
impl Default for CanonicalComposition {
fn default() -> Self {
Self::new()
}
}
impl CanonicalComposition {
/// Performs canonical composition (including Hangul) on a pair of
/// characters or returns `None` if these characters don't compose.
/// Composition exclusions are taken into account.
///
/// # Examples
///
/// ```
/// let comp = icu::normalizer::properties::CanonicalComposition::new();
///
/// assert_eq!(comp.compose('a', 'b'), None); // Just two non-composing starters
/// assert_eq!(comp.compose('a', '\u{0308}'), Some('ä'));
/// assert_eq!(comp.compose('ẹ', '\u{0302}'), Some('ệ'));
/// assert_eq!(comp.compose('𝅗', '𝅥'), None); // Composition exclusion
/// assert_eq!(comp.compose('ে', 'া'), Some('ো')); // Second is starter
/// assert_eq!(comp.compose('ᄀ', 'ᅡ'), Some('가')); // Hangul LV
/// assert_eq!(comp.compose('가', 'ᆨ'), Some('각')); // Hangul LVT
/// ```
#[inline(always)]
pub fn compose(&self, starter: char, second: char) -> Option<char> {
crate::compose(
self.canonical_compositions
.get()
.canonical_compositions
.iter(),
starter,
second,
)
}
/// Constructs a new `CanonicalComposition` using compiled data.
///
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
///
/// [📚 Help choosing a constructor](icu_provider::constructors)
#[cfg(feature = "compiled_data")]
pub const fn new() -> Self {
Self {
canonical_compositions: DataPayload::from_static_ref(
crate::provider::Baked::SINGLETON_NORMALIZER_COMP_V1,
),
}
}
icu_provider::gen_any_buffer_data_constructors!(locale: skip, options: skip, error: NormalizerError,
#[cfg(skip)]
functions: [
new,
try_new_with_any_provider,
try_new_with_buffer_provider,
try_new_unstable,
Self,
]
);
#[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new)]
pub fn try_new_unstable<D>(provider: &D) -> Result<Self, NormalizerError>
where
D: DataProvider<CanonicalCompositionsV1Marker> + ?Sized,
{
let canonical_compositions: DataPayload<CanonicalCompositionsV1Marker> =
provider.load(Default::default())?.take_payload()?;
Ok(CanonicalComposition {
canonical_compositions,
})
}
}
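// Editor's illustration, not part of the vendored file: greedily folding
// combining marks into a starter with `compose`. Real NFC additionally
// handles canonical ordering and blocking; this sketch ignores both.
fn fold_marks(comp: &CanonicalComposition, starter: char, marks: &[char]) -> (char, Vec<char>) {
    let mut current = starter;
    let mut leftover = Vec::new();
    for &mark in marks {
        match comp.compose(current, mark) {
            Some(composed) => current = composed, // the pair composed; keep folding
            None => leftover.push(mark),          // does not compose; pass through
        }
    }
    // e.g. fold_marks(&comp, 'a', &['\u{0308}']) yields ('ä', vec![]).
    (current, leftover)
}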
/// The outcome of non-recursive canonical decomposition of a character.
#[allow(clippy::exhaustive_enums)]
#[derive(Debug, PartialEq, Eq)]
pub enum Decomposed {
/// The character is its own canonical decomposition.
Default,
/// The character decomposes to a single different character.
Singleton(char),
/// The character decomposes to two characters.
Expansion(char, char),
}
/// The raw (non-recursive) canonical decomposition operation.
///
/// Callers should generally use `DecomposingNormalizer` instead of this API.
/// However, this API is provided for callers such as HarfBuzz that specifically
/// want access to non-recursive canonical decomposition e.g. for use in a
/// glyph-availability-guided custom normalizer.
#[derive(Debug)]
pub struct CanonicalDecomposition {
decompositions: DataPayload<CanonicalDecompositionDataV1Marker>,
tables: DataPayload<CanonicalDecompositionTablesV1Marker>,
non_recursive: DataPayload<NonRecursiveDecompositionSupplementV1Marker>,
}
#[cfg(feature = "compiled_data")]
impl Default for CanonicalDecomposition {
fn default() -> Self {
Self::new()
}
}
impl CanonicalDecomposition {
/// Performs non-recursive canonical decomposition (including for Hangul).
///
/// ```
/// use icu::normalizer::properties::Decomposed;
/// let decomp = icu::normalizer::properties::CanonicalDecomposition::new();
///
/// assert_eq!(decomp.decompose('e'), Decomposed::Default);
/// assert_eq!(
/// decomp.decompose('ệ'),
/// Decomposed::Expansion('ẹ', '\u{0302}')
/// );
/// assert_eq!(decomp.decompose('각'), Decomposed::Expansion('가', 'ᆨ'));
/// assert_eq!(decomp.decompose('\u{212B}'), Decomposed::Singleton('Å')); // ANGSTROM SIGN
/// assert_eq!(decomp.decompose('\u{2126}'), Decomposed::Singleton('Ω')); // OHM SIGN
/// assert_eq!(decomp.decompose('\u{1F71}'), Decomposed::Singleton('ά')); // oxia
/// ```
#[inline]
pub fn decompose(&self, c: char) -> Decomposed {
let lvt = u32::from(c).wrapping_sub(HANGUL_S_BASE);
if lvt >= HANGUL_S_COUNT {
return self.decompose_non_hangul(c);
}
let t = lvt % HANGUL_T_COUNT;
if t == 0 {
let l = lvt / HANGUL_N_COUNT;
let v = (lvt % HANGUL_N_COUNT) / HANGUL_T_COUNT;
// Safe because values known to be in range
return Decomposed::Expansion(
unsafe { char::from_u32_unchecked(HANGUL_L_BASE + l) },
unsafe { char::from_u32_unchecked(HANGUL_V_BASE + v) },
);
}
let lv = lvt - t;
// Safe because values known to be in range
Decomposed::Expansion(
unsafe { char::from_u32_unchecked(HANGUL_S_BASE + lv) },
unsafe { char::from_u32_unchecked(HANGUL_T_BASE + t) },
)
}
/// Performs non-recursive canonical decomposition except Hangul syllables
/// are reported as `Decomposed::Default`.
#[inline(always)]
fn decompose_non_hangul(&self, c: char) -> Decomposed {
let decomposition = self.decompositions.get().trie.get(c);
if decomposition <= BACKWARD_COMBINING_STARTER_MARKER {
return Decomposed::Default;
}
// The loop is only broken out of as a forward goto
#[allow(clippy::never_loop)]
loop {
let trail_or_complex = (decomposition >> 16) as u16;
let lead = decomposition as u16;
if lead > NON_ROUND_TRIP_MARKER && trail_or_complex != 0 {
// Decomposition into two BMP characters: starter and non-starter
if in_inclusive_range(c, '\u{1F71}', '\u{1FFB}') {
// Look in the other trie due to oxia singleton
// mappings to corresponding character with tonos.
break;
}
return Decomposed::Expansion(char_from_u16(lead), char_from_u16(trail_or_complex));
}
if lead > NON_ROUND_TRIP_MARKER {
// Decomposition into one BMP character or non-starter
debug_assert_ne!(
lead, FDFA_MARKER,
"How come we got the U+FDFA NFKD marker here?"
);
if lead == SPECIAL_NON_STARTER_DECOMPOSITION_MARKER_U16 {
// Non-starter
if !in_inclusive_range(c, '\u{0340}', '\u{0F81}') {
return Decomposed::Default;
}
return match c {
'\u{0340}' => {
// COMBINING GRAVE TONE MARK
Decomposed::Singleton('\u{0300}')
}
'\u{0341}' => {
// COMBINING ACUTE TONE MARK
Decomposed::Singleton('\u{0301}')
}
'\u{0343}' => {
// COMBINING GREEK KORONIS
Decomposed::Singleton('\u{0313}')
}
'\u{0344}' => {
// COMBINING GREEK DIALYTIKA TONOS
Decomposed::Expansion('\u{0308}', '\u{0301}')
}
'\u{0F73}' => {
// TIBETAN VOWEL SIGN II
Decomposed::Expansion('\u{0F71}', '\u{0F72}')
}
'\u{0F75}' => {
// TIBETAN VOWEL SIGN UU
Decomposed::Expansion('\u{0F71}', '\u{0F74}')
}
'\u{0F81}' => {
// TIBETAN VOWEL SIGN REVERSED II
Decomposed::Expansion('\u{0F71}', '\u{0F80}')
}
_ => Decomposed::Default,
};
}
return Decomposed::Singleton(char_from_u16(lead));
}
// The recursive decomposition of ANGSTROM SIGN is in the complex
// decomposition structure to avoid a branch in `potential_passthrough`
// for the BMP case.
if c == '\u{212B}' {
// ANGSTROM SIGN
return Decomposed::Singleton('\u{00C5}');
}
// Complex decomposition
// Format for 16-bit value:
// 15..13: length minus two for 16-bit case and length minus one for
// the 32-bit case. Length 8 needs to fit in three bits in
// the 16-bit case, and this way the value is future-proofed
// up to 9 in the 16-bit case. Zero is unused and length one
// in the 16-bit case goes directly into the trie.
// 12: 1 if all trailing characters are guaranteed non-starters,
// 0 if no guarantees about non-starterness.
// Note: The bit choice is this way around to allow for
// dynamically falling back to not having this but instead
// having one more bit for length by merely choosing
// different masks.
// 11..0: Start offset in storage. The offset is to the logical
// sequence of scalars16, scalars32, supplementary_scalars16,
// supplementary_scalars32.
let offset = usize::from(trail_or_complex & 0xFFF);
let tables = self.tables.get();
if offset < tables.scalars16.len() {
if usize::from(trail_or_complex >> 13) != 0 {
// i.e. logical len isn't 2
break;
}
if let Some(first) = tables.scalars16.get(offset) {
if let Some(second) = tables.scalars16.get(offset + 1) {
// Two BMP starters
return Decomposed::Expansion(char_from_u16(first), char_from_u16(second));
}
}
// GIGO case
debug_assert!(false);
return Decomposed::Default;
}
let len = usize::from(trail_or_complex >> 13) + 1;
if len > 2 {
break;
}
let offset24 = offset - tables.scalars16.len();
if let Some(first_c) = tables.scalars24.get(offset24) {
if len == 1 {
if c != first_c {
return Decomposed::Singleton(first_c);
} else {
// Singleton representation used to avoid
// NFC passthrough of characters that combine
// with starters that can occur as the first
// character of an expansion decomposition.
// See section 5 of
// https://www.unicode.org/L2/L2024/24009-utc178-properties-recs.pdf
return Decomposed::Default;
}
}
if let Some(second_c) = tables.scalars24.get(offset24 + 1) {
return Decomposed::Expansion(first_c, second_c);
}
}
// GIGO case
debug_assert!(false);
return Decomposed::Default;
}
let non_recursive = self.non_recursive.get();
let non_recursive_decomposition = non_recursive.trie.get(c);
if non_recursive_decomposition == 0 {
// GIGO case
debug_assert!(false);
return Decomposed::Default;
}
let trail_or_complex = (non_recursive_decomposition >> 16) as u16;
let lead = non_recursive_decomposition as u16;
if lead != 0 && trail_or_complex != 0 {
// Decomposition into two BMP characters
return Decomposed::Expansion(char_from_u16(lead), char_from_u16(trail_or_complex));
}
if lead != 0 {
// Decomposition into one BMP character
return Decomposed::Singleton(char_from_u16(lead));
}
// Decomposition into two non-BMP characters
// Low is offset into a table plus one to keep it non-zero.
let offset = usize::from(trail_or_complex - 1);
if let Some(first) = non_recursive.scalars24.get(offset) {
if let Some(second) = non_recursive.scalars24.get(offset + 1) {
return Decomposed::Expansion(first, second);
}
}
// GIGO case
debug_assert!(false);
Decomposed::Default
}
/// Construct from compiled data.
///
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
///
/// [📚 Help choosing a constructor](icu_provider::constructors)
#[cfg(feature = "compiled_data")]
pub const fn new() -> Self {
const _: () = assert!(
crate::provider::Baked::SINGLETON_NORMALIZER_NFDEX_V1
.scalars16
.const_len()
+ crate::provider::Baked::SINGLETON_NORMALIZER_NFDEX_V1
.scalars24
.const_len()
<= 0xFFF,
"NormalizerError::FutureExtension"
);
Self {
decompositions: DataPayload::from_static_ref(
crate::provider::Baked::SINGLETON_NORMALIZER_NFD_V1,
),
tables: DataPayload::from_static_ref(
crate::provider::Baked::SINGLETON_NORMALIZER_NFDEX_V1,
),
non_recursive: DataPayload::from_static_ref(
crate::provider::Baked::SINGLETON_NORMALIZER_DECOMP_V1,
),
}
}
icu_provider::gen_any_buffer_data_constructors!(locale: skip, options: skip, error: NormalizerError,
#[cfg(skip)]
functions: [
new,
try_new_with_any_provider,
try_new_with_buffer_provider,
try_new_unstable,
Self,
]
);
#[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new)]
pub fn try_new_unstable<D>(provider: &D) -> Result<Self, NormalizerError>
where
D: DataProvider<CanonicalDecompositionDataV1Marker>
+ DataProvider<CanonicalDecompositionTablesV1Marker>
+ DataProvider<NonRecursiveDecompositionSupplementV1Marker>
+ ?Sized,
{
let decompositions: DataPayload<CanonicalDecompositionDataV1Marker> =
provider.load(Default::default())?.take_payload()?;
let tables: DataPayload<CanonicalDecompositionTablesV1Marker> =
provider.load(Default::default())?.take_payload()?;
if tables.get().scalars16.len() + tables.get().scalars24.len() > 0xFFF {
// The data is from a future where there exists a normalization flavor whose
// complex decompositions take more than 0xFFF but fewer than 0x1FFF code points
// of space. If a good use case from such a decomposition flavor arises, we can
// dynamically change the bit masks so that the length mask becomes 0x1FFF instead
// of 0xFFF and the all-non-starters mask becomes 0 instead of 0x1000. However,
// since for now the masks are hard-coded, error out.
return Err(NormalizerError::FutureExtension);
}
let non_recursive: DataPayload<NonRecursiveDecompositionSupplementV1Marker> =
provider.load(Default::default())?.take_payload()?;
Ok(CanonicalDecomposition {
decompositions,
tables,
non_recursive,
})
}
}
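// Editor's illustration, not part of the vendored file: `decompose` above is
// non-recursive, so obtaining the full canonical decomposition means
// re-applying it until every character reports `Decomposed::Default`.
fn decompose_fully(decomp: &CanonicalDecomposition, c: char, out: &mut Vec<char>) {
    match decomp.decompose(c) {
        Decomposed::Default => out.push(c),
        Decomposed::Singleton(single) => decompose_fully(decomp, single, out),
        Decomposed::Expansion(first, second) => {
            decompose_fully(decomp, first, out);
            decompose_fully(decomp, second, out);
        }
    }
}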
/// Lookup of the Canonical_Combining_Class Unicode property.
///
/// # Example
///
/// ```
/// use icu::properties::CanonicalCombiningClass;
/// use icu::normalizer::properties::CanonicalCombiningClassMap;
///
/// let map = CanonicalCombiningClassMap::new();
/// assert_eq!(map.get('a'), CanonicalCombiningClass::NotReordered); // U+0061: LATIN SMALL LETTER A
/// assert_eq!(map.get32(0x0301), CanonicalCombiningClass::Above); // U+0301: COMBINING ACUTE ACCENT
/// ```
#[derive(Debug)]
pub struct CanonicalCombiningClassMap {
/// The data trie
decompositions: DataPayload<CanonicalDecompositionDataV1Marker>,
}
#[cfg(feature = "compiled_data")]
impl Default for CanonicalCombiningClassMap {
fn default() -> Self {
Self::new()
}
}
impl CanonicalCombiningClassMap {
/// Look up the canonical combining class for a scalar value
#[inline(always)]
pub fn get(&self, c: char) -> CanonicalCombiningClass {
self.get32(u32::from(c))
}
/// Look up the canonical combining class for a scalar value
/// represented as `u32`. If the argument is outside the scalar
/// value range, `CanonicalCombiningClass::NotReordered` is returned.
pub fn get32(&self, c: u32) -> CanonicalCombiningClass {
let trie_value = self.decompositions.get().trie.get32(c);
if trie_value_has_ccc(trie_value) {
CanonicalCombiningClass(trie_value as u8)
} else if trie_value_indicates_special_non_starter_decomposition(trie_value) {
match c {
0x0340 | 0x0341 | 0x0343 | 0x0344 => CanonicalCombiningClass::Above,
_ => CanonicalCombiningClass::NotReordered,
}
} else {
CanonicalCombiningClass::NotReordered
}
}
/// Construct from compiled data.
///
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
///
/// [📚 Help choosing a constructor](icu_provider::constructors)
#[cfg(feature = "compiled_data")]
pub const fn new() -> Self {
CanonicalCombiningClassMap {
decompositions: DataPayload::from_static_ref(
crate::provider::Baked::SINGLETON_NORMALIZER_NFD_V1,
),
}
}
icu_provider::gen_any_buffer_data_constructors!(locale: skip, options: skip, error: NormalizerError,
#[cfg(skip)]
functions: [
new,
try_new_with_any_provider,
try_new_with_buffer_provider,
try_new_unstable,
Self,
]);
#[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new)]
pub fn try_new_unstable<D>(provider: &D) -> Result<Self, NormalizerError>
where
D: DataProvider<CanonicalDecompositionDataV1Marker> + ?Sized,
{
let decompositions: DataPayload<CanonicalDecompositionDataV1Marker> =
provider.load(Default::default())?.take_payload()?;
Ok(CanonicalCombiningClassMap { decompositions })
}
}
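The main consumer of this map is the Canonical Ordering Algorithm. A sketch of that use (editor's addition; it assumes the input is already fully decomposed and relies on the public `u8` payload of `CanonicalCombiningClass`):

fn canonical_order(map: &CanonicalCombiningClassMap, text: &mut [char]) {
    let mut i = 0;
    while i < text.len() {
        if map.get(text[i]).0 == 0 {
            i += 1; // starter: stays in place
            continue;
        }
        // Find the maximal run of non-starters and sort it by combining class;
        // `sort_by_key` is a stable sort, as the algorithm requires.
        let start = i;
        while i < text.len() && map.get(text[i]).0 != 0 {
            i += 1;
        }
        text[start..i].sort_by_key(|&c| map.get(c).0);
    }
}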

208
third_party/rust/icu_normalizer/src/provider.rs vendored Normal file

@@ -0,0 +1,208 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! 🚧 \[Unstable\] Data provider struct definitions for this ICU4X component.
//!
//! <div class="stab unstable">
//! 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
//! including in SemVer minor releases. While the serde representation of data structs is guaranteed
//! to be stable, their Rust representation might not be. Use with caution.
//! </div>
//!
//! Read more about data providers: [`icu_provider`]
// Provider structs must be stable
#![allow(clippy::exhaustive_structs, clippy::exhaustive_enums)]
use icu_collections::char16trie::Char16Trie;
use icu_collections::codepointtrie::CodePointTrie;
use icu_provider::prelude::*;
use zerovec::ZeroVec;
#[cfg(feature = "compiled_data")]
#[derive(Debug)]
/// Baked data
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. In particular, the `DataProvider` implementations are only
/// guaranteed to match with this version's `*_unstable` providers. Use with caution.
/// </div>
pub struct Baked;
#[cfg(feature = "compiled_data")]
const _: () = {
pub mod icu {
pub use crate as normalizer;
pub use icu_collections as collections;
}
icu_normalizer_data::make_provider!(Baked);
icu_normalizer_data::impl_normalizer_comp_v1!(Baked);
icu_normalizer_data::impl_normalizer_decomp_v1!(Baked);
icu_normalizer_data::impl_normalizer_nfd_v1!(Baked);
icu_normalizer_data::impl_normalizer_nfdex_v1!(Baked);
icu_normalizer_data::impl_normalizer_nfkd_v1!(Baked);
icu_normalizer_data::impl_normalizer_nfkdex_v1!(Baked);
icu_normalizer_data::impl_normalizer_uts46d_v1!(Baked);
};
#[cfg(feature = "datagen")]
/// The latest minimum set of keys required by this component.
pub const KEYS: &[DataKey] = &[
CanonicalCompositionsV1Marker::KEY,
CanonicalDecompositionDataV1Marker::KEY,
CanonicalDecompositionTablesV1Marker::KEY,
CompatibilityDecompositionSupplementV1Marker::KEY,
CompatibilityDecompositionTablesV1Marker::KEY,
NonRecursiveDecompositionSupplementV1Marker::KEY,
Uts46DecompositionSupplementV1Marker::KEY,
];
/// Main data for NFD
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[icu_provider::data_struct(marker(
CanonicalDecompositionDataV1Marker,
"normalizer/nfd@1",
singleton
))]
#[derive(Debug, PartialEq, Clone)]
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake), databake(path = icu_normalizer::provider))]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
pub struct DecompositionDataV1<'data> {
/// Trie for NFD decomposition.
#[cfg_attr(feature = "serde", serde(borrow))]
pub trie: CodePointTrie<'data, u32>,
}
/// Data that either NFKD or the decomposed form of UTS 46 needs
/// _in addition to_ the NFD data.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[icu_provider::data_struct(
marker(
CompatibilityDecompositionSupplementV1Marker,
"normalizer/nfkd@1",
singleton
),
marker(Uts46DecompositionSupplementV1Marker, "normalizer/uts46d@1", singleton)
)]
#[derive(Debug, PartialEq, Clone)]
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake), databake(path = icu_normalizer::provider))]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
pub struct DecompositionSupplementV1<'data> {
/// Trie for the decompositions that differ from NFD.
/// Getting a zero from this trie means that you need
/// to make another lookup from `DecompositionDataV1::trie`.
#[cfg_attr(feature = "serde", serde(borrow))]
pub trie: CodePointTrie<'data, u32>,
/// Flags that indicate how the set of characters whose
/// decompositions starts with a non-starter differs from
/// the set for NFD.
///
/// Bit 0: Whether half-width kana voicing marks decompose
/// into non-starters (their full-width combining
/// counterparts).
/// Bit 1: Whether U+0345 COMBINING GREEK YPOGEGRAMMENI
/// decomposes into a starter (U+03B9 GREEK SMALL
/// LETTER IOTA).
/// (Other bits unused.)
pub flags: u8,
/// The passthrough bounds of NFD/NFC are lowered to this
/// maximum instead. (16-bit, because cannot be higher
/// than 0x0300, which is the bound for NFC.)
pub passthrough_cap: u16,
}
impl DecompositionSupplementV1<'_> {
const HALF_WIDTH_VOICING_MARK_MASK: u8 = 1;
/// Whether half-width kana voicing marks decompose into non-starters
/// (their full-width combining counterparts).
pub fn half_width_voicing_marks_become_non_starters(&self) -> bool {
(self.flags & DecompositionSupplementV1::HALF_WIDTH_VOICING_MARK_MASK) != 0
}
}
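// Editor's sketch, not part of the vendored crate: bit 1 of `flags` described
// above could be queried analogously to bit 0. The mask constant and method
// name here are hypothetical.
impl DecompositionSupplementV1<'_> {
    const IOTA_SUBSCRIPT_MASK: u8 = 1 << 1;
    /// Whether U+0345 COMBINING GREEK YPOGEGRAMMENI decomposes into a
    /// starter (U+03B9 GREEK SMALL LETTER IOTA).
    pub fn iota_subscript_becomes_starter(&self) -> bool {
        (self.flags & Self::IOTA_SUBSCRIPT_MASK) != 0
    }
}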
/// The expansion tables for cases where the decomposition isn't
/// contained in the trie value
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[icu_provider::data_struct(
marker(CanonicalDecompositionTablesV1Marker, "normalizer/nfdex@1", singleton),
marker(
CompatibilityDecompositionTablesV1Marker,
"normalizer/nfkdex@1",
singleton
)
)]
#[derive(Debug, PartialEq, Clone)]
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake), databake(path = icu_normalizer::provider))]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
pub struct DecompositionTablesV1<'data> {
/// Decompositions that are fully within the BMP
#[cfg_attr(feature = "serde", serde(borrow))]
pub scalars16: ZeroVec<'data, u16>,
/// Decompositions with at least one character outside
/// the BMP
#[cfg_attr(feature = "serde", serde(borrow))]
pub scalars24: ZeroVec<'data, char>,
}
/// Non-Hangul canonical compositions
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[icu_provider::data_struct(marker(CanonicalCompositionsV1Marker, "normalizer/comp@1", singleton))]
#[derive(Debug, PartialEq, Clone)]
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake), databake(path = icu_normalizer::provider))]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
pub struct CanonicalCompositionsV1<'data> {
/// Trie keys are two-`char` strings with the second
/// character coming first. The value, if any, is the
/// (non-Hangul) canonical composition.
#[cfg_attr(feature = "serde", serde(borrow))]
pub canonical_compositions: Char16Trie<'data>,
}
/// Non-recursive canonical decompositions that differ from
/// `DecompositionDataV1`.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[icu_provider::data_struct(marker(
NonRecursiveDecompositionSupplementV1Marker,
"normalizer/decomp@1",
singleton
))]
#[derive(Debug, PartialEq, Clone)]
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake), databake(path = icu_normalizer::provider))]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
pub struct NonRecursiveDecompositionSupplementV1<'data> {
/// Trie for the supplementary non-recursive decompositions
#[cfg_attr(feature = "serde", serde(borrow))]
pub trie: CodePointTrie<'data, u32>,
/// Decompositions with at least one character outside
/// the BMP
#[cfg_attr(feature = "serde", serde(borrow))]
pub scalars24: ZeroVec<'data, char>,
}

136
third_party/rust/icu_normalizer/src/uts46.rs vendored Normal file

@@ -0,0 +1,136 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! Bundles the part of UTS 46 that makes sense to implement as a
//! normalization.
//!
//! This is meant to be used as a building block of a UTS 46
//! implementation, such as the `idna` crate.
use crate::CanonicalCompositionsV1Marker;
use crate::CanonicalDecompositionDataV1Marker;
use crate::CanonicalDecompositionTablesV1Marker;
use crate::CompatibilityDecompositionTablesV1Marker;
use crate::ComposingNormalizer;
use crate::NormalizerError;
use crate::Uts46DecompositionSupplementV1Marker;
use icu_provider::DataProvider;
// Implementation note: Despite merely wrapping a `ComposingNormalizer`,
// having a `Uts46Mapper` serves two purposes:
//
// 1. Denying public access to parts of the `ComposingNormalizer` API
// that don't work when the data contains markers for ignorables.
// 2. Providing a place where additional iterator pre-processing or
// post-processing can take place if needed in the future. (When
// writing this, it looked like such processing was needed but
// now isn't needed after all.)
/// A mapper that knows how to perform the subsets of UTS 46 processing
/// documented on the methods.
#[derive(Debug)]
pub struct Uts46Mapper {
normalizer: ComposingNormalizer,
}
#[cfg(feature = "compiled_data")]
impl Default for Uts46Mapper {
fn default() -> Self {
Self::new()
}
}
impl Uts46Mapper {
/// Construct with compiled data.
#[cfg(feature = "compiled_data")]
pub const fn new() -> Self {
Uts46Mapper {
normalizer: ComposingNormalizer::new_uts46(),
}
}
/// Construct with provider.
#[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new)]
pub fn try_new<D>(provider: &D) -> Result<Self, NormalizerError>
where
D: DataProvider<CanonicalDecompositionDataV1Marker>
+ DataProvider<Uts46DecompositionSupplementV1Marker>
+ DataProvider<CanonicalDecompositionTablesV1Marker>
+ DataProvider<CompatibilityDecompositionTablesV1Marker>
// UTS 46 tables merged into CompatibilityDecompositionTablesV1Marker
+ DataProvider<CanonicalCompositionsV1Marker>
+ ?Sized,
{
let normalizer = ComposingNormalizer::try_new_uts46_unstable(provider)?;
Ok(Uts46Mapper { normalizer })
}
/// Returns an iterator adaptor that turns an `Iterator` over `char`
/// into an iterator yielding a `char` sequence that gets the following
/// operations from the "Map" and "Normalize" steps of the "Processing"
/// section of UTS 46 lazily applied to it:
///
/// 1. The _ignored_ characters are ignored.
/// 2. The _mapped_ characters are mapped.
/// 3. The _disallowed_ characters are replaced with U+FFFD,
/// which itself is a disallowed character.
/// 4. The _deviation_ characters are treated as _mapped_ or _valid_
/// as appropriate.
/// 5. The _disallowed_STD3_valid_ characters are treated as allowed.
/// 6. The _disallowed_STD3_mapped_ characters are treated as
/// _mapped_.
/// 7. The result is normalized to NFC.
///
/// Notably:
///
/// * The STD3 or WHATWG ASCII deny list should be implemented as a
/// post-processing step.
/// * Transitional processing is not performed. Transitional mapping
/// would be a pre-processing step, but transitional processing is
/// deprecated, and none of Firefox, Safari, or Chrome use it.
pub fn map_normalize<'delegate, I: Iterator<Item = char> + 'delegate>(
&'delegate self,
iter: I,
) -> impl Iterator<Item = char> + 'delegate {
self.normalizer
.normalize_iter_private(iter, crate::IgnorableBehavior::Ignored)
}
/// Returns an iterator adaptor that turns an `Iterator` over `char`
/// into an iterator yielding a `char` sequence that gets the following
/// operations from the NFC check and status steps of the "Validity
/// Criteria" section of UTS 46 lazily applied to it:
///
/// 1. The _ignored_ characters are treated as _disallowed_.
/// 2. The _mapped_ characters are mapped.
/// 3. The _disallowed_ characters are replaced with U+FFFD,
/// which itself is a disallowed character.
/// 4. The _deviation_ characters are treated as _mapped_ or _valid_
/// as appropriate.
/// 5. The _disallowed_STD3_valid_ characters are treated as allowed.
/// 6. The _disallowed_STD3_mapped_ characters are treated as
/// _mapped_.
/// 7. The result is normalized to NFC.
///
/// Notably:
///
/// * The STD3 or WHATWG ASCII deny list should be implemented as a
/// post-processing step.
/// * Transitional processing is not performed. Transitional mapping
/// would be a pre-processing step, but transitional processing is
/// deprecated, and none of Firefox, Safari, or Chrome use it.
/// * The output needs to be compared with input to see if anything
/// changed. This check catches failures to adhere to the normalization
/// and status requirements. In particular, this comparison makes
/// _mapped_ characters result in an error, as the "Validity
/// Criteria" section requires.
pub fn normalize_validate<'delegate, I: Iterator<Item = char> + 'delegate>(
&'delegate self,
iter: I,
) -> impl Iterator<Item = char> + 'delegate {
self.normalizer
.normalize_iter_private(iter, crate::IgnorableBehavior::ReplacementCharacter)
}
}
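A minimal end-to-end sketch (editor's addition; it assumes the `compiled_data` feature and that the mapper is reachable as `icu_normalizer::uts46::Uts46Mapper`):

use icu_normalizer::uts46::Uts46Mapper;

fn main() {
    let mapper = Uts46Mapper::new();
    // "Map" plus "Normalize": U+0065 U+0301 composes to U+00E9 under NFC.
    let label: String = mapper.map_normalize("e\u{0301}".chars()).collect();
    assert_eq!(label, "\u{00E9}");
}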


@@ -0,0 +1,4 @@
# This is a placeholder in the interest of keeping the repository size smaller.
# Replace this file with the contents of
# https://www.unicode.org/Public/UCD/latest/ucd/NormalizationTest.txt to actually
# run the conformance test.

2
third_party/rust/icu_normalizer/tests/data/README.md vendored Normal file

@@ -0,0 +1,2 @@
The test data comes from
https://www.unicode.org/Public/UCD/latest/ucd/NormalizationTest.txt

1549
third_party/rust/icu_normalizer/tests/tests.rs vendored Normal file

Diff not shown because of its large size.

1
third_party/rust/icu_normalizer_data/.cargo-checksum.json vendored Normal file

@@ -0,0 +1 @@
{"files":{"Cargo.toml":"8a831cf2a49499d9005ed9b4fa48cda2311e43b8a56272f07f48be45f93bfc79","LICENSE":"f367c1b8e1aa262435251e442901da4607b4650e0e63a026f5044473ecfb90f2","README.md":"52aa166967a2e729c2bbe88d827ed5f27e0908c7bf99806f534685c042961577","data/macros.rs":"01406adb7f8a71771640320ee0dffda2e8f721426fd0244b5e428c7e19c2dda2","data/macros/normalizer_comp_v1.rs.data":"4fea06eeaa69c3d3c18b8a854c7af369c0eadfb97cb79e32f8ccd62bbef81234","data/macros/normalizer_decomp_v1.rs.data":"cbe2a0e5ddacb10d1718f7f83ca5cd261b9618cf31b27cd46bfc61363bfc1a90","data/macros/normalizer_nfd_v1.rs.data":"1692d8a94a94afcb25dc4cadd2f413f6b20f8735128d8f2a4c4d7ade6c6e9c86","data/macros/normalizer_nfdex_v1.rs.data":"80eebad6112ac9a3af7120c6a6e7d9c8acf765e4b6ec482a33520ea05e5e27c4","data/macros/normalizer_nfkd_v1.rs.data":"6918be7b4c8f39c24b69f7958175abe8cc846a99cf1067fe09293dc919d5e963","data/macros/normalizer_nfkdex_v1.rs.data":"919d8973135e4a258094b3de711479e6d066de8f4579182b3ecb69a6cdb66e6e","data/macros/normalizer_uts46d_v1.rs.data":"081e089334626c603e2071060326d74328d6a22b0a71e5ead004f50c8956bd94","src/lib.rs":"6dadcea5dc4643966028d0470bd90f7ad5197709599571bd1750df8aa6d37e51"},"package":"f8cafbf7aa791e9b22bec55a167906f9e1215fd475cd22adfcf660e03e989516"}

42
third_party/rust/icu_normalizer_data/Cargo.toml vendored Normal file

@@ -0,0 +1,42 @@
# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
#
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
# to registry (e.g., crates.io) dependencies.
#
# If you are reading this file be aware that the original Cargo.toml
# will likely look very different (and much more reasonable).
# See Cargo.toml.orig for the original contents.
[package]
edition = "2021"
rust-version = "1.67"
name = "icu_normalizer_data"
version = "1.5.0"
authors = ["The ICU4X Project Developers"]
include = [
"data/**/*",
"src/**/*",
"examples/**/*",
"benches/**/*",
"tests/**/*",
"Cargo.toml",
"LICENSE",
"README.md",
]
description = "Data for the icu_normalizer crate"
homepage = "https://icu4x.unicode.org"
readme = "README.md"
categories = ["internationalization"]
license = "Unicode-3.0"
repository = "https://github.com/unicode-org/icu4x"
[package.metadata.sources.cldr]
tagged = "45.0.0"
[package.metadata.sources.icuexport]
tagged = "icu4x/2024-05-16/75.x"
[package.metadata.sources.segmenter_lstm]
tagged = "v0.1.0"

46
third_party/rust/icu_normalizer_data/LICENSE vendored Normal file

@@ -0,0 +1,46 @@
UNICODE LICENSE V3
COPYRIGHT AND PERMISSION NOTICE
Copyright © 2020-2024 Unicode, Inc.
NOTICE TO USER: Carefully read the following legal agreement. BY
DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING DATA FILES, AND/OR
SOFTWARE, YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
TERMS AND CONDITIONS OF THIS AGREEMENT. IF YOU DO NOT AGREE, DO NOT
DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE THE DATA FILES OR SOFTWARE.
Permission is hereby granted, free of charge, to any person obtaining a
copy of data files and any associated documentation (the "Data Files") or
software and any associated documentation (the "Software") to deal in the
Data Files or Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, and/or sell
copies of the Data Files or Software, and to permit persons to whom the
Data Files or Software are furnished to do so, provided that either (a)
this copyright and permission notice appear with all copies of the Data
Files or Software, or (b) this copyright and permission notice appear in
associated Documentation.
THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
THIRD PARTY RIGHTS.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE
BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES,
OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE DATA
FILES OR SOFTWARE.
Except as contained in this notice, the name of a copyright holder shall
not be used in advertising or otherwise to promote the sale, use or other
dealings in these Data Files or Software without prior written
authorization of the copyright holder.
SPDX-License-Identifier: Unicode-3.0
Portions of ICU4X may have been adapted from ICU4C and/or ICU4J.
ICU 1.8.1 to ICU 57.1 © 1995-2016 International Business Machines Corporation and others.

14
third_party/rust/icu_normalizer_data/README.md vendored Normal file

@@ -0,0 +1,14 @@
# icu_normalizer_data [![crates.io](https://img.shields.io/crates/v/icu_normalizer_data)](https://crates.io/crates/icu_normalizer_data)
<!-- cargo-rdme start -->
Data for the `icu_normalizer` crate
This data was generated with CLDR version 45.0.0, ICU version icu4x/2024-05-16/75.x, and
LSTM segmenter version v0.1.0.
<!-- cargo-rdme end -->
## More Information
For more information on development, authorship, contributing etc. please visit [`ICU4X home page`](https://github.com/unicode-org/icu4x).

76
third_party/rust/icu_normalizer_data/data/macros.rs vendored Normal file

@@ -0,0 +1,76 @@
// @generated
/// Marks a type as a data provider. You can then use macros like
/// `impl_core_helloworld_v1` to add implementations.
///
/// ```ignore
/// struct MyProvider;
/// const _: () = {
/// include!("path/to/generated/macros.rs");
/// make_provider!(MyProvider);
/// impl_core_helloworld_v1!(MyProvider);
/// }
/// ```
#[doc(hidden)]
#[macro_export]
macro_rules! __make_provider {
($ name : ty) => {
#[clippy::msrv = "1.67"]
impl $name {
#[doc(hidden)]
#[allow(dead_code)]
pub const MUST_USE_MAKE_PROVIDER_MACRO: () = ();
}
icu_provider::impl_data_provider_never_marker!($name);
};
}
#[doc(inline)]
pub use __make_provider as make_provider;
#[macro_use]
#[path = "macros/normalizer_comp_v1.rs.data"]
mod normalizer_comp_v1;
#[doc(inline)]
pub use __impl_normalizer_comp_v1 as impl_normalizer_comp_v1;
#[doc(inline)]
pub use __impliterable_normalizer_comp_v1 as impliterable_normalizer_comp_v1;
#[macro_use]
#[path = "macros/normalizer_decomp_v1.rs.data"]
mod normalizer_decomp_v1;
#[doc(inline)]
pub use __impl_normalizer_decomp_v1 as impl_normalizer_decomp_v1;
#[doc(inline)]
pub use __impliterable_normalizer_decomp_v1 as impliterable_normalizer_decomp_v1;
#[macro_use]
#[path = "macros/normalizer_nfd_v1.rs.data"]
mod normalizer_nfd_v1;
#[doc(inline)]
pub use __impl_normalizer_nfd_v1 as impl_normalizer_nfd_v1;
#[doc(inline)]
pub use __impliterable_normalizer_nfd_v1 as impliterable_normalizer_nfd_v1;
#[macro_use]
#[path = "macros/normalizer_nfdex_v1.rs.data"]
mod normalizer_nfdex_v1;
#[doc(inline)]
pub use __impl_normalizer_nfdex_v1 as impl_normalizer_nfdex_v1;
#[doc(inline)]
pub use __impliterable_normalizer_nfdex_v1 as impliterable_normalizer_nfdex_v1;
#[macro_use]
#[path = "macros/normalizer_nfkd_v1.rs.data"]
mod normalizer_nfkd_v1;
#[doc(inline)]
pub use __impl_normalizer_nfkd_v1 as impl_normalizer_nfkd_v1;
#[doc(inline)]
pub use __impliterable_normalizer_nfkd_v1 as impliterable_normalizer_nfkd_v1;
#[macro_use]
#[path = "macros/normalizer_nfkdex_v1.rs.data"]
mod normalizer_nfkdex_v1;
#[doc(inline)]
pub use __impl_normalizer_nfkdex_v1 as impl_normalizer_nfkdex_v1;
#[doc(inline)]
pub use __impliterable_normalizer_nfkdex_v1 as impliterable_normalizer_nfkdex_v1;
#[macro_use]
#[path = "macros/normalizer_uts46d_v1.rs.data"]
mod normalizer_uts46d_v1;
#[doc(inline)]
pub use __impl_normalizer_uts46d_v1 as impl_normalizer_uts46d_v1;
#[doc(inline)]
pub use __impliterable_normalizer_uts46d_v1 as impliterable_normalizer_uts46d_v1;

Diffs for the seven normalizer data files (macros/*.rs.data) are hidden because one or more lines are too long.

17
third_party/rust/icu_normalizer_data/src/lib.rs vendored Normal file

@ -0,0 +1,17 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! Data for the `icu_normalizer` crate
//!
//! This data was generated with CLDR version 45.0.0, ICU version icu4x/2024-05-16/75.x, and
//! LSTM segmenter version v0.1.0.
#![no_std]
// The generated source is unreadable and massive when rendered as HTML.
#![doc(html_no_source)]
#[cfg(icu4x_custom_data)]
include!(concat!(core::env!("ICU4X_DATA_DIR"), "/macros.rs"));
#[cfg(not(icu4x_custom_data))]
include!("../data/macros.rs");

2
third_party/rust/idna/.cargo-checksum.json vendored

@ -1 +1 @@
{"files":{"Cargo.toml":"8be30a9748419aed461ce333e260ff4a461bf8166dfc7768307f32fcfc4fbea1","LICENSE-APACHE":"a60eea817514531668d7e00765731449fe14d059d3249e0bc93b36de45f759f2","LICENSE-MIT":"76e972ac0f4ddb116e86e10100132a783931a596e7b9872eaa31be15cd4d751d","benches/all.rs":"e734b9c9092ed66986725f86cfe90f3756cfddb058af308b796ba494f9beefc2","src/IdnaMappingTable.txt":"87d6553a4b86bc49dcade38bf26b745cd81800eb8af295dc3fb99b4729eaea38","src/lib.rs":"e7fd80070a7e52dfd1e9fe785bf092eddc9fb421fd0f9a1ba1c2189b8d40d3ed","src/make_uts46_mapping_table.py":"917055fa841f813de2bcf79cc79b595da3d5551559ee768db8660ab77cb26c34","src/punycode.rs":"3697674a70647d200853ac9d1910ffcb4796534332fe328de16c4bb1283e2ec1","src/uts46.rs":"4eee036b6448489002ac5190f3ac28834a4caa063c7cc77474ea6256199619ae","src/uts46_mapping_table.rs":"942fff78147c61da942f5f3a7ff4e90f9d7a00a29285733ac3fc3357eb2ed06f","tests/IdnaTestV2.txt":"c6f3778b0545fd150c8063286c7f5adc901e16557eddccc3751213646d07593d","tests/bad_punycode_tests.json":"ff0a15479ed2cb08f7b4b39465160da66d1ac7575e5d76990c17e7b76cb5e0f5","tests/punycode.rs":"0b0f315a8b124c1275a423a69169b13b19bcd7e9e6a5158bd0d642d01c6db145","tests/punycode_tests.json":"3d4ac0cf25984c37b9ce197f5df680a0136f728fb8ec82bc76624e42139eb3a8","tests/tests.rs":"d205a2bfb29dfee73e014faebd3207a55ef0d40121e6dbd52f5d611b37ac111e","tests/unit.rs":"be025a7d9bab3bd1ce134c87f9d848269e157b31ca5ba0ea03426c1ac736b69e","tests/uts46.rs":"06c97bf7dc20f5372b542fa46922d6dd63fe15e0aa34d799d08df9e3a241aa21"},"package":"634d9b1461af396cad843f47fdba5597a4f9e6ddd4bfb6ff5d85028c25cb12f6"}
{"files":{"Cargo.toml":"d453ab4fa012a1f5d9233aa29fa03a7d5bcff06008f2197ce0ddac7e7aa28b2b","LICENSE-APACHE":"a60eea817514531668d7e00765731449fe14d059d3249e0bc93b36de45f759f2","LICENSE-MIT":"76e972ac0f4ddb116e86e10100132a783931a596e7b9872eaa31be15cd4d751d","README.md":"dd73e159f3b31a7070f4564f9e68dca14495452e3b30d6fe4ca1d84656b69ee6","benches/all.rs":"53002f41ac38bdd5b1bb0a7ec8d5a9b49ce6cd3d073ce16c1014f9d4e90b762b","src/deprecated.rs":"bdba5a73432d9755c831ec01edf4d512f9390b351dba0eb8ce7b0430fa1073ad","src/lib.rs":"4d30605daf5c18d282d460ee561c7e5218aea76cf33fc072fd79f9617256f04e","src/punycode.rs":"2d9dda9bb6504863ea6f374e9ab4192ccc475a789a43a0fb624b15459a611fbc","src/uts46.rs":"2e719c93954930de20789896b153af7dd84c20e14edba6317f9dd80e3baaccc9","tests/IdnaTestV2.txt":"d668c4ea58d60fe04e6c011df98e0b317da6abaa1273d58f42b581eb0dd7adda","tests/bad_punycode_tests.json":"ff0a15479ed2cb08f7b4b39465160da66d1ac7575e5d76990c17e7b76cb5e0f5","tests/deprecated.rs":"cce256f6616a19314330a06003d6308138aae8257136431d143f062f14ab17c7","tests/punycode.rs":"75fa73b6429ccacaeb5d72fab0b927cdf9f2173a9fc5fb366697bf7002b73921","tests/punycode_tests.json":"50859b828d14d5eeba5ab930de25fb72a35310a0b46f421f65d64c7c3e54d08a","tests/tests.rs":"ecee59f0b0be27ba1e7b24bb449c681024253d0275065f0f0e258e7ec2977d12","tests/unit.rs":"7e450599b52900baa51ea26ff0cb55a830456f60642985abbc87ec671a91b8e1","tests/unitbis.rs":"545259b767cd045aed01c1515c3b092d1b3f6b3366ce88d1593a2c8e3ffcd2af","tests/uts46.rs":"0a1c339708f1ab845d726b1f55dc1be8a423a1304b0399234391d0bd419e3fe0"},"package":"bd69211b9b519e98303c015e21a007e293db403b6c85b9b124e133d25e242cdd"}

45
third_party/rust/idna/Cargo.toml vendored

@ -11,13 +11,18 @@
[package]
edition = "2018"
rust-version = "1.51"
rust-version = "1.67"
name = "idna"
version = "0.5.0"
version = "1.0.2"
authors = ["The rust-url developers"]
autotests = false
description = "IDNA (Internationalizing Domain Names in Applications) and Punycode."
categories = ["no_std"]
readme = "README.md"
keywords = [
"no_std",
"web",
"http",
]
license = "MIT OR Apache-2.0"
repository = "https://github.com/servo/rust-url/"
@ -34,18 +39,25 @@ harness = false
[[test]]
name = "unit"
[[test]]
name = "unitbis"
[[bench]]
name = "all"
harness = false
[dependencies.unicode-bidi]
version = "0.3.10"
features = ["hardcoded-data"]
default-features = false
[dependencies.icu_normalizer]
version = "1.4.3"
[dependencies.unicode-normalization]
version = "0.1.22"
default-features = false
[dependencies.icu_properties]
version = "1.4.2"
[dependencies.smallvec]
version = "1.13.1"
features = ["const_generics"]
[dependencies.utf8_iter]
version = "1.0.4"
[dev-dependencies.assert_matches]
version = "1.3"
@ -61,9 +73,12 @@ version = "0.9"
[features]
alloc = []
default = ["std"]
std = [
"alloc",
"unicode-bidi/std",
"unicode-normalization/std",
compiled_data = [
"icu_normalizer/compiled_data",
"icu_properties/compiled_data",
]
default = [
"std",
"compiled_data",
]
std = ["alloc"]

38
third_party/rust/idna/README.md vendored Normal file

@ -0,0 +1,38 @@
# `idna`
IDNA library for Rust implementing [UTS 46: Unicode IDNA Compatibility Processing](https://www.unicode.org/reports/tr46/) as parametrized by the [WHATWG URL Standard](https://url.spec.whatwg.org/#idna).
## What it does
* An implementation of UTS 46 is provided, with a configurable ASCII deny list (e.g. STD3 or WHATWG rules).
* A callback mechanism is provided for pluggable logic for deciding if a label is deemed potentially too misleading to render as Unicode in a user interface.
* Errors are marked as U+FFFD REPLACEMENT CHARACTERs in Unicode output so that locations of errors may be illustrated to the user.
## What it does not do
* There is no default/sample policy provided for the callback mechanism mentioned above.
* Only UTS 46 is implemented: There is no API to request strictly IDNA 2008 only or strictly IDNA 2003 only.
* There is no API for categorizing errors beyond there being an error.
* Checks that are configurable in UTS 46 but that the WHATWG URL Standard always sets a particular way (regardless of the _beStrict_ flag in the URL Standard) cannot be configured (with the exception of the old deprecated API supporting transitional processing).
## Usage
Apps that need to prepare a hostname for usage in protocols are likely to only need the top-level function `domain_to_ascii_cow` with `AsciiDenyList::URL` as the second argument. Note that this rejects IPv6 addresses, so before this, you need to check if the first byte of the input is `b'['` and, if it is, treat the input as an IPv6 address instead.
Apps that need to display host names to the user should use `uts46::Uts46::to_user_interface`. The _ToUnicode_ operation is rarely appropriate for direct application usage.
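A minimal sketch of that flow (the `ascii_host` helper and its IPv6 hand-off are illustrative, not crate API):
use idna::{domain_to_ascii_cow, AsciiDenyList};
use std::borrow::Cow;
fn ascii_host(input: &[u8]) -> Option<Cow<'_, str>> {
    if input.first() == Some(&b'[') {
        // IPv6 literal: hand off to an address parser instead (not shown).
        return None;
    }
    domain_to_ascii_cow(input, AsciiDenyList::URL).ok()
}
fn main() {
    assert_eq!(ascii_host("B\u{fc}cher.de".as_bytes()).unwrap(), "xn--bcher-kva.de");
}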
## Cargo features
* `alloc` - For future proofing. Currently always required. The crate internals may allocate on the heap, but for typical inputs they do not (apart from the output `String` when applicable).
* `compiled_data` - For future proofing. Currently always required. (Passed through to ICU4X.)
* `std` - Adds `impl std::error::Error for Errors {}` (and implies `alloc`).
* By default, all of the above are enabled.
## Breaking changes since 0.5.0
* Stricter IDNA 2008 restrictions are no longer supported. Attempting to enable them panics immediately. UTS 46 allows all the names that IDNA 2008 allows, and when transitional processing is disabled, they resolve the same way. There are additional names that IDNA 2008 disallows but UTS 46 maps to names that IDNA 2008 allows (notably, input is mapped to fold-case output). UTS 46 also allows symbols that were allowed in IDNA 2003 as well as newer symbols that are allowed according to the same principle. (Earlier versions of this crate allowed rejecting such symbols. Rejecting characters that UTS 46 maps to IDNA 2008-permitted characters wasn't supported in earlier versions, either.)
* `domain_to_ascii_strict` now performs the _CheckHyphens_ check (matching previous documentation).
* The ContextJ rules are now implemented and always enabled, even when using the old deprecated API, so input that fails those rules is rejected.
* The `Idna::to_ascii_inner` method has been removed. It didn't make sense as a public method, since callers were unable to figure out if there were errors. (A GitHub search found no callers for this method.)
* Punycode labels whose decoding does not yield any non-ASCII characters are now treated as being in error.
* When turning off default cargo features, the cargo feature `compiled_data` needs to be explicitly enabled.

9
third_party/rust/idna/benches/all.rs vendored

@ -1,3 +1,5 @@
#![allow(deprecated)]
#[macro_use]
extern crate bencher;
extern crate idna;
@ -11,6 +13,12 @@ fn to_unicode_puny_label(bench: &mut Bencher) {
bench.iter(|| config.to_unicode(black_box(encoded)));
}
fn to_ascii_already_puny_label(bench: &mut Bencher) {
let encoded = "abc.xn--mgbcm";
let config = Config::default();
bench.iter(|| config.to_ascii(black_box(encoded)));
}
fn to_unicode_ascii(bench: &mut Bencher) {
let encoded = "example.com";
let config = Config::default();
@ -47,6 +55,7 @@ benchmark_group!(
to_unicode_ascii,
to_unicode_merged_label,
to_ascii_puny_label,
to_ascii_already_puny_label,
to_ascii_simple,
to_ascii_merged,
);

8727
third_party/rust/idna/src/IdnaMappingTable.txt vendored

Diff not shown due to its large size.

248
third_party/rust/idna/src/deprecated.rs vendored Normal file

@ -0,0 +1,248 @@
// Copyright 2013-2014 The rust-url developers.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
//! Deprecated API for [*Unicode IDNA Compatibility Processing*
//! (Unicode Technical Standard #46)](http://www.unicode.org/reports/tr46/)
#![allow(deprecated)]
use alloc::borrow::Cow;
use alloc::string::String;
use crate::uts46::*;
use crate::Errors;
/// Performs preprocessing equivalent to UTS 46 transitional processing
/// if `transitional` is `true`. If `transitional` is `false`, merely
/// lets the input pass through as-is (for call site convenience).
///
/// The output of this function is to be passed to [`Uts46::process`].
fn map_transitional(domain: &str, transitional: bool) -> Cow<'_, str> {
if !transitional {
return Cow::Borrowed(domain);
}
let mut chars = domain.chars();
loop {
let prev = chars.clone();
if let Some(c) = chars.next() {
match c {
'ß' | 'ẞ' | 'ς' | '\u{200C}' | '\u{200D}' => {
let mut s = String::with_capacity(domain.len());
let tail = prev.as_str();
let head = &domain[..domain.len() - tail.len()];
s.push_str(head);
for c in tail.chars() {
match c {
'ß' | 'ẞ' => {
s.push_str("ss");
}
'ς' => {
s.push('σ');
}
'\u{200C}' | '\u{200D}' => {}
_ => {
s.push(c);
}
}
}
return Cow::Owned(s);
}
_ => {}
}
} else {
break;
}
}
Cow::Borrowed(domain)
}
/// Deprecated. Use the crate-top-level functions or [`Uts46`].
#[derive(Default)]
#[deprecated]
pub struct Idna {
config: Config,
}
impl Idna {
pub fn new(config: Config) -> Self {
Self { config }
}
/// [UTS 46 ToASCII](http://www.unicode.org/reports/tr46/#ToASCII)
#[allow(clippy::wrong_self_convention)] // Retain old weirdness in deprecated API
pub fn to_ascii(&mut self, domain: &str, out: &mut String) -> Result<(), Errors> {
let mapped = map_transitional(domain, self.config.transitional_processing);
match Uts46::new().process(
mapped.as_bytes(),
self.config.deny_list(),
self.config.hyphens(),
ErrorPolicy::FailFast, // Old code did not appear to expect the output to be useful in the error case.
|_, _, _| false,
out,
None,
) {
Ok(ProcessingSuccess::Passthrough) => {
if self.config.verify_dns_length && !verify_dns_length(&mapped, true) {
return Err(crate::Errors::default());
}
out.push_str(&mapped);
Ok(())
}
Ok(ProcessingSuccess::WroteToSink) => {
if self.config.verify_dns_length && !verify_dns_length(out, true) {
return Err(crate::Errors::default());
}
Ok(())
}
Err(ProcessingError::ValidityError) => Err(crate::Errors::default()),
Err(ProcessingError::SinkError) => unreachable!(),
}
}
/// [UTS 46 ToUnicode](http://www.unicode.org/reports/tr46/#ToUnicode)
#[allow(clippy::wrong_self_convention)] // Retain old weirdness in deprecated API
pub fn to_unicode(&mut self, domain: &str, out: &mut String) -> Result<(), Errors> {
let mapped = map_transitional(domain, self.config.transitional_processing);
match Uts46::new().process(
mapped.as_bytes(),
self.config.deny_list(),
self.config.hyphens(),
ErrorPolicy::MarkErrors,
|_, _, _| true,
out,
None,
) {
Ok(ProcessingSuccess::Passthrough) => {
out.push_str(&mapped);
Ok(())
}
Ok(ProcessingSuccess::WroteToSink) => Ok(()),
Err(ProcessingError::ValidityError) => Err(crate::Errors::default()),
Err(ProcessingError::SinkError) => unreachable!(),
}
}
}
/// Deprecated configuration API.
#[derive(Clone, Copy)]
#[must_use]
#[deprecated]
pub struct Config {
use_std3_ascii_rules: bool,
transitional_processing: bool,
verify_dns_length: bool,
check_hyphens: bool,
}
/// The defaults are that of _beStrict=false_ in the [WHATWG URL Standard](https://url.spec.whatwg.org/#idna)
impl Default for Config {
fn default() -> Self {
Config {
use_std3_ascii_rules: false,
transitional_processing: false,
check_hyphens: false,
// Only use for to_ascii, not to_unicode
verify_dns_length: false,
}
}
}
impl Config {
/// Whether to enforce STD3 or WHATWG URL Standard ASCII deny list.
///
/// `true` for STD3, `false` for no deny list.
///
/// Note that `true` rejects pseudo-hosts used by various TXT record-based protocols.
#[inline]
pub fn use_std3_ascii_rules(mut self, value: bool) -> Self {
self.use_std3_ascii_rules = value;
self
}
/// Whether to enable (deprecated) transitional processing.
///
/// Note that Firefox, Safari, and Chrome do not use transitional
/// processing.
#[inline]
pub fn transitional_processing(mut self, value: bool) -> Self {
self.transitional_processing = value;
self
}
/// Whether the _VerifyDNSLength_ operation should be performed
/// by `to_ascii`.
///
/// For compatibility with previous behavior, even when set to `true`,
/// the trailing root label dot is allowed contrary to the spec.
#[inline]
pub fn verify_dns_length(mut self, value: bool) -> Self {
self.verify_dns_length = value;
self
}
/// Whether to enforce STD3 rules for hyphen placement.
///
/// `true` to deny hyphens in the first and last positions.
/// `false` to not enforce hyphen placement.
///
/// Note that for backward compatibility this is not the same as
/// UTS 46 _CheckHyphens_, which also disallows hyphens in the
/// third and fourth positions.
///
/// Note that `true` rejects real-world names, including some GitHub user pages.
#[inline]
pub fn check_hyphens(mut self, value: bool) -> Self {
self.check_hyphens = value;
self
}
/// Obsolete method retained to ease migration. The argument must be `false`.
///
/// # Panics
///
/// If the argument is `true`.
#[inline]
#[allow(unused_mut)]
pub fn use_idna_2008_rules(mut self, value: bool) -> Self {
assert!(!value, "IDNA 2008 rules are no longer supported");
self
}
/// Compute the deny list
fn deny_list(&self) -> AsciiDenyList {
if self.use_std3_ascii_rules {
AsciiDenyList::STD3
} else {
AsciiDenyList::EMPTY
}
}
/// Compute the hyphen mode
fn hyphens(&self) -> Hyphens {
if self.check_hyphens {
Hyphens::CheckFirstLast
} else {
Hyphens::Allow
}
}
/// [UTS 46 ToASCII](http://www.unicode.org/reports/tr46/#ToASCII)
pub fn to_ascii(self, domain: &str) -> Result<String, Errors> {
let mut result = String::with_capacity(domain.len());
let mut codec = Idna::new(self);
codec.to_ascii(domain, &mut result).map(|()| result)
}
/// [UTS 46 ToUnicode](http://www.unicode.org/reports/tr46/#ToUnicode)
pub fn to_unicode(self, domain: &str) -> (String, Result<(), Errors>) {
let mut codec = Idna::new(self);
let mut out = String::with_capacity(domain.len());
let result = codec.to_unicode(domain, &mut out);
(out, result)
}
}

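For reference, a minimal sketch (not part of the vendored sources) of driving this deprecated builder API; the expected strings match the `emoji_domains` test below.
#![allow(deprecated)]
fn main() {
    // WHATWG defaults: beStrict=false, non-transitional processing.
    let config = idna::Config::default();
    assert_eq!(config.to_ascii("\u{2615}.com").unwrap(), "xn--53h.com"); // ☕.com
    let (unicode, result) = config.to_unicode("xn--53h.com");
    assert!(result.is_ok());
    assert_eq!(unicode, "\u{2615}.com");
}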
118
third_party/rust/idna/src/lib.rs vendored

@ -42,45 +42,127 @@ extern crate alloc;
#[cfg(not(feature = "alloc"))]
compile_error!("the `alloc` feature must be enabled");
#[cfg(test)]
#[macro_use]
extern crate assert_matches;
// Avoid a breaking change if in the future there's a use case for
// having a Bring-Your-Own-ICU4X-Data constructor for `Uts46` and
// not also having compiled data in the binary.
#[cfg(not(feature = "compiled_data"))]
compile_error!("the `compiled_data` feature must be enabled");
use alloc::borrow::Cow;
use alloc::string::String;
pub use uts46::AsciiDenyList;
use uts46::Uts46;
mod deprecated;
pub mod punycode;
mod uts46;
pub mod uts46;
pub use crate::uts46::{Config, Errors, Idna};
#[allow(deprecated)]
pub use crate::deprecated::{Config, Idna};
/// The [domain to ASCII](https://url.spec.whatwg.org/#concept-domain-to-ascii) algorithm.
/// Type indicating that there were errors during UTS #46 processing.
#[derive(Default, Debug)]
#[non_exhaustive]
pub struct Errors {}
impl From<Errors> for Result<(), Errors> {
fn from(e: Errors) -> Result<(), Errors> {
Err(e)
}
}
#[cfg(feature = "std")]
impl std::error::Error for Errors {}
impl core::fmt::Display for Errors {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
core::fmt::Debug::fmt(self, f)
}
}
/// The [domain to ASCII](https://url.spec.whatwg.org/#concept-domain-to-ascii) algorithm;
/// version returning a `Cow`.
///
/// Most applications should be using this function rather than the sibling functions,
/// and most applications should pass [`AsciiDenyList::URL`] as the second argument.
/// Passing [`AsciiDenyList::URL`] as the second argument makes this function also
/// perform the [forbidden domain code point](https://url.spec.whatwg.org/#forbidden-domain-code-point)
/// check in addition to the [domain to ASCII](https://url.spec.whatwg.org/#concept-domain-to-ascii)
/// algorithm.
///
/// Returns the ASCII representation of a domain name,
/// normalizing characters (upper-case to lower-case and other kinds of equivalence)
/// and using Punycode as necessary.
///
/// This process may fail.
///
/// If you have a `&str` instead of `&[u8]`, just call `.as_bytes()` on it before
/// passing it to this function. It's still preferable to use this function over
/// the sibling functions that take `&str`.
pub fn domain_to_ascii_cow(
domain: &[u8],
ascii_deny_list: AsciiDenyList,
) -> Result<Cow<'_, str>, Errors> {
Uts46::new().to_ascii(
domain,
ascii_deny_list,
uts46::Hyphens::Allow,
uts46::DnsLength::Ignore,
)
}
/// The [domain to ASCII](https://url.spec.whatwg.org/#concept-domain-to-ascii) algorithm;
/// version returning `String` and no ASCII deny list (i.e. _UseSTD3ASCIIRules=false_).
///
/// This function exists for backward-compatibility. Consider using [`domain_to_ascii_cow`]
/// instead.
///
/// Return the ASCII representation of a domain name,
/// normalizing characters (upper-case to lower-case and other kinds of equivalence)
/// and using Punycode as necessary.
///
/// This process may fail.
pub fn domain_to_ascii(domain: &str) -> Result<String, uts46::Errors> {
Config::default().to_ascii(domain)
pub fn domain_to_ascii(domain: &str) -> Result<String, Errors> {
domain_to_ascii_cow(domain.as_bytes(), AsciiDenyList::EMPTY).map(|cow| cow.into_owned())
}
/// The [domain to ASCII](https://url.spec.whatwg.org/#concept-domain-to-ascii) algorithm,
/// with the `beStrict` flag set.
pub fn domain_to_ascii_strict(domain: &str) -> Result<String, uts46::Errors> {
Config::default()
.use_std3_ascii_rules(true)
.verify_dns_length(true)
.to_ascii(domain)
///
/// Note that this rejects various real-world names including:
/// * YouTube CDN nodes
/// * Some GitHub user pages
/// * Pseudo-hosts used by various TXT record-based protocols.
pub fn domain_to_ascii_strict(domain: &str) -> Result<String, Errors> {
Uts46::new()
.to_ascii(
domain.as_bytes(),
uts46::AsciiDenyList::STD3,
uts46::Hyphens::Check,
uts46::DnsLength::Verify,
)
.map(|cow| cow.into_owned())
}
/// The [domain to Unicode](https://url.spec.whatwg.org/#concept-domain-to-unicode) algorithm.
/// The [domain to Unicode](https://url.spec.whatwg.org/#concept-domain-to-unicode) algorithm;
/// version returning `String` and no ASCII deny list (i.e. _UseSTD3ASCIIRules=false_).
///
/// This function exists for backward-compatibility. Consider using [`Uts46::to_user_interface`]
/// or [`Uts46::to_unicode`].
///
/// Return the Unicode representation of a domain name,
/// normalizing characters (upper-case to lower-case and other kinds of equivalence)
/// and decoding Punycode as necessary.
///
/// This may indicate [syntax violations](https://url.spec.whatwg.org/#syntax-violation)
/// but always returns a string for the mapped domain.
pub fn domain_to_unicode(domain: &str) -> (String, Result<(), uts46::Errors>) {
Config::default().to_unicode(domain)
/// If the second item of the tuple indicates an error, the first item of the tuple
/// denotes errors using the REPLACEMENT CHARACTERs in order to be able to illustrate
/// errors to the user. When the second item of the return tuple signals an error,
/// the first item of the tuple must not be used in a network protocol.
pub fn domain_to_unicode(domain: &str) -> (String, Result<(), Errors>) {
let (cow, result) = Uts46::new().to_unicode(
domain.as_bytes(),
uts46::AsciiDenyList::EMPTY,
uts46::Hyphens::Allow,
);
(cow.into_owned(), result)
}

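Putting the new top-level functions together, a small sketch (expected strings taken from the vendored tests):
fn main() {
    // Non-transitional processing preserves U+00DF (ß) rather than mapping
    // it to "ss", so "Bloß.de" round-trips through Punycode.
    assert_eq!(idna::domain_to_ascii("Blo\u{df}.de").unwrap(), "xn--blo-7ka.de");
    let (unicode, result) = idna::domain_to_unicode("xn--blo-7ka.de");
    assert!(result.is_ok());
    assert_eq!(unicode, "blo\u{df}.de");
}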
185
third_party/rust/idna/src/make_uts46_mapping_table.py vendored

@ -1,185 +0,0 @@
# Copyright 2013-2014 The rust-url developers.
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.
# Run as: python make_uts46_mapping_table.py IdnaMappingTable.txt > uts46_mapping_table.rs
# You can get the latest idna table from
# http://www.unicode.org/Public/idna/latest/IdnaMappingTable.txt
import collections
import itertools
print('''\
// Copyright 2013-2020 The rust-url developers.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
// Generated by make_idna_table.py
''')
txt = open("IdnaMappingTable.txt")
def escape_char(c):
return "\\u{%x}" % ord(c[0])
def char(s):
return chr(int(s, 16))
strtab = collections.OrderedDict()
strtab_offset = 0
def strtab_slice(s):
global strtab, strtab_offset
if s in strtab:
return strtab[s]
else:
utf8_len = len(s.encode('utf8'))
c = (strtab_offset, utf8_len)
strtab[s] = c
strtab_offset += utf8_len
return c
def rust_slice(s):
start = s[0]
length = s[1]
start_lo = start & 0xff
start_hi = start >> 8
assert length <= 255
assert start_hi <= 255
return "(StringTableSlice { byte_start_lo: %d, byte_start_hi: %d, byte_len: %d })" % (start_lo, start_hi, length)
ranges = []
for line in txt:
# remove comments
line, _, _ = line.partition('#')
# skip empty lines
if len(line.strip()) == 0:
continue
fields = line.split(';')
if fields[0].strip() == 'D800..DFFF':
continue # Surrogates don't occur in Rust strings.
first, _, last = fields[0].strip().partition('..')
if not last:
last = first
mapping = fields[1].strip().replace('_', ' ').title().replace(' ', '')
unicode_str = None
if len(fields) > 2:
if fields[2].strip():
unicode_str = u''.join(char(c) for c in fields[2].strip().split(' '))
elif mapping == "Deviation":
unicode_str = u''
if len(fields) > 3:
assert fields[3].strip() in ('NV8', 'XV8'), fields[3]
assert mapping == 'Valid', mapping
mapping = 'DisallowedIdna2008'
ranges.append((first, last, mapping, unicode_str))
def mergeable_key(r):
mapping = r[2]
# These types have associated data, so we should not merge them.
if mapping in ('Mapped', 'Deviation', 'DisallowedStd3Mapped'):
return r
assert mapping in ('Valid', 'Ignored', 'Disallowed', 'DisallowedStd3Valid', 'DisallowedIdna2008')
return mapping
grouped_ranges = itertools.groupby(ranges, key=mergeable_key)
optimized_ranges = []
for (k, g) in grouped_ranges:
group = list(g)
if len(group) == 1:
optimized_ranges.append(group[0])
continue
# Assert that nothing in the group has an associated unicode string.
for g in group:
if g[3] is not None and len(g[3]) > 2:
assert not g[3][2].strip()
# Assert that consecutive members of the group don't leave gaps in
# the codepoint space.
a, b = itertools.tee(group)
next(b, None)
for (g1, g2) in zip(a, b):
last_char = int(g1[1], 16)
next_char = int(g2[0], 16)
if last_char + 1 == next_char:
continue
# There's a gap where surrogates would appear, but we don't have to
# worry about that gap, as surrogates never appear in Rust strings.
# Assert we're seeing the surrogate case here.
assert last_char == 0xd7ff
assert next_char == 0xe000
optimized_ranges.append((group[0][0], group[-1][1]) + group[0][2:])
def is_single_char_range(r):
(first, last, _, _) = r
return first == last
# We can reduce the size of the character range table and the index table to about 1/4
# by merging runs of single character ranges and using character offsets from the start
# of that range to retrieve the correct `Mapping` value
def merge_single_char_ranges(ranges):
current = []
for r in ranges:
if not current or is_single_char_range(current[-1]) and is_single_char_range(r):
current.append(r)
continue
if len(current) != 0:
ret = current
current = [r]
yield ret
continue
current.append(r)
ret = current
current = []
yield ret
yield current
optimized_ranges = list(merge_single_char_ranges(optimized_ranges))
SINGLE_MARKER = 1 << 15
print("static TABLE: &[(char, u16)] = &[")
offset = 0
for ranges in optimized_ranges:
assert offset < SINGLE_MARKER
block_len = len(ranges)
single = SINGLE_MARKER if block_len == 1 else 0
index = offset | single
offset += block_len
start = escape_char(char(ranges[0][0]))
print(" ('%s', %s)," % (start, index))
print("];\n")
print("static MAPPING_TABLE: &[Mapping] = &[")
for ranges in optimized_ranges:
for (first, last, mapping, unicode_str) in ranges:
if unicode_str is not None:
mapping += rust_slice(strtab_slice(unicode_str))
print(" %s," % mapping)
print("];\n")
def escape_str(s):
return [escape_char(c) for c in s]
print("static STRING_TABLE: &str = \"%s\";"
% '\\\n '.join(itertools.chain(*[escape_str(s) for s in strtab.keys()])))

269
third_party/rust/idna/src/punycode.rs vendored

@ -15,17 +15,17 @@
use alloc::{string::String, vec::Vec};
use core::char;
use core::u32;
use core::fmt::Write;
use core::marker::PhantomData;
// Bootstring parameters for Punycode
static BASE: u32 = 36;
static T_MIN: u32 = 1;
static T_MAX: u32 = 26;
static SKEW: u32 = 38;
static DAMP: u32 = 700;
static INITIAL_BIAS: u32 = 72;
static INITIAL_N: u32 = 0x80;
static DELIMITER: char = '-';
const BASE: u32 = 36;
const T_MIN: u32 = 1;
const T_MAX: u32 = 26;
const SKEW: u32 = 38;
const DAMP: u32 = 700;
const INITIAL_BIAS: u32 = 72;
const INITIAL_N: u32 = 0x80;
#[inline]
fn adapt(mut delta: u32, num_points: u32, first_time: bool) -> u32 {
@ -41,10 +41,17 @@ fn adapt(mut delta: u32, num_points: u32, first_time: bool) -> u32 {
/// Convert Punycode to a Unicode `String`.
///
/// This is a convenience wrapper around `decode`.
/// Return None on malformed input or overflow.
/// Overflow can only happen on inputs that take more than
/// 63 encoded bytes, the DNS limit on domain name labels.
#[inline]
pub fn decode_to_string(input: &str) -> Option<String> {
decode(input).map(|chars| chars.into_iter().collect())
Some(
Decoder::default()
.decode::<u8, ExternalCaller>(input.as_bytes())
.ok()?
.collect(),
)
}
/// Convert Punycode to Unicode.
@ -53,33 +60,130 @@ pub fn decode_to_string(input: &str) -> Option<String> {
/// Overflow can only happen on inputs that take more than
/// 63 encoded bytes, the DNS limit on domain name labels.
pub fn decode(input: &str) -> Option<Vec<char>> {
Some(Decoder::default().decode(input).ok()?.collect())
Some(
Decoder::default()
.decode::<u8, ExternalCaller>(input.as_bytes())
.ok()?
.collect(),
)
}
/// Marker for internal vs. external caller to retain old API behavior
/// while tweaking behavior for internal callers.
///
/// External callers need overflow checks when encoding, but internal
/// callers don't, because `PUNYCODE_ENCODE_MAX_INPUT_LENGTH` is set
/// to 1000, and per RFC 3492 section 6.4, the integer variable does
/// not need to be able to represent values larger than
/// (char::MAX - INITIAL_N) * (PUNYCODE_ENCODE_MAX_INPUT_LENGTH + 1),
/// which is less than u32::MAX.
///
/// External callers need to handle upper-case ASCII when decoding,
/// but internal callers don't, because the internal code calls the
/// decoder only with lower-case inputs.
pub(crate) trait PunycodeCaller {
const EXTERNAL_CALLER: bool;
}
pub(crate) struct InternalCaller;
impl PunycodeCaller for InternalCaller {
const EXTERNAL_CALLER: bool = false;
}
struct ExternalCaller;
impl PunycodeCaller for ExternalCaller {
const EXTERNAL_CALLER: bool = true;
}
pub(crate) trait PunycodeCodeUnit {
fn is_delimiter(&self) -> bool;
fn is_ascii(&self) -> bool;
fn digit(&self) -> Option<u32>;
fn char(&self) -> char;
fn char_ascii_lower_case(&self) -> char;
}
impl PunycodeCodeUnit for u8 {
fn is_delimiter(&self) -> bool {
*self == b'-'
}
fn is_ascii(&self) -> bool {
*self < 0x80
}
fn digit(&self) -> Option<u32> {
let byte = *self;
Some(match byte {
byte @ b'0'..=b'9' => byte - b'0' + 26,
byte @ b'A'..=b'Z' => byte - b'A',
byte @ b'a'..=b'z' => byte - b'a',
_ => return None,
} as u32)
}
fn char(&self) -> char {
char::from(*self)
}
fn char_ascii_lower_case(&self) -> char {
char::from(self.to_ascii_lowercase())
}
}
impl PunycodeCodeUnit for char {
fn is_delimiter(&self) -> bool {
*self == '-'
}
fn is_ascii(&self) -> bool {
debug_assert!(false); // Unused
true
}
fn digit(&self) -> Option<u32> {
let byte = *self;
Some(match byte {
byte @ '0'..='9' => u32::from(byte) - u32::from('0') + 26,
// byte @ 'A'..='Z' => u32::from(byte) - u32::from('A'), // XXX not needed if no public input
byte @ 'a'..='z' => u32::from(byte) - u32::from('a'),
_ => return None,
})
}
fn char(&self) -> char {
debug_assert!(false); // Unused
*self
}
fn char_ascii_lower_case(&self) -> char {
// No need to actually lower-case!
*self
}
}
#[derive(Default)]
pub(crate) struct Decoder {
insertions: Vec<(usize, char)>,
insertions: smallvec::SmallVec<[(usize, char); 59]>,
}
impl Decoder {
/// Split the input iterator and return a Vec with insertions of encoded characters
pub(crate) fn decode<'a>(&'a mut self, input: &'a str) -> Result<Decode<'a>, ()> {
pub(crate) fn decode<'a, T: PunycodeCodeUnit + Copy, C: PunycodeCaller>(
&'a mut self,
input: &'a [T],
) -> Result<Decode<'a, T, C>, ()> {
self.insertions.clear();
// Handle "basic" (ASCII) code points.
// They are encoded as-is before the last delimiter, if any.
let (base, input) = match input.rfind(DELIMITER) {
None => ("", input),
Some(position) => (
let (base, input) = if let Some(position) = input.iter().rposition(|c| c.is_delimiter()) {
(
&input[..position],
if position > 0 {
&input[position + 1..]
} else {
input
},
),
)
} else {
(&input[..0], input)
};
if !base.is_ascii() {
if C::EXTERNAL_CALLER && !base.iter().all(|c| c.is_ascii()) {
return Err(());
}
@ -87,8 +191,8 @@ impl Decoder {
let mut length = base_len as u32;
let mut code_point = INITIAL_N;
let mut bias = INITIAL_BIAS;
let mut i = 0;
let mut iter = input.bytes();
let mut i = 0u32;
let mut iter = input.iter();
loop {
let previous_i = i;
let mut weight = 1;
@ -101,16 +205,13 @@ impl Decoder {
// Decode a generalized variable-length integer into delta,
// which gets added to i.
loop {
let digit = match byte {
byte @ b'0'..=b'9' => byte - b'0' + 26,
byte @ b'A'..=b'Z' => byte - b'A',
byte @ b'a'..=b'z' => byte - b'a',
_ => return Err(()),
} as u32;
if digit > (u32::MAX - i) / weight {
return Err(()); // Overflow
}
i += digit * weight;
let digit = if let Some(digit) = byte.digit() {
digit
} else {
return Err(());
};
let product = digit.checked_mul(weight).ok_or(())?;
i = i.checked_add(product).ok_or(())?;
let t = if k <= bias {
T_MIN
} else if k >= bias + T_MAX {
@ -121,10 +222,7 @@ impl Decoder {
if digit < t {
break;
}
if weight > u32::MAX / (BASE - t) {
return Err(()); // Overflow
}
weight *= BASE - t;
weight = weight.checked_mul(BASE - t).ok_or(())?;
k += BASE;
byte = match iter.next() {
None => return Err(()), // End of input before the end of this delta
@ -133,13 +231,10 @@ impl Decoder {
}
bias = adapt(i - previous_i, length + 1, previous_i == 0);
if i / (length + 1) > u32::MAX - code_point {
return Err(()); // Overflow
}
// i was supposed to wrap around from length+1 to 0,
// incrementing code_point each time.
code_point += i / (length + 1);
code_point = code_point.checked_add(i / (length + 1)).ok_or(())?;
i %= length + 1;
let c = match char::from_u32(code_point) {
Some(c) => c,
@ -159,24 +254,30 @@ impl Decoder {
self.insertions.sort_by_key(|(i, _)| *i);
Ok(Decode {
base: base.chars(),
base: base.iter(),
insertions: &self.insertions,
inserted: 0,
position: 0,
len: base_len + self.insertions.len(),
phantom: PhantomData::<C>,
})
}
}
pub(crate) struct Decode<'a> {
base: core::str::Chars<'a>,
pub(crate) struct Decode<'a, T, C>
where
T: PunycodeCodeUnit + Copy,
C: PunycodeCaller,
{
base: core::slice::Iter<'a, T>,
pub(crate) insertions: &'a [(usize, char)],
inserted: usize,
position: usize,
len: usize,
phantom: PhantomData<C>,
}
impl<'a> Iterator for Decode<'a> {
impl<'a, T: PunycodeCodeUnit + Copy, C: PunycodeCaller> Iterator for Decode<'a, T, C> {
type Item = char;
fn next(&mut self) -> Option<Self::Item> {
@ -191,7 +292,11 @@ impl<'a> Iterator for Decode<'a> {
}
if let Some(c) = self.base.next() {
self.position += 1;
return Some(c);
return Some(if C::EXTERNAL_CALLER {
c.char()
} else {
c.char_ascii_lower_case()
});
} else if self.inserted >= self.insertions.len() {
return None;
}
@ -204,7 +309,7 @@ impl<'a> Iterator for Decode<'a> {
}
}
impl<'a> ExactSizeIterator for Decode<'a> {
impl<'a, T: PunycodeCodeUnit + Copy, C: PunycodeCaller> ExactSizeIterator for Decode<'a, T, C> {
fn len(&self) -> usize {
self.len - self.position
}
@ -219,7 +324,9 @@ pub fn encode_str(input: &str) -> Option<String> {
return None;
}
let mut buf = String::with_capacity(input.len());
encode_into(input.chars(), &mut buf).ok().map(|()| buf)
encode_into::<_, _, ExternalCaller>(input.chars(), &mut buf)
.ok()
.map(|()| buf)
}
/// Convert Unicode to Punycode.
@ -231,30 +338,58 @@ pub fn encode(input: &[char]) -> Option<String> {
return None;
}
let mut buf = String::with_capacity(input.len());
encode_into(input.iter().copied(), &mut buf)
encode_into::<_, _, ExternalCaller>(input.iter().copied(), &mut buf)
.ok()
.map(|()| buf)
}
pub(crate) fn encode_into<I>(input: I, output: &mut String) -> Result<(), ()>
pub(crate) enum PunycodeEncodeError {
Overflow,
Sink,
}
impl From<core::fmt::Error> for PunycodeEncodeError {
fn from(_: core::fmt::Error) -> Self {
PunycodeEncodeError::Sink
}
}
pub(crate) fn encode_into<I, W, C>(input: I, output: &mut W) -> Result<(), PunycodeEncodeError>
where
I: Iterator<Item = char> + Clone,
W: Write + ?Sized,
C: PunycodeCaller,
{
// Handle "basic" (ASCII) code points. They are encoded as-is.
let (mut input_length, mut basic_length) = (0u32, 0);
for c in input.clone() {
input_length = input_length.checked_add(1).ok_or(())?;
input_length = input_length
.checked_add(1)
.ok_or(PunycodeEncodeError::Overflow)?;
if c.is_ascii() {
output.push(c);
output.write_char(c)?;
basic_length += 1;
}
}
if !C::EXTERNAL_CALLER {
// We should never get an overflow here with the internal caller being
// length-limited, but let's check anyway once here trusting the math
// from RFC 3492 section 6.4 and then omit the overflow checks in the
// loop below.
let len_plus_one = input_length
.checked_add(1)
.ok_or(PunycodeEncodeError::Overflow)?;
len_plus_one
.checked_mul(u32::from(char::MAX) - INITIAL_N)
.ok_or(PunycodeEncodeError::Overflow)?;
}
if basic_length > 0 {
output.push('-')
output.write_char('-')?;
}
let mut code_point = INITIAL_N;
let mut delta = 0;
let mut delta = 0u32;
let mut bias = INITIAL_BIAS;
let mut processed = basic_length;
while processed < input_length {
@ -266,16 +401,26 @@ where
.filter(|&c| c >= code_point)
.min()
.unwrap();
if min_code_point - code_point > (u32::MAX - delta) / (processed + 1) {
return Err(()); // Overflow
}
// Increase delta to advance the decoders <code_point,i> state to <min_code_point,0>
delta += (min_code_point - code_point) * (processed + 1);
if C::EXTERNAL_CALLER {
let product = (min_code_point - code_point)
.checked_mul(processed + 1)
.ok_or(PunycodeEncodeError::Overflow)?;
delta = delta
.checked_add(product)
.ok_or(PunycodeEncodeError::Overflow)?;
} else {
delta += (min_code_point - code_point) * (processed + 1);
}
code_point = min_code_point;
for c in input.clone() {
let c = c as u32;
if c < code_point {
delta = delta.checked_add(1).ok_or(())?;
if C::EXTERNAL_CALLER {
delta = delta.checked_add(1).ok_or(PunycodeEncodeError::Overflow)?;
} else {
delta += 1;
}
}
if c == code_point {
// Represent delta as a generalized variable-length integer:
@ -293,11 +438,11 @@ where
break;
}
let value = t + ((q - t) % (BASE - t));
output.push(value_to_digit(value));
output.write_char(value_to_digit(value))?;
q = (q - t) / (BASE - t);
k += BASE;
}
output.push(value_to_digit(q));
output.write_char(value_to_digit(q))?;
bias = adapt(delta, processed + 1, processed == basic_length);
delta = 0;
processed += 1;
@ -323,6 +468,10 @@ fn value_to_digit(value: u32) -> char {
#[cfg(target_pointer_width = "64")]
fn huge_encode() {
let mut buf = String::new();
assert!(encode_into(std::iter::repeat('ß').take(u32::MAX as usize + 1), &mut buf).is_err());
assert!(encode_into::<_, _, ExternalCaller>(
std::iter::repeat('ß').take(u32::MAX as usize + 1),
&mut buf
)
.is_err());
assert_eq!(buf.len(), 0);
}

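The public wrappers above keep their old shapes while routing through the new generic decoder; a round-trip sketch with the well-known "mañana" label (the xn-- prefix belongs to the IDNA layer, not to raw Punycode):
use idna::punycode::{decode_to_string, encode_str};
fn main() {
    assert_eq!(encode_str("ma\u{f1}ana").as_deref(), Some("maana-pta"));
    assert_eq!(decode_to_string("maana-pta").as_deref(), Some("ma\u{f1}ana"));
}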
2403
third_party/rust/idna/src/uts46.rs vendored

Diff not shown due to its large size.

15256
third_party/rust/idna/src/uts46_mapping_table.rs vendored

Diff not shown due to its large size.

11032
third_party/rust/idna/tests/IdnaTestV2.txt vendored

Diff not shown due to its large size.

197
third_party/rust/idna/tests/deprecated.rs vendored Normal file

@ -0,0 +1,197 @@
// Copyright 2013-2014 The rust-url developers.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
#![allow(clippy::assigning_clones)]
#![allow(deprecated)]
use crate::test::TestFn;
use std::char;
use std::fmt::Write;
use idna::Errors;
pub fn collect_tests<F: FnMut(String, TestFn)>(add_test: &mut F) {
// https://www.unicode.org/Public/idna/13.0.0/IdnaTestV2.txt
for (i, line) in include_str!("IdnaTestV2.txt").lines().enumerate() {
if line.is_empty() || line.starts_with('#') {
continue;
}
// Remove comments
let line = match line.find('#') {
Some(index) => &line[0..index],
None => line,
};
let mut pieces = line.split(';').map(|x| x.trim()).collect::<Vec<&str>>();
let source = unescape(pieces.remove(0));
// ToUnicode
let mut to_unicode = unescape(pieces.remove(0));
if to_unicode.is_empty() {
to_unicode = source.clone();
}
let to_unicode_status = status(pieces.remove(0));
// ToAsciiN
let to_ascii_n = pieces.remove(0);
let to_ascii_n = if to_ascii_n.is_empty() {
to_unicode.clone()
} else {
to_ascii_n.to_owned()
};
let to_ascii_n_status = pieces.remove(0);
let to_ascii_n_status = if to_ascii_n_status.is_empty() {
to_unicode_status.clone()
} else {
status(to_ascii_n_status)
};
// ToAsciiT
let to_ascii_t = pieces.remove(0);
let to_ascii_t = if to_ascii_t.is_empty() {
to_ascii_n.clone()
} else {
to_ascii_t.to_owned()
};
let to_ascii_t_status = pieces.remove(0);
let to_ascii_t_status = if to_ascii_t_status.is_empty() {
to_ascii_n_status.clone()
} else {
status(to_ascii_t_status)
};
let test_name = format!("UTS #46 (deprecated API) line {}", i + 1);
add_test(
test_name,
TestFn::DynTestFn(Box::new(move || {
let config = idna::Config::default()
.use_std3_ascii_rules(true)
.verify_dns_length(true)
.check_hyphens(true);
// http://unicode.org/reports/tr46/#Deviations
// applications that perform IDNA2008 lookup are not required to check
// for these contexts, so we skip all tests annotated with C*
// Everybody ignores V2
// https://github.com/servo/rust-url/pull/240
// https://github.com/whatwg/url/issues/53#issuecomment-181528158
// http://www.unicode.org/review/pri317/
// "The special error codes X3 and X4_2 are now returned where a toASCII error code
// was formerly being generated in toUnicode due to an empty label."
// This is not implemented yet, so we skip toUnicode X4_2 tests for now, too.
let (to_unicode_value, to_unicode_result) =
config.transitional_processing(false).to_unicode(&source);
let to_unicode_result = to_unicode_result.map(|()| to_unicode_value);
check(
&source,
(&to_unicode, &to_unicode_status),
to_unicode_result,
|e| e == "X4_2" || e == "V2",
);
let to_ascii_n_result = config.transitional_processing(false).to_ascii(&source);
check(
&source,
(&to_ascii_n, &to_ascii_n_status),
to_ascii_n_result,
|e| e == "V2",
);
let to_ascii_t_result = config.transitional_processing(true).to_ascii(&source);
check(
&source,
(&to_ascii_t, &to_ascii_t_status),
to_ascii_t_result,
|e| e == "V2",
);
})),
)
}
}
#[allow(clippy::redundant_clone)]
fn check<F>(source: &str, expected: (&str, &[&str]), actual: Result<String, Errors>, ignore: F)
where
F: Fn(&str) -> bool,
{
if !expected.1.is_empty() {
if !expected.1.iter().copied().any(ignore) {
let res = actual.ok();
assert_eq!(
res.clone(),
None,
"Expected error {:?}. result: {} | source: {}",
expected.1,
res.unwrap(),
source,
);
}
} else {
assert!(
actual.is_ok(),
"Couldn't parse {} | error: {:?}",
source,
actual.err().unwrap(),
);
assert_eq!(actual.unwrap(), expected.0, "source: {}", source);
}
}
fn unescape(input: &str) -> String {
let mut output = String::new();
let mut chars = input.chars();
loop {
match chars.next() {
None => return output,
Some(c) => {
if c == '\\' {
match chars.next().unwrap() {
'\\' => output.push('\\'),
'u' => {
let c1 = chars.next().unwrap().to_digit(16).unwrap();
let c2 = chars.next().unwrap().to_digit(16).unwrap();
let c3 = chars.next().unwrap().to_digit(16).unwrap();
let c4 = chars.next().unwrap().to_digit(16).unwrap();
match char::from_u32(((c1 * 16 + c2) * 16 + c3) * 16 + c4) {
Some(c) => output.push(c),
None => {
write!(&mut output, "\\u{:X}{:X}{:X}{:X}", c1, c2, c3, c4)
.expect("Could not write to output");
}
};
}
_ => panic!("Invalid test data input"),
}
} else {
output.push(c);
}
}
}
}
}
fn status(status: &str) -> Vec<&str> {
if status.is_empty() || status == "[]" {
return Vec::new();
}
let mut result = status.split(", ").collect::<Vec<_>>();
assert!(result[0].starts_with('['));
result[0] = &result[0][1..];
let idx = result.len() - 1;
let last = &mut result[idx];
assert!(last.ends_with(']'));
*last = &last[..last.len() - 1];
result
}

13
third_party/rust/idna/tests/punycode.rs vendored

@ -7,7 +7,7 @@
// except according to those terms.
use crate::test::TestFn;
use idna::punycode::{decode, encode_str};
use idna::punycode::{decode, decode_to_string, encode_str};
use serde_json::map::Map;
use serde_json::Value;
use std::panic::catch_unwind;
@ -28,6 +28,17 @@ fn one_test(decoded: &str, encoded: &str) {
}
}
match decode_to_string(encoded) {
None => panic!("Decoding {} failed.", encoded),
Some(result) => assert!(
result == decoded,
"Incorrect decoding of \"{}\":\n \"{}\"\n!= \"{}\"\n",
encoded,
result,
decoded
),
}
match encode_str(decoded) {
None => panic!("Encoding {} failed.", decoded),
Some(result) => assert!(

2
third_party/rust/idna/tests/punycode_tests.json vendored

@ -1,6 +1,6 @@
[
{
"description": "These tests are copied from https://github.com/bestiejs/punycode.js/blob/master/tests/tests.js , used under the MIT license.",
"description": "These tests are copied from https://github.com/mathiasbynens/punycode.js/blob/main/tests/tests.js , used under the MIT license.",
"decoded": "",
"encoded": ""
},

2
third_party/rust/idna/tests/tests.rs vendored

@ -1,5 +1,6 @@
use tester as test;
mod deprecated;
mod punycode;
mod uts46;
@ -19,6 +20,7 @@ fn main() {
})
};
punycode::collect_tests(&mut add_test);
deprecated::collect_tests(&mut add_test);
uts46::collect_tests(&mut add_test);
}
test::test_main(&std::env::args().collect::<Vec<_>>(), tests, None)

41
third_party/rust/idna/tests/unit.rs vendored

@ -1,5 +1,6 @@
#![allow(deprecated)]
use assert_matches::assert_matches;
use unicode_normalization::char::is_combining_mark;
/// https://github.com/servo/rust-url/issues/373
#[test]
@ -28,15 +29,21 @@ fn test_punycode_prefix_without_length_check() {
.check_hyphens(true)
.use_std3_ascii_rules(true);
assert_eq!(config.to_ascii("xn--").unwrap(), "");
assert!(config.to_ascii("xn--").is_err());
assert!(config.to_ascii("xn---").is_err());
assert!(config.to_ascii("xn-----").is_err());
assert_eq!(config.to_ascii("xn--.").unwrap(), ".");
assert_eq!(config.to_ascii("xn--...").unwrap(), "...");
assert_eq!(config.to_ascii(".xn--").unwrap(), ".");
assert_eq!(config.to_ascii("...xn--").unwrap(), "...");
assert_eq!(config.to_ascii("xn--.xn--").unwrap(), ".");
assert_eq!(config.to_ascii("xn--.example.org").unwrap(), ".example.org");
assert!(config.to_ascii("xn--.").is_err());
assert!(config.to_ascii("xn--...").is_err());
assert!(config.to_ascii(".xn--").is_err());
assert!(config.to_ascii("...xn--").is_err());
assert!(config.to_ascii("xn--.xn--").is_err());
assert!(config.to_ascii("xn--.example.org").is_err());
}
#[test]
fn test_punycode_invalid_encoding() {
let config = idna::Config::default();
assert!(config.to_ascii("xn--55555577").is_err());
}
// http://www.unicode.org/reports/tr46/#Table_Example_Processing
@ -85,10 +92,10 @@ fn test_examples() {
fn test_v5() {
let config = idna::Config::default()
.verify_dns_length(true)
.use_std3_ascii_rules(true);
.use_std3_ascii_rules(true)
.check_hyphens(true);
// IdnaTest:784 蔏。𑰺
assert!(is_combining_mark('\u{11C3A}'));
assert!(config.to_ascii("\u{11C3A}").is_err());
assert!(config.to_ascii("\u{850f}.\u{11C3A}").is_err());
assert!(config.to_ascii("\u{850f}\u{ff61}\u{11C3A}").is_err());
@ -98,7 +105,8 @@ fn test_v5() {
fn test_v8_bidi_rules() {
let config = idna::Config::default()
.verify_dns_length(true)
.use_std3_ascii_rules(true);
.use_std3_ascii_rules(true)
.check_hyphens(true);
assert_eq!(config.to_ascii("abc").unwrap(), "abc");
assert_eq!(config.to_ascii("123").unwrap(), "123");
@ -118,18 +126,11 @@ fn test_v8_bidi_rules() {
#[test]
fn emoji_domains() {
// HOT BEVERAGE is allowed here...
let config = idna::Config::default()
.verify_dns_length(true)
.use_std3_ascii_rules(true);
assert_eq!(config.to_ascii("☕.com").unwrap(), "xn--53h.com");
// ... but not here
let config = idna::Config::default()
.verify_dns_length(true)
.use_std3_ascii_rules(true)
.use_idna_2008_rules(true);
let error = format!("{:?}", config.to_ascii("☕.com").unwrap_err());
assert!(error.contains("disallowed_in_idna_2008"));
.check_hyphens(true);
assert_eq!(config.to_ascii("☕.com").unwrap(), "xn--53h.com");
}
#[test]

374
third_party/rust/idna/tests/unitbis.rs vendored Normal file

@ -0,0 +1,374 @@
use idna::uts46::AsciiDenyList;
use idna::uts46::DnsLength;
use idna::uts46::Hyphens;
/// https://github.com/servo/rust-url/issues/373
#[test]
fn test_punycode_prefix_with_length_check() {
let config = idna::uts46::Uts46::new();
assert!(config
.to_ascii(
b"xn--",
AsciiDenyList::STD3,
Hyphens::Check,
DnsLength::Verify
)
.is_err());
assert!(config
.to_ascii(
b"xn---",
AsciiDenyList::STD3,
Hyphens::Check,
DnsLength::Verify
)
.is_err());
assert!(config
.to_ascii(
b"xn-----",
AsciiDenyList::STD3,
Hyphens::Check,
DnsLength::Verify,
)
.is_err());
assert!(config
.to_ascii(
b"xn--.",
AsciiDenyList::STD3,
Hyphens::Check,
DnsLength::Verify
)
.is_err());
assert!(config
.to_ascii(
b"xn--...",
AsciiDenyList::STD3,
Hyphens::Check,
DnsLength::Verify,
)
.is_err());
assert!(config
.to_ascii(
b".xn--",
AsciiDenyList::STD3,
Hyphens::Check,
DnsLength::Verify,
)
.is_err());
assert!(config
.to_ascii(
b"...xn--",
AsciiDenyList::STD3,
Hyphens::Check,
DnsLength::Verify,
)
.is_err());
assert!(config
.to_ascii(
b"xn--.xn--",
AsciiDenyList::STD3,
Hyphens::Check,
DnsLength::Verify,
)
.is_err());
assert!(config
.to_ascii(
b"xn--.example.org",
AsciiDenyList::STD3,
Hyphens::Check,
DnsLength::Verify,
)
.is_err());
}
/// https://github.com/servo/rust-url/issues/373
#[test]
fn test_punycode_prefix_without_length_check() {
let config = idna::uts46::Uts46::new();
assert!(config
.to_ascii(
b"xn--",
AsciiDenyList::URL,
Hyphens::Allow,
DnsLength::Ignore
)
.is_err());
assert!(config
.to_ascii(
b"xn---",
AsciiDenyList::URL,
Hyphens::Allow,
DnsLength::Ignore
)
.is_err());
assert!(config
.to_ascii(
b"xn-----",
AsciiDenyList::URL,
Hyphens::Allow,
DnsLength::Ignore
)
.is_err());
assert!(config
.to_ascii(
b"xn--.",
AsciiDenyList::URL,
Hyphens::Allow,
DnsLength::Ignore
)
.is_err());
assert!(config
.to_ascii(
b"xn--...",
AsciiDenyList::URL,
Hyphens::Allow,
DnsLength::Ignore
)
.is_err());
assert!(config
.to_ascii(
b".xn--",
AsciiDenyList::URL,
Hyphens::Allow,
DnsLength::Ignore
)
.is_err());
assert!(config
.to_ascii(
b"...xn--",
AsciiDenyList::URL,
Hyphens::Allow,
DnsLength::Ignore
)
.is_err());
assert!(config
.to_ascii(
b"xn--.xn--",
AsciiDenyList::URL,
Hyphens::Allow,
DnsLength::Ignore
)
.is_err());
assert!(config
.to_ascii(
b"xn--.example.org",
AsciiDenyList::URL,
Hyphens::Allow,
DnsLength::Ignore
)
.is_err());
}
/*
// http://www.unicode.org/reports/tr46/#Table_Example_Processing
#[test]
fn test_examples() {
let codec = idna::uts46bis::Uts46::new();
let mut out = String::new();
assert_matches!(codec.to_unicode("Bloß.de", &mut out), Ok(()));
assert_eq!(out, "bloß.de");
out.clear();
assert_matches!(codec.to_unicode("xn--blo-7ka.de", &mut out), Ok(()));
assert_eq!(out, "bloß.de");
out.clear();
assert_matches!(codec.to_unicode("u\u{308}.com", &mut out), Ok(()));
assert_eq!(out, "ü.com");
out.clear();
assert_matches!(codec.to_unicode("xn--tda.com", &mut out), Ok(()));
assert_eq!(out, "ü.com");
out.clear();
assert_matches!(codec.to_unicode("xn--u-ccb.com", &mut out), Err(_));
out.clear();
assert_matches!(codec.to_unicode("a⒈com", &mut out), Err(_));
out.clear();
assert_matches!(codec.to_unicode("xn--a-ecp.ru", &mut out), Err(_));
out.clear();
assert_matches!(codec.to_unicode("xn--0.pt", &mut out), Err(_));
out.clear();
assert_matches!(codec.to_unicode("日本語。JP", &mut out), Ok(()));
assert_eq!(out, "日本語.jp");
out.clear();
assert_matches!(codec.to_unicode("☕.us", &mut out), Ok(()));
assert_eq!(out, "☕.us");
}
*/
#[test]
fn test_v5() {
let config = idna::uts46::Uts46::new();
// IdnaTest:784 蔏。𑰺
assert!(config
.to_ascii(
"\u{11C3A}".as_bytes(),
AsciiDenyList::STD3,
Hyphens::Check,
DnsLength::Verify,
)
.is_err());
assert!(config
.to_ascii(
"\u{850f}.\u{11C3A}".as_bytes(),
AsciiDenyList::STD3,
Hyphens::Check,
DnsLength::Verify,
)
.is_err());
assert!(config
.to_ascii(
"\u{850f}\u{ff61}\u{11C3A}".as_bytes(),
AsciiDenyList::STD3,
Hyphens::Check,
DnsLength::Verify,
)
.is_err());
}
#[test]
fn test_v8_bidi_rules() {
let config = idna::uts46::Uts46::new();
assert_eq!(
config
.to_ascii(
b"abc",
AsciiDenyList::STD3,
Hyphens::Check,
DnsLength::Verify,
)
.unwrap(),
"abc"
);
assert_eq!(
config
.to_ascii(
b"123",
AsciiDenyList::STD3,
Hyphens::Check,
DnsLength::Verify,
)
.unwrap(),
"123"
);
assert_eq!(
config
.to_ascii(
"אבּג".as_bytes(),
AsciiDenyList::STD3,
Hyphens::Check,
DnsLength::Verify,
)
.unwrap(),
"xn--kdb3bdf"
);
assert_eq!(
config
.to_ascii(
"ابج".as_bytes(),
AsciiDenyList::STD3,
Hyphens::Check,
DnsLength::Verify,
)
.unwrap(),
"xn--mgbcm"
);
assert_eq!(
config
.to_ascii(
"abc.ابج".as_bytes(),
AsciiDenyList::STD3,
Hyphens::Check,
DnsLength::Verify,
)
.unwrap(),
"abc.xn--mgbcm"
);
assert_eq!(
config
.to_ascii(
"אבּג.ابج".as_bytes(),
AsciiDenyList::STD3,
Hyphens::Check,
DnsLength::Verify,
)
.unwrap(),
"xn--kdb3bdf.xn--mgbcm"
);
// Bidi domain names cannot start with digits
assert!(config
.to_ascii(
"0a.\u{05D0}".as_bytes(),
AsciiDenyList::STD3,
Hyphens::Check,
DnsLength::Verify,
)
.is_err());
assert!(config
.to_ascii(
"0à.\u{05D0}".as_bytes(),
AsciiDenyList::STD3,
Hyphens::Check,
DnsLength::Verify,
)
.is_err());
// Bidi chars may be punycode-encoded
assert!(config
.to_ascii(
b"xn--0ca24w",
AsciiDenyList::STD3,
Hyphens::Check,
DnsLength::Verify,
)
.is_err());
}
#[test]
fn emoji_domains() {
// HOT BEVERAGE is allowed here...
let config = idna::uts46::Uts46::new();
assert_eq!(
config
.to_ascii(
"☕.com".as_bytes(),
AsciiDenyList::STD3,
Hyphens::Check,
DnsLength::Verify,
)
.unwrap(),
"xn--53h.com"
);
}
#[test]
fn unicode_before_delimiter() {
let config = idna::uts46::Uts46::new();
assert!(config
.to_ascii(
"xn--f\u{34a}-PTP".as_bytes(),
AsciiDenyList::STD3,
Hyphens::Check,
DnsLength::Verify,
)
.is_err());
}
#[test]
fn upper_case_ascii_in_punycode() {
let config = idna::uts46::Uts46::new();
let (unicode, result) =
config.to_unicode("xn--A-1ga".as_bytes(), AsciiDenyList::STD3, Hyphens::Check);
assert!(result.is_ok());
assert_eq!(&unicode, "");
}
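
Taken together, the tests above pin down the shape of the new idna 1.0 `uts46` API: `to_ascii` takes raw bytes plus three policy parameters, and `to_unicode` always yields best-effort output alongside a `Result`. A minimal sketch of the same calls outside the test harness (a sketch only; the `Cow` return types are inferred from the assertions above):

```rust
use idna::uts46::{AsciiDenyList, DnsLength, Hyphens, Uts46};

fn main() {
    let uts46 = Uts46::new();
    // ToASCII with the policies used throughout the tests above.
    let ascii = uts46
        .to_ascii(
            "☕.com".as_bytes(),
            AsciiDenyList::STD3,
            Hyphens::Check,
            DnsLength::Verify,
        )
        .unwrap();
    assert_eq!(ascii, "xn--53h.com");

    // ToUnicode returns output plus a Result flagging any errors.
    let (unicode, result) =
        uts46.to_unicode("xn--53h.com".as_bytes(), AsciiDenyList::STD3, Hyphens::Check);
    assert!(result.is_ok());
    assert_eq!(unicode, "☕.com");
}
```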

third_party/rust/idna/tests/uts46.rs (vendored; 101 changed lines)
@ -6,10 +6,16 @@
// option. This file may not be copied, modified, or distributed
// except according to those terms.
#![allow(clippy::assigning_clones)]
use crate::test::TestFn;
use std::char;
use std::fmt::Write;
use idna::uts46::verify_dns_length;
use idna::uts46::ProcessingError;
use idna::uts46::ProcessingSuccess;
use idna::uts46::{AsciiDenyList, DnsLength, ErrorPolicy, Hyphens};
use idna::Errors;
pub fn collect_tests<F: FnMut(String, TestFn)>(add_test: &mut F) {
@ -49,28 +55,11 @@ pub fn collect_tests<F: FnMut(String, TestFn)>(add_test: &mut F) {
status(to_ascii_n_status)
};
// ToAsciiT
let to_ascii_t = pieces.remove(0);
let to_ascii_t = if to_ascii_t.is_empty() {
to_ascii_n.clone()
} else {
to_ascii_t.to_owned()
};
let to_ascii_t_status = pieces.remove(0);
let to_ascii_t_status = if to_ascii_t_status.is_empty() {
to_ascii_n_status.clone()
} else {
status(to_ascii_t_status)
};
let test_name = format!("UTS #46 line {}", i + 1);
add_test(
test_name,
TestFn::DynTestFn(Box::new(move || {
let config = idna::Config::default()
.use_std3_ascii_rules(true)
.verify_dns_length(true)
.check_hyphens(true);
let config = idna::uts46::Uts46::new();
// http://unicode.org/reports/tr46/#Deviations
// applications that perform IDNA2008 lookup are not required to check
@ -86,29 +75,85 @@ pub fn collect_tests<F: FnMut(String, TestFn)>(add_test: &mut F) {
// This is not implemented yet, so we skip toUnicode X4_2 tests for now, too.
let (to_unicode_value, to_unicode_result) =
config.transitional_processing(false).to_unicode(&source);
let to_unicode_result = to_unicode_result.map(|()| to_unicode_value);
config.to_unicode(source.as_bytes(), AsciiDenyList::STD3, Hyphens::Check);
let to_unicode_result = to_unicode_result.map(|()| to_unicode_value.into_owned());
check(
&source,
(&to_unicode, &to_unicode_status),
to_unicode_result,
|e| e.starts_with('C') || e == "V2" || e == "X4_2",
|e| e == "X4_2",
);
let to_ascii_n_result = config.transitional_processing(false).to_ascii(&source);
let to_ascii_n_result = config.to_ascii(
source.as_bytes(),
AsciiDenyList::STD3,
Hyphens::Check,
DnsLength::VerifyAllowRootDot,
);
check(
&source,
(&to_ascii_n, &to_ascii_n_status),
to_ascii_n_result,
|e| e.starts_with('C') || e == "V2",
to_ascii_n_result.map(|cow| cow.into_owned()),
|_| false,
);
let to_ascii_t_result = config.transitional_processing(true).to_ascii(&source);
let mut to_unicode_simultaneous = String::new();
let mut to_ascii_simultaneous = String::new();
let (to_unicode_simultaneous_result, to_ascii_simultaneous_result) = match config
.process(
source.as_bytes(),
AsciiDenyList::STD3,
Hyphens::Check,
ErrorPolicy::MarkErrors,
|_, _, _| true,
&mut to_unicode_simultaneous,
Some(&mut to_ascii_simultaneous),
) {
Ok(ProcessingSuccess::Passthrough) => (
Ok(source.to_string()),
if verify_dns_length(&source, true) {
Ok(source.to_string())
} else {
Err(Errors::default())
},
),
Ok(ProcessingSuccess::WroteToSink) => {
if to_ascii_simultaneous.is_empty() {
(
Ok(to_unicode_simultaneous.clone()),
if verify_dns_length(&to_unicode_simultaneous, true) {
Ok(to_unicode_simultaneous)
} else {
Err(Errors::default())
},
)
} else {
(
Ok(to_unicode_simultaneous),
if verify_dns_length(&to_ascii_simultaneous, true) {
Ok(to_ascii_simultaneous)
} else {
Err(Errors::default())
},
)
}
}
Err(ProcessingError::ValidityError) => {
(Err(Errors::default()), Err(Errors::default()))
}
Err(ProcessingError::SinkError) => unreachable!(),
};
check(
&source,
(&to_ascii_t, &to_ascii_t_status),
to_ascii_t_result,
|e| e.starts_with('C') || e == "V2",
(&to_unicode, &to_unicode_status),
to_unicode_simultaneous_result,
|e| e == "X4_2",
);
check(
&source,
(&to_ascii_n, &to_ascii_n_status),
to_ascii_simultaneous_result,
|_| false,
);
})),
)
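
The harness above also exercises the lower-level `process` entry point, which produces ToUnicode output and, optionally, ToASCII output in a single pass; an empty ASCII sink signals that the ToUnicode output is already the ASCII form. A condensed, hedged sketch of that calling convention (names as used in the harness; DNS-length verification via `verify_dns_length` is a separate step and omitted here):

```rust
use idna::uts46::{
    AsciiDenyList, ErrorPolicy, Hyphens, ProcessingError, ProcessingSuccess, Uts46,
};

fn both_forms(src: &str) -> Result<(String, String), ProcessingError> {
    let uts46 = Uts46::new();
    let mut unicode = String::new();
    let mut ascii = String::new();
    match uts46.process(
        src.as_bytes(),
        AsciiDenyList::STD3,
        Hyphens::Check,
        ErrorPolicy::MarkErrors,
        |_, _, _| true, // request ToASCII output for every label
        &mut unicode,
        Some(&mut ascii),
    )? {
        // Input was already ASCII and unchanged; no sink was written.
        ProcessingSuccess::Passthrough => Ok((src.to_string(), src.to_string())),
        // Empty ASCII sink: the Unicode output is itself the ASCII form.
        ProcessingSuccess::WroteToSink if ascii.is_empty() => {
            Ok((unicode.clone(), unicode))
        }
        ProcessingSuccess::WroteToSink => Ok((unicode, ascii)),
    }
}
```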

third_party/rust/unicode-normalization/.cargo-checksum.json (vendored, deleted)
@ -1 +0,0 @@
{"files":{"COPYRIGHT":"23860c2a7b5d96b21569afedf033469bab9fe14a1b24a35068b8641c578ce24d","Cargo.toml":"d43bfc158330a3a780af52ff0e82d88c8b54707ddf0469e6e27749c8ded4d1b7","LICENSE-APACHE":"a60eea817514531668d7e00765731449fe14d059d3249e0bc93b36de45f759f2","LICENSE-MIT":"7b63ecd5f1902af1b63729947373683c32745c16a10e8e6292e2e2dcd7e90ae0","README.md":"60162364e07490163f7a8e199c7a0a8ace165ae5aa7e4b6f16ff1617ddef5867","benches/bench.rs":"827e5343b059a732904be29717c2797203bfd0a633edf08042afea65372a3e2c","scripts/unicode.py":"6b1d9025fa9970c23b9721c6704aa085263408d645cf9c469295978010fd7504","src/__test_api.rs":"78e21bfa0b98894f545c8ed3e31cec20d7a48951a7f3ed69a6130c4b3d463aee","src/decompose.rs":"c0eb774843a545356e63bbcd7fb926f80d3c97ef4601ca3701fc34154f2e9905","src/lib.rs":"1983769ea083caa36b0736c87cf2a98e91c2b900f1d5dec64e327360fa862386","src/lookups.rs":"962f9909b32e02b8a2a05836135d9cd39bb1ce01f7c659de99cbd8a3a3c78574","src/no_std_prelude.rs":"602e81e67b8952b6571826f431e3b6787be3073bc10f38a0d3374278f81a6a1f","src/normalize.rs":"de2670b4437d335d42884af844a750f70e541467ecd34077dfe032103cb9b041","src/perfect_hash.rs":"400c84e2f467f61bd55d55d08672da6a9ad7a57c938ce5d0c701a6994b1b273b","src/quick_check.rs":"9756312d75fc31b67fca954e44a4812945a7e436b03ba18b9a2441f6de570f6f","src/recompose.rs":"a6228ad7561a5c7a1ef1d510159bdde1eea8a161007c80e470432e9b844d5536","src/replace.rs":"b24c904f3e00851a78820e30ddfa4ff10c795f8925fd0ee7f5870f31fdfa770b","src/stream_safe.rs":"383d71f0da401af8e735877e43855c7e16cb06deb2263539cdec2a407dbe257d","src/tables.rs":"3d9983a4e24c5b1e5dc272a025cdc729b7107f9a52a1fc89eca598e69af36c3a","src/test.rs":"3af8ad8c6bd2cc1ca44660bd265ad813c88d3074b448df4d9ff376b25fb77d26"},"package":"5c5713f0fc4b5db668a2ac63cdb7bb4469d8c9fed047b1d0292cc7b0ce2ba921"}

third_party/rust/unicode-normalization/COPYRIGHT (vendored, deleted)
@ -1,7 +0,0 @@
Licensed under the Apache License, Version 2.0
<LICENSE-APACHE or
http://www.apache.org/licenses/LICENSE-2.0> or the MIT
license <LICENSE-MIT or http://opensource.org/licenses/MIT>,
at your option. All files in the project carrying such
notice may not be copied, modified, or distributed except
according to those terms.

third_party/rust/unicode-normalization/Cargo.toml (vendored, deleted)
@ -1,52 +0,0 @@
# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
#
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
# to registry (e.g., crates.io) dependencies.
#
# If you are reading this file be aware that the original Cargo.toml
# will likely look very different (and much more reasonable).
# See Cargo.toml.orig for the original contents.
[package]
edition = "2018"
name = "unicode-normalization"
version = "0.1.22"
authors = [
"kwantam <kwantam@gmail.com>",
"Manish Goregaokar <manishsmail@gmail.com>",
]
exclude = [
"target/*",
"Cargo.lock",
"scripts/tmp",
"*.txt",
"tests/*",
]
description = """
This crate provides functions for normalization of
Unicode strings, including Canonical and Compatible
Decomposition and Recomposition, as described in
Unicode Standard Annex #15.
"""
homepage = "https://github.com/unicode-rs/unicode-normalization"
documentation = "https://docs.rs/unicode-normalization/"
readme = "README.md"
keywords = [
"text",
"unicode",
"normalization",
"decomposition",
"recomposition",
]
license = "MIT/Apache-2.0"
repository = "https://github.com/unicode-rs/unicode-normalization"
[dependencies.tinyvec]
version = "1"
features = ["alloc"]
[features]
default = ["std"]
std = []

third_party/rust/unicode-normalization/LICENSE-APACHE (vendored, deleted)
@ -1,201 +0,0 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

third_party/rust/unicode-normalization/README.md (vendored, deleted)
@ -1,39 +0,0 @@
# unicode-normalization
[![Build Status](https://travis-ci.org/unicode-rs/unicode-normalization.svg)](https://travis-ci.org/unicode-rs/unicode-normalization)
[![Docs](https://docs.rs/unicode-normalization/badge.svg)](https://docs.rs/unicode-normalization/)
Unicode character composition and decomposition utilities
as described in
[Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/).
This crate requires Rust 1.36+.
```rust
extern crate unicode_normalization;
use unicode_normalization::char::compose;
use unicode_normalization::UnicodeNormalization;
fn main() {
assert_eq!(compose('A','\u{30a}'), Some('Å'));
let s = "ÅΩ";
let c = s.nfc().collect::<String>();
assert_eq!(c, "ÅΩ");
}
```
## crates.io
You can use this package in your project by adding the following
to your `Cargo.toml`:
```toml
[dependencies]
unicode-normalization = "0.1.22"
```
## `no_std` + `alloc` support
This crate is completely `no_std` + `alloc` compatible. This can be enabled by disabling the `std` feature, i.e. specifying `default-features = false` for this crate on your `Cargo.toml`.

third_party/rust/unicode-normalization/benches/bench.rs (vendored, deleted)
@ -1,127 +0,0 @@
#![feature(test)]
extern crate test;
extern crate unicode_normalization;
use std::fs;
use test::Bencher;
use unicode_normalization::UnicodeNormalization;
const ASCII: &'static str = "all types of normalized";
const NFC: &'static str = "Introducci\u{00f3}n a Unicode.pdf";
const NFD: &'static str = "Introduccio\u{0301}n a Unicode.pdf";
#[bench]
fn bench_is_nfc_ascii(b: &mut Bencher) {
b.iter(|| unicode_normalization::is_nfc(ASCII));
}
#[bench]
fn bench_is_nfc_normalized(b: &mut Bencher) {
b.iter(|| unicode_normalization::is_nfc(NFC));
}
#[bench]
fn bench_is_nfc_not_normalized(b: &mut Bencher) {
b.iter(|| unicode_normalization::is_nfc(NFD));
}
#[bench]
fn bench_is_nfd_ascii(b: &mut Bencher) {
b.iter(|| unicode_normalization::is_nfd(ASCII));
}
#[bench]
fn bench_is_nfd_normalized(b: &mut Bencher) {
b.iter(|| unicode_normalization::is_nfd(NFD));
}
#[bench]
fn bench_is_nfd_not_normalized(b: &mut Bencher) {
b.iter(|| unicode_normalization::is_nfd(NFC));
}
#[bench]
fn bench_is_nfc_stream_safe_ascii(b: &mut Bencher) {
b.iter(|| unicode_normalization::is_nfc_stream_safe(ASCII));
}
#[bench]
fn bench_is_nfc_stream_safe_normalized(b: &mut Bencher) {
b.iter(|| unicode_normalization::is_nfc_stream_safe(NFC));
}
#[bench]
fn bench_is_nfc_stream_safe_not_normalized(b: &mut Bencher) {
b.iter(|| unicode_normalization::is_nfc_stream_safe(NFD));
}
#[bench]
fn bench_is_nfd_stream_safe_ascii(b: &mut Bencher) {
b.iter(|| unicode_normalization::is_nfd_stream_safe(ASCII));
}
#[bench]
fn bench_is_nfd_stream_safe_normalized(b: &mut Bencher) {
b.iter(|| unicode_normalization::is_nfd_stream_safe(NFD));
}
#[bench]
fn bench_is_nfd_stream_safe_not_normalized(b: &mut Bencher) {
b.iter(|| unicode_normalization::is_nfd_stream_safe(NFC));
}
#[bench]
fn bench_nfc_ascii(b: &mut Bencher) {
b.iter(|| ASCII.nfc().count());
}
#[bench]
fn bench_nfd_ascii(b: &mut Bencher) {
b.iter(|| ASCII.nfd().count());
}
#[bench]
fn bench_nfc_long(b: &mut Bencher) {
let long = fs::read_to_string("benches/long.txt").unwrap();
b.iter(|| long.nfc().count());
}
#[bench]
fn bench_nfd_long(b: &mut Bencher) {
let long = fs::read_to_string("benches/long.txt").unwrap();
b.iter(|| long.nfd().count());
}
#[bench]
fn bench_nfkc_ascii(b: &mut Bencher) {
b.iter(|| ASCII.nfkc().count());
}
#[bench]
fn bench_nfkd_ascii(b: &mut Bencher) {
b.iter(|| ASCII.nfkd().count());
}
#[bench]
fn bench_nfkc_long(b: &mut Bencher) {
let long = fs::read_to_string("benches/long.txt").unwrap();
b.iter(|| long.nfkc().count());
}
#[bench]
fn bench_nfkd_long(b: &mut Bencher) {
let long = fs::read_to_string("benches/long.txt").unwrap();
b.iter(|| long.nfkd().count());
}
#[bench]
fn bench_streamsafe_ascii(b: &mut Bencher) {
b.iter(|| ASCII.stream_safe().count());
}
#[bench]
fn bench_streamsafe_adversarial(b: &mut Bencher) {
let s = "bo\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{032e}oom";
b.iter(|| s.stream_safe().count());
}

third_party/rust/unicode-normalization/scripts/unicode.py (vendored, deleted)
@ -1,621 +0,0 @@
#!/usr/bin/env python
#
# Copyright 2011-2018 The Rust Project Developers. See the COPYRIGHT
# file at the top-level directory of this distribution and at
# http://rust-lang.org/COPYRIGHT.
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.
# This script uses the following Unicode tables:
# - DerivedNormalizationProps.txt
# - NormalizationTest.txt
# - UnicodeData.txt
# - StandardizedVariants.txt
#
# Since this should not require frequent updates, we just store this
# out-of-line and check the tables.rs and normalization_tests.rs files into git.
import collections
import urllib.request
UNICODE_VERSION = "15.0.0"
UCD_URL = "https://www.unicode.org/Public/%s/ucd/" % UNICODE_VERSION
PREAMBLE = """// Copyright 2012-2018 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly
#![allow(missing_docs)]
"""
NormalizationTest = collections.namedtuple(
"NormalizationTest",
["source", "nfc", "nfd", "nfkc", "nfkd"],
)
# Mapping taken from Table 12 from:
# http://www.unicode.org/reports/tr44/#General_Category_Values
expanded_categories = {
'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'],
'Lm': ['L'], 'Lo': ['L'],
'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'],
'Nd': ['N'], 'Nl': ['N'], 'No': ['No'],
'Pc': ['P'], 'Pd': ['P'], 'Ps': ['P'], 'Pe': ['P'],
'Pi': ['P'], 'Pf': ['P'], 'Po': ['P'],
'Sm': ['S'], 'Sc': ['S'], 'Sk': ['S'], 'So': ['S'],
'Zs': ['Z'], 'Zl': ['Z'], 'Zp': ['Z'],
'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'],
}
# Constants from Unicode 9.0.0 Section 3.12 Conjoining Jamo Behavior
# http://www.unicode.org/versions/Unicode9.0.0/ch03.pdf#M9.32468.Heading.310.Combining.Jamo.Behavior
S_BASE, L_COUNT, V_COUNT, T_COUNT = 0xAC00, 19, 21, 28
S_COUNT = L_COUNT * V_COUNT * T_COUNT
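
These constants drive the purely arithmetic Hangul handling that the generated tables deliberately skip. As a hedged sketch of the conjoining-jamo arithmetic they imply (Unicode Standard section 3.12; the crate's real implementation lives in src/normalize.rs further down in this diff):

```rust
// Conjoining-jamo decomposition per Unicode 3.12, using the same constants.
const S_BASE: u32 = 0xAC00;
const L_BASE: u32 = 0x1100;
const V_BASE: u32 = 0x1161;
const T_BASE: u32 = 0x11A7;
const V_COUNT: u32 = 21;
const T_COUNT: u32 = 28;
const N_COUNT: u32 = V_COUNT * T_COUNT; // 588

/// Split a precomposed syllable into (leading, vowel, optional trailing).
fn decompose_hangul(s: char) -> (char, char, Option<char>) {
    let si = s as u32 - S_BASE;
    let l = char::from_u32(L_BASE + si / N_COUNT).unwrap();
    let v = char::from_u32(V_BASE + (si % N_COUNT) / T_COUNT).unwrap();
    let t_index = si % T_COUNT;
    let t = if t_index > 0 {
        char::from_u32(T_BASE + t_index)
    } else {
        None
    };
    (l, v, t)
}
// E.g. '한' (U+D55C) splits into U+1112, U+1161, U+11AB.
```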
class UnicodeData(object):
def __init__(self):
self._load_unicode_data()
self.norm_props = self._load_norm_props()
self.norm_tests = self._load_norm_tests()
self.canon_comp = self._compute_canonical_comp()
self.canon_fully_decomp, self.compat_fully_decomp = self._compute_fully_decomposed()
self.cjk_compat_variants_fully_decomp = {}
self._load_cjk_compat_ideograph_variants()
def stats(name, table):
count = sum(len(v) for v in table.values())
print("%s: %d chars => %d decomposed chars" % (name, len(table), count))
print("Decomposition table stats:")
stats("Canonical decomp", self.canon_decomp)
stats("Compatible decomp", self.compat_decomp)
stats("Canonical fully decomp", self.canon_fully_decomp)
stats("Compatible fully decomp", self.compat_fully_decomp)
stats("CJK Compat Variants fully decomp", self.cjk_compat_variants_fully_decomp)
self.ss_leading, self.ss_trailing = self._compute_stream_safe_tables()
def _fetch(self, filename):
resp = urllib.request.urlopen(UCD_URL + filename)
return resp.read().decode('utf-8')
def _load_unicode_data(self):
self.name_to_char_int = {}
self.combining_classes = {}
self.compat_decomp = {}
self.canon_decomp = {}
self.general_category_mark = []
self.general_category_public_assigned = []
assigned_start = 0;
prev_char_int = -1;
prev_name = "";
for line in self._fetch("UnicodeData.txt").splitlines():
# See ftp://ftp.unicode.org/Public/3.0-Update/UnicodeData-3.0.0.html
pieces = line.split(';')
assert len(pieces) == 15
char, name, category, cc, decomp = pieces[0], pieces[1], pieces[2], pieces[3], pieces[5]
char_int = int(char, 16)
name = pieces[1].strip()
self.name_to_char_int[name] = char_int
if cc != '0':
self.combining_classes[char_int] = cc
if decomp.startswith('<'):
self.compat_decomp[char_int] = [int(c, 16) for c in decomp.split()[1:]]
elif decomp != '':
self.canon_decomp[char_int] = [int(c, 16) for c in decomp.split()]
if category == 'M' or 'M' in expanded_categories.get(category, []):
self.general_category_mark.append(char_int)
assert category != 'Cn', "Unexpected: Unassigned codepoint in UnicodeData.txt"
if category not in ['Co', 'Cs']:
if char_int != prev_char_int + 1 and not is_first_and_last(prev_name, name):
self.general_category_public_assigned.append((assigned_start, prev_char_int))
assigned_start = char_int
prev_char_int = char_int
prev_name = name;
self.general_category_public_assigned.append((assigned_start, prev_char_int))
def _load_cjk_compat_ideograph_variants(self):
for line in self._fetch("StandardizedVariants.txt").splitlines():
strip_comments = line.split('#', 1)[0].strip()
if not strip_comments:
continue
variation_sequence, description, differences = strip_comments.split(';')
description = description.strip()
# Don't use variations that only apply in particular shaping environments.
if differences:
continue
# Look for entries where the description field is a codepoint name.
if description not in self.name_to_char_int:
continue
# Only consider the CJK Compatibility Ideographs.
if not description.startswith('CJK COMPATIBILITY IDEOGRAPH-'):
continue
char_int = self.name_to_char_int[description]
assert not char_int in self.combining_classes, "Unexpected: CJK compat variant with a combining class"
assert not char_int in self.compat_decomp, "Unexpected: CJK compat variant and compatibility decomposition"
assert len(self.canon_decomp[char_int]) == 1, "Unexpected: CJK compat variant and non-singleton canonical decomposition"
# If we ever need to handle Hangul here, we'll need to handle it separately.
assert not (S_BASE <= char_int < S_BASE + S_COUNT)
cjk_compat_variant_parts = [int(c, 16) for c in variation_sequence.split()]
for c in cjk_compat_variant_parts:
assert not c in self.canon_decomp, "Unexpected: CJK compat variant is unnormalized (canon)"
assert not c in self.compat_decomp, "Unexpected: CJK compat variant is unnormalized (compat)"
self.cjk_compat_variants_fully_decomp[char_int] = cjk_compat_variant_parts
def _load_norm_props(self):
props = collections.defaultdict(list)
for line in self._fetch("DerivedNormalizationProps.txt").splitlines():
(prop_data, _, _) = line.partition("#")
prop_pieces = prop_data.split(";")
if len(prop_pieces) < 2:
continue
assert len(prop_pieces) <= 3
(low, _, high) = prop_pieces[0].strip().partition("..")
prop = prop_pieces[1].strip()
data = None
if len(prop_pieces) == 3:
data = prop_pieces[2].strip()
props[prop].append((low, high, data))
return props
def _load_norm_tests(self):
tests = []
for line in self._fetch("NormalizationTest.txt").splitlines():
(test_data, _, _) = line.partition("#")
test_pieces = test_data.split(";")
if len(test_pieces) < 5:
continue
source, nfc, nfd, nfkc, nfkd = [[c.strip() for c in p.split()] for p in test_pieces[:5]]
tests.append(NormalizationTest(source, nfc, nfd, nfkc, nfkd))
return tests
def _compute_canonical_comp(self):
canon_comp = {}
comp_exclusions = [
(int(low, 16), int(high or low, 16))
for low, high, _ in self.norm_props["Full_Composition_Exclusion"]
]
for char_int, decomp in self.canon_decomp.items():
if any(lo <= char_int <= hi for lo, hi in comp_exclusions):
continue
assert len(decomp) == 2
assert (decomp[0], decomp[1]) not in canon_comp
canon_comp[(decomp[0], decomp[1])] = char_int
return canon_comp
def _compute_fully_decomposed(self):
"""
Even though the decomposition algorithm is recursive, it is possible
to precompute the recursion at table generation time with modest
increase to the table size. Then, for these precomputed tables, we
note that 1) compatible decomposition is a subset of canonical
decomposition and 2) they mostly agree on their intersection.
Therefore, we don't store entries in the compatible table for
characters that decompose the same way under canonical decomposition.
Decomposition table stats:
Canonical decomp: 2060 chars => 3085 decomposed chars
Compatible decomp: 3662 chars => 5440 decomposed chars
Canonical fully decomp: 2060 chars => 3404 decomposed chars
Compatible fully decomp: 3678 chars => 5599 decomposed chars
The upshot is that decomposition code is very simple and easy to inline
at mild code size cost.
"""
def _decompose(char_int, compatible):
# 7-bit ASCII never decomposes
if char_int <= 0x7f:
yield char_int
return
# Assert that we're handling Hangul separately.
assert not (S_BASE <= char_int < S_BASE + S_COUNT)
decomp = self.canon_decomp.get(char_int)
if decomp is not None:
for decomposed_ch in decomp:
for fully_decomposed_ch in _decompose(decomposed_ch, compatible):
yield fully_decomposed_ch
return
if compatible and char_int in self.compat_decomp:
for decomposed_ch in self.compat_decomp[char_int]:
for fully_decomposed_ch in _decompose(decomposed_ch, compatible):
yield fully_decomposed_ch
return
yield char_int
return
end_codepoint = max(
max(self.canon_decomp.keys()),
max(self.compat_decomp.keys()),
)
canon_fully_decomp = {}
compat_fully_decomp = {}
for char_int in range(0, end_codepoint + 1):
# Always skip Hangul, since it's more efficient to represent its
# decomposition programmatically.
if S_BASE <= char_int < S_BASE + S_COUNT:
continue
canon = list(_decompose(char_int, False))
if not (len(canon) == 1 and canon[0] == char_int):
canon_fully_decomp[char_int] = canon
compat = list(_decompose(char_int, True))
if not (len(compat) == 1 and compat[0] == char_int):
compat_fully_decomp[char_int] = compat
# Since canon_fully_decomp is a subset of compat_fully_decomp, we don't
# need to store their overlap when they agree. When they don't agree,
# store the decomposition in the compatibility table since we'll check
# that first when normalizing to NFKD.
assert set(canon_fully_decomp) <= set(compat_fully_decomp)
for ch in set(canon_fully_decomp) & set(compat_fully_decomp):
if canon_fully_decomp[ch] == compat_fully_decomp[ch]:
del compat_fully_decomp[ch]
return canon_fully_decomp, compat_fully_decomp
def _compute_stream_safe_tables(self):
"""
To make a text stream-safe with the Stream-Safe Text Process (UAX15-D4),
we need to be able to know the number of contiguous non-starters *after*
applying compatibility decomposition to each character.
We can do this incrementally by computing the number of leading and
trailing non-starters for each character's compatibility decomposition
with the following rules:
1) If a character is not affected by compatibility decomposition, look
up its canonical combining class to find out if it's a non-starter.
2) All Hangul characters are starters, even under decomposition.
3) Otherwise, very few decomposing characters have a nonzero count
of leading or trailing non-starters, so store these characters
with their associated counts in a separate table.
"""
leading_nonstarters = {}
trailing_nonstarters = {}
for c in set(self.canon_fully_decomp) | set(self.compat_fully_decomp):
decomposed = self.compat_fully_decomp.get(c) or self.canon_fully_decomp[c]
num_leading = 0
for d in decomposed:
if d not in self.combining_classes:
break
num_leading += 1
num_trailing = 0
for d in reversed(decomposed):
if d not in self.combining_classes:
break
num_trailing += 1
if num_leading > 0:
leading_nonstarters[c] = num_leading
if num_trailing > 0:
trailing_nonstarters[c] = num_trailing
return leading_nonstarters, trailing_nonstarters
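
The two tables returned here let the runtime apply UAX15-D4 incrementally: track the current run of non-starters and break it with U+034F COMBINING GRAPHEME JOINER before it can exceed 30. A simplified, hedged sketch of the consuming side (`leading`, `trailing`, and `len` stand in for lookups against these tables plus the decomposition length; the crate's real implementation is src/stream_safe.rs):

```rust
const MAX_NONSTARTERS: usize = 30;
const CGJ: char = '\u{034F}'; // COMBINING GRAPHEME JOINER

fn stream_safe(
    input: &str,
    leading: impl Fn(char) -> usize,
    trailing: impl Fn(char) -> usize,
    len: impl Fn(char) -> usize,
) -> String {
    let mut out = String::new();
    let mut run = 0usize; // contiguous non-starters accumulated so far
    for c in input.chars() {
        if run + leading(c) > MAX_NONSTARTERS {
            out.push(CGJ); // break the run before it can exceed 30
            run = 0;
        }
        if leading(c) == len(c) {
            run += len(c); // decomposition has no starter: run keeps growing
        } else {
            run = trailing(c); // a starter resets the run to the tail count
        }
        out.push(c);
    }
    out
}
```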
hexify = lambda c: '{:04X}'.format(c)
# Test whether `first` and `last` are corresponding "<..., First>" and
# "<..., Last>" markers.
def is_first_and_last(first, last):
if not first.startswith('<') or not first.endswith(', First>'):
return False
if not last.startswith('<') or not last.endswith(', Last>'):
return False
return first[1:-8] == last[1:-7]
def gen_mph_data(name, d, kv_type, kv_callback):
(salt, keys) = minimal_perfect_hash(d)
out.write("pub(crate) const %s_SALT: &[u16] = &[\n" % name.upper())
for s in salt:
out.write(" 0x{:x},\n".format(s))
out.write("];\n")
out.write("pub(crate) const {}_KV: &[{}] = &[\n".format(name.upper(), kv_type))
for k in keys:
out.write(" {},\n".format(kv_callback(k)))
out.write("];\n\n")
def gen_combining_class(combining_classes, out):
gen_mph_data('canonical_combining_class', combining_classes, 'u32',
lambda k: "0x{:X}".format(int(combining_classes[k]) | (k << 8)))
def gen_composition_table(canon_comp, out):
table = {}
for (c1, c2), c3 in canon_comp.items():
if c1 < 0x10000 and c2 < 0x10000:
table[(c1 << 16) | c2] = c3
(salt, keys) = minimal_perfect_hash(table)
gen_mph_data('COMPOSITION_TABLE', table, '(u32, char)',
lambda k: "(0x%s, '\\u{%s}')" % (hexify(k), hexify(table[k])))
out.write("pub(crate) fn composition_table_astral(c1: char, c2: char) -> Option<char> {\n")
out.write(" match (c1, c2) {\n")
for (c1, c2), c3 in sorted(canon_comp.items()):
if c1 >= 0x10000 and c2 >= 0x10000:
out.write(" ('\\u{%s}', '\\u{%s}') => Some('\\u{%s}'),\n" % (hexify(c1), hexify(c2), hexify(c3)))
out.write(" _ => None,\n")
out.write(" }\n")
out.write("}\n")
def gen_decomposition_tables(canon_decomp, compat_decomp, cjk_compat_variants_decomp, out):
tables = [(canon_decomp, 'canonical'), (compat_decomp, 'compatibility'), (cjk_compat_variants_decomp, 'cjk_compat_variants')]
for table, name in tables:
offsets = {}
offset = 0
out.write("pub(crate) const %s_DECOMPOSED_CHARS: &[char] = &[\n" % name.upper())
for k, v in table.items():
offsets[k] = offset
offset += len(v)
for c in v:
out.write(" '\\u{%s}',\n" % hexify(c))
# The largest offset must fit in a u16.
assert offset < 65536
out.write("];\n")
gen_mph_data(name + '_decomposed', table, "(u32, (u16, u16))",
lambda k: "(0x{:x}, ({}, {}))".format(k, offsets[k], len(table[k])))
def gen_qc_match(prop_table, out):
out.write(" match c {\n")
for low, high, data in prop_table:
assert data in ('N', 'M')
result = "No" if data == 'N' else "Maybe"
if high:
out.write(r" '\u{%s}'...'\u{%s}' => %s," % (low, high, result))
else:
out.write(r" '\u{%s}' => %s," % (low, result))
out.write("\n")
out.write(" _ => Yes,\n")
out.write(" }\n")
def gen_nfc_qc(prop_tables, out):
out.write("#[inline]\n")
out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
out.write("pub fn qc_nfc(c: char) -> IsNormalized {\n")
gen_qc_match(prop_tables['NFC_QC'], out)
out.write("}\n")
def gen_nfkc_qc(prop_tables, out):
out.write("#[inline]\n")
out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
out.write("pub fn qc_nfkc(c: char) -> IsNormalized {\n")
gen_qc_match(prop_tables['NFKC_QC'], out)
out.write("}\n")
def gen_nfd_qc(prop_tables, out):
out.write("#[inline]\n")
out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
out.write("pub fn qc_nfd(c: char) -> IsNormalized {\n")
gen_qc_match(prop_tables['NFD_QC'], out)
out.write("}\n")
def gen_nfkd_qc(prop_tables, out):
out.write("#[inline]\n")
out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
out.write("pub fn qc_nfkd(c: char) -> IsNormalized {\n")
gen_qc_match(prop_tables['NFKD_QC'], out)
out.write("}\n")
def gen_combining_mark(general_category_mark, out):
gen_mph_data('combining_mark', general_category_mark, 'u32',
lambda k: '0x{:04x}'.format(k))
def gen_public_assigned(general_category_public_assigned, out):
# This could be done as a hash but the table is somewhat small.
out.write("#[inline]\n")
out.write("pub fn is_public_assigned(c: char) -> bool {\n")
out.write(" match c {\n")
start = True
for first, last in general_category_public_assigned:
if start:
out.write(" ")
start = False
else:
out.write(" | ")
if first == last:
out.write("'\\u{%s}'\n" % hexify(first))
else:
out.write("'\\u{%s}'..='\\u{%s}'\n" % (hexify(first), hexify(last)))
out.write(" => true,\n")
out.write(" _ => false,\n")
out.write(" }\n")
out.write("}\n")
out.write("\n")
def gen_stream_safe(leading, trailing, out):
# This could be done as a hash but the table is very small.
out.write("#[inline]\n")
out.write("pub fn stream_safe_leading_nonstarters(c: char) -> usize {\n")
out.write(" match c {\n")
for char, num_leading in sorted(leading.items()):
out.write(" '\\u{%s}' => %d,\n" % (hexify(char), num_leading))
out.write(" _ => 0,\n")
out.write(" }\n")
out.write("}\n")
out.write("\n")
gen_mph_data('trailing_nonstarters', trailing, 'u32',
lambda k: "0x{:X}".format(int(trailing[k]) | (k << 8)))
def gen_tests(tests, out):
out.write("""#[derive(Debug)]
pub struct NormalizationTest {
pub source: &'static str,
pub nfc: &'static str,
pub nfd: &'static str,
pub nfkc: &'static str,
pub nfkd: &'static str,
}
""")
out.write("pub const NORMALIZATION_TESTS: &[NormalizationTest] = &[\n")
str_literal = lambda s: '"%s"' % "".join("\\u{%s}" % c for c in s)
for test in tests:
out.write(" NormalizationTest {\n")
out.write(" source: %s,\n" % str_literal(test.source))
out.write(" nfc: %s,\n" % str_literal(test.nfc))
out.write(" nfd: %s,\n" % str_literal(test.nfd))
out.write(" nfkc: %s,\n" % str_literal(test.nfkc))
out.write(" nfkd: %s,\n" % str_literal(test.nfkd))
out.write(" },\n")
out.write("];\n")
# Guaranteed to be less than n.
def my_hash(x, salt, n):
# This is a hash based on the theory that multiplication is efficient
mask_32 = 0xffffffff
y = ((x + salt) * 2654435769) & mask_32
y ^= (x * 0x31415926) & mask_32
return (y * n) >> 32
# Compute minimal perfect hash function, d can be either a dict or list of keys.
def minimal_perfect_hash(d):
n = len(d)
buckets = dict((h, []) for h in range(n))
for key in d:
h = my_hash(key, 0, n)
buckets[h].append(key)
bsorted = [(len(buckets[h]), h) for h in range(n)]
bsorted.sort(reverse = True)
claimed = [False] * n
salts = [0] * n
keys = [0] * n
for (bucket_size, h) in bsorted:
# Note: the traditional perfect hashing approach would also special-case
# bucket_size == 1 here and assign any empty slot, rather than iterating
# until rehash finds an empty slot. But we're not doing that so we can
# avoid the branch.
if bucket_size == 0:
break
else:
for salt in range(1, 32768):
rehashes = [my_hash(key, salt, n) for key in buckets[h]]
# Make sure there are no rehash collisions within this bucket.
if all(not claimed[hash] for hash in rehashes):
if len(set(rehashes)) < bucket_size:
continue
salts[h] = salt
for key in buckets[h]:
rehash = my_hash(key, salt, n)
claimed[rehash] = True
keys[rehash] = key
break
if salts[h] == 0:
print("minimal perfect hashing failed")
# Note: if this happens (because of unfortunate data), then there are
# a few things that could be done. First, the hash function could be
# tweaked. Second, the bucket order could be scrambled (especially the
# singletons). Right now, the buckets are sorted, which has the advantage
# of being deterministic.
#
# As a more extreme approach, the singleton bucket optimization could be
# applied (give the direct address for singleton buckets, rather than
# relying on a rehash). That is definitely the more standard approach in
# the minimal perfect hashing literature, but in testing the branch was a
# significant slowdown.
exit(1)
return (salts, keys)
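
At runtime the lookup inverts this construction in two probes: hash with salt 0 to pick a bucket, read that bucket's salt, rehash to find the slot, and compare keys. A hedged Rust sketch of the consuming side (the vendored crate's generic version lives in src/perfect_hash.rs and is called from src/lookups.rs below):

```rust
// Mirror of the Python my_hash above; guaranteed to return a value < n.
fn my_hash(x: u32, salt: u32, n: usize) -> usize {
    let y = x.wrapping_add(salt).wrapping_mul(2654435769);
    let y = y ^ x.wrapping_mul(0x31415926);
    ((y as u64 * n as u64) >> 32) as usize
}

// Two-probe lookup over the generated SALT/KV tables (u8-valued case).
fn mph_lookup(x: u32, salt: &[u16], kv: &[(u32, u8)], default: u8) -> u8 {
    let s = salt[my_hash(x, 0, salt.len())] as u32;
    let (key, val) = kv[my_hash(x, s, salt.len())];
    if key == x {
        val
    } else {
        default
    }
}
```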
if __name__ == '__main__':
data = UnicodeData()
with open("tables.rs", "w", newline = "\n") as out:
out.write(PREAMBLE)
out.write("use crate::quick_check::IsNormalized;\n")
out.write("use crate::quick_check::IsNormalized::*;\n")
out.write("\n")
version = "(%s, %s, %s)" % tuple(UNICODE_VERSION.split("."))
out.write("#[allow(unused)]\n")
out.write("pub const UNICODE_VERSION: (u8, u8, u8) = %s;\n\n" % version)
gen_combining_class(data.combining_classes, out)
out.write("\n")
gen_composition_table(data.canon_comp, out)
out.write("\n")
gen_decomposition_tables(data.canon_fully_decomp, data.compat_fully_decomp, data.cjk_compat_variants_fully_decomp, out)
gen_combining_mark(data.general_category_mark, out)
out.write("\n")
gen_public_assigned(data.general_category_public_assigned, out)
out.write("\n")
gen_nfc_qc(data.norm_props, out)
out.write("\n")
gen_nfkc_qc(data.norm_props, out)
out.write("\n")
gen_nfd_qc(data.norm_props, out)
out.write("\n")
gen_nfkd_qc(data.norm_props, out)
out.write("\n")
gen_stream_safe(data.ss_leading, data.ss_trailing, out)
out.write("\n")
with open("normalization_tests.rs", "w", newline = "\n") as out:
out.write(PREAMBLE)
gen_tests(data.norm_tests, out)

third_party/rust/unicode-normalization/src/__test_api.rs (vendored, deleted)
@ -1,18 +0,0 @@
// This crate comprises hacks and glue required to test private functions from tests/
//
// Keep this as slim as possible.
//
// If you're caught using this outside this crate's tests/, you get to clean up the mess.
#[cfg(not(feature = "std"))]
use crate::no_std_prelude::*;
use crate::stream_safe::StreamSafe;
pub fn stream_safe(s: &str) -> String {
StreamSafe::new(s.chars()).collect()
}
pub mod quick_check {
pub use crate::quick_check::*;
}

third_party/rust/unicode-normalization/src/decompose.rs (vendored, deleted)
@ -1,161 +0,0 @@
// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use core::fmt::{self, Write};
use core::iter::Fuse;
use core::ops::Range;
use tinyvec::TinyVec;
#[derive(Clone)]
enum DecompositionType {
Canonical,
Compatible,
}
/// External iterator for a string decomposition's characters.
#[derive(Clone)]
pub struct Decompositions<I> {
kind: DecompositionType,
iter: Fuse<I>,
// This buffer stores pairs of (canonical combining class, character),
// pushed onto the end in text order.
//
// It's divided into up to three sections:
// 1) A prefix that is free space;
// 2) "Ready" characters which are sorted and ready to emit on demand;
// 3) A "pending" block which stills needs more characters for us to be able
// to sort in canonical order and is not safe to emit.
buffer: TinyVec<[(u8, char); 4]>,
ready: Range<usize>,
}
#[inline]
pub fn new_canonical<I: Iterator<Item = char>>(iter: I) -> Decompositions<I> {
Decompositions {
kind: self::DecompositionType::Canonical,
iter: iter.fuse(),
buffer: TinyVec::new(),
ready: 0..0,
}
}
#[inline]
pub fn new_compatible<I: Iterator<Item = char>>(iter: I) -> Decompositions<I> {
Decompositions {
kind: self::DecompositionType::Compatible,
iter: iter.fuse(),
buffer: TinyVec::new(),
ready: 0..0,
}
}
impl<I> Decompositions<I> {
#[inline]
fn push_back(&mut self, ch: char) {
let class = super::char::canonical_combining_class(ch);
if class == 0 {
self.sort_pending();
self.buffer.push((class, ch));
self.ready.end = self.buffer.len();
} else {
self.buffer.push((class, ch));
}
}
#[inline]
fn sort_pending(&mut self) {
// NB: `sort_by_key` is stable, so it will preserve the original text's
// order within a combining class.
self.buffer[self.ready.end..].sort_by_key(|k| k.0);
}
#[inline]
fn reset_buffer(&mut self) {
// Equivalent to `self.buffer.drain(0..self.ready.end)`
// but faster than drain() if the buffer is a SmallVec or TinyVec
let pending = self.buffer.len() - self.ready.end;
for i in 0..pending {
self.buffer[i] = self.buffer[i + self.ready.end];
}
self.buffer.truncate(pending);
self.ready = 0..0;
}
#[inline]
fn increment_next_ready(&mut self) {
let next = self.ready.start + 1;
if next == self.ready.end {
self.reset_buffer();
} else {
self.ready.start = next;
}
}
}
impl<I: Iterator<Item = char>> Iterator for Decompositions<I> {
type Item = char;
#[inline]
fn next(&mut self) -> Option<char> {
while self.ready.end == 0 {
match (self.iter.next(), &self.kind) {
(Some(ch), &DecompositionType::Canonical) => {
super::char::decompose_canonical(ch, |d| self.push_back(d));
}
(Some(ch), &DecompositionType::Compatible) => {
super::char::decompose_compatible(ch, |d| self.push_back(d));
}
(None, _) => {
if self.buffer.is_empty() {
return None;
} else {
self.sort_pending();
self.ready.end = self.buffer.len();
// This implementation means that we can call `next`
// on an exhausted iterator; the last outer `next` call
// will result in an inner `next` call. To make this
// safe, we use `fuse`.
break;
}
}
}
}
// We can assume here that, if `self.ready.end` is greater than zero,
// it's also greater than `self.ready.start`. That's because we only
// increment `self.ready.start` inside `increment_next_ready`, and
// whenever it reaches equality with `self.ready.end`, we reset both
// to zero, maintaining the invariant that:
// self.ready.start < self.ready.end || self.ready.end == self.ready.start == 0
//
// This less-than-obviously-safe implementation is chosen for performance,
// minimizing the number & complexity of branches in `next` in the common
// case of buffering then unbuffering a single character with each call.
let (_, ch) = self.buffer[self.ready.start];
self.increment_next_ready();
Some(ch)
}
fn size_hint(&self) -> (usize, Option<usize>) {
let (lower, _) = self.iter.size_hint();
(lower, None)
}
}
impl<I: Iterator<Item = char> + Clone> fmt::Display for Decompositions<I> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
for c in self.clone() {
f.write_char(c)?;
}
Ok(())
}
}
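
The net effect of this buffering is Unicode canonical ordering: within a run of non-starters, characters are stably sorted by combining class before being emitted. A small hedged example against the crate's public API (per the README in this diff):

```rust
use unicode_normalization::UnicodeNormalization;

fn main() {
    // U+0301 (class 230) followed by U+0323 (class 220): NFD reorders the
    // pending non-starters by combining class before emitting them.
    let s = "q\u{0301}\u{0323}";
    let nfd: String = s.nfd().collect();
    assert_eq!(nfd, "q\u{0323}\u{0301}");
}
```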

third_party/rust/unicode-normalization/src/lib.rs (vendored, deleted)
@ -1,235 +0,0 @@
// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
//! Unicode character composition and decomposition utilities
//! as described in
//! [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/).
//!
//! ```rust
//! extern crate unicode_normalization;
//!
//! use unicode_normalization::char::compose;
//! use unicode_normalization::UnicodeNormalization;
//!
//! fn main() {
//! assert_eq!(compose('A','\u{30a}'), Some('Å'));
//!
//! let s = "ÅΩ";
//! let c = s.nfc().collect::<String>();
//! assert_eq!(c, "ÅΩ");
//! }
//! ```
//!
//! # crates.io
//!
//! You can use this package in your project by adding the following
//! to your `Cargo.toml`:
//!
//! ```toml
//! [dependencies]
//! unicode-normalization = "0.1.20"
//! ```
#![deny(missing_docs, unsafe_code)]
#![doc(
html_logo_url = "https://unicode-rs.github.io/unicode-rs_sm.png",
html_favicon_url = "https://unicode-rs.github.io/unicode-rs_sm.png"
)]
#![cfg_attr(not(feature = "std"), no_std)]
#[cfg(not(feature = "std"))]
extern crate alloc;
#[cfg(feature = "std")]
extern crate core;
extern crate tinyvec;
pub use crate::decompose::Decompositions;
pub use crate::quick_check::{
is_nfc, is_nfc_quick, is_nfc_stream_safe, is_nfc_stream_safe_quick, is_nfd, is_nfd_quick,
is_nfd_stream_safe, is_nfd_stream_safe_quick, is_nfkc, is_nfkc_quick, is_nfkd, is_nfkd_quick,
IsNormalized,
};
pub use crate::recompose::Recompositions;
pub use crate::replace::Replacements;
pub use crate::stream_safe::StreamSafe;
pub use crate::tables::UNICODE_VERSION;
use core::{
str::Chars,
option,
};
mod no_std_prelude;
mod decompose;
mod lookups;
mod normalize;
mod perfect_hash;
mod quick_check;
mod recompose;
mod replace;
mod stream_safe;
#[rustfmt::skip]
mod tables;
#[doc(hidden)]
pub mod __test_api;
#[cfg(test)]
mod test;
/// Methods for composing and decomposing characters.
pub mod char {
pub use crate::normalize::{
compose, decompose_canonical, decompose_cjk_compat_variants, decompose_compatible,
};
pub use crate::lookups::{canonical_combining_class, is_combining_mark};
/// Return whether the given character is assigned (`General_Category` != `Unassigned`)
/// and not Private-Use (`General_Category` != `Private_Use`), in the supported version
/// of Unicode.
pub use crate::tables::is_public_assigned;
}
/// Methods for iterating over strings while applying Unicode normalizations
/// as described in
/// [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/).
pub trait UnicodeNormalization<I: Iterator<Item = char>> {
/// Returns an iterator over the string in Unicode Normalization Form D
/// (canonical decomposition).
fn nfd(self) -> Decompositions<I>;
/// Returns an iterator over the string in Unicode Normalization Form KD
/// (compatibility decomposition).
fn nfkd(self) -> Decompositions<I>;
/// An Iterator over the string in Unicode Normalization Form C
/// (canonical decomposition followed by canonical composition).
fn nfc(self) -> Recompositions<I>;
/// An Iterator over the string in Unicode Normalization Form KC
/// (compatibility decomposition followed by canonical composition).
fn nfkc(self) -> Recompositions<I>;
/// A transformation which replaces CJK Compatibility Ideograph codepoints
/// with normal forms using Standardized Variation Sequences. This is not
/// part of the canonical or compatibility decomposition algorithms, but
/// performing it before those algorithms produces normalized output which
/// better preserves the intent of the original text.
///
/// Note that many systems today ignore variation selectors, so these
/// may not immediately help text display as intended, but they at
/// least preserve the information in a standardized form, giving
/// implementations the option to recognize them.
fn cjk_compat_variants(self) -> Replacements<I>;
/// An Iterator over the string with Conjoining Grapheme Joiner characters
/// inserted according to the Stream-Safe Text Process (UAX15-D4)
fn stream_safe(self) -> StreamSafe<I>;
}
impl<'a> UnicodeNormalization<Chars<'a>> for &'a str {
#[inline]
fn nfd(self) -> Decompositions<Chars<'a>> {
decompose::new_canonical(self.chars())
}
#[inline]
fn nfkd(self) -> Decompositions<Chars<'a>> {
decompose::new_compatible(self.chars())
}
#[inline]
fn nfc(self) -> Recompositions<Chars<'a>> {
recompose::new_canonical(self.chars())
}
#[inline]
fn nfkc(self) -> Recompositions<Chars<'a>> {
recompose::new_compatible(self.chars())
}
#[inline]
fn cjk_compat_variants(self) -> Replacements<Chars<'a>> {
replace::new_cjk_compat_variants(self.chars())
}
#[inline]
fn stream_safe(self) -> StreamSafe<Chars<'a>> {
StreamSafe::new(self.chars())
}
}
impl UnicodeNormalization<option::IntoIter<char>> for char {
#[inline]
fn nfd(self) -> Decompositions<option::IntoIter<char>> {
decompose::new_canonical(Some(self).into_iter())
}
#[inline]
fn nfkd(self) -> Decompositions<option::IntoIter<char>> {
decompose::new_compatible(Some(self).into_iter())
}
#[inline]
fn nfc(self) -> Recompositions<option::IntoIter<char>> {
recompose::new_canonical(Some(self).into_iter())
}
#[inline]
fn nfkc(self) -> Recompositions<option::IntoIter<char>> {
recompose::new_compatible(Some(self).into_iter())
}
#[inline]
fn cjk_compat_variants(self) -> Replacements<option::IntoIter<char>> {
replace::new_cjk_compat_variants(Some(self).into_iter())
}
#[inline]
fn stream_safe(self) -> StreamSafe<option::IntoIter<char>> {
StreamSafe::new(Some(self).into_iter())
}
}
impl<I: Iterator<Item = char>> UnicodeNormalization<I> for I {
#[inline]
fn nfd(self) -> Decompositions<I> {
decompose::new_canonical(self)
}
#[inline]
fn nfkd(self) -> Decompositions<I> {
decompose::new_compatible(self)
}
#[inline]
fn nfc(self) -> Recompositions<I> {
recompose::new_canonical(self)
}
#[inline]
fn nfkc(self) -> Recompositions<I> {
recompose::new_compatible(self)
}
#[inline]
fn cjk_compat_variants(self) -> Replacements<I> {
replace::new_cjk_compat_variants(self)
}
#[inline]
fn stream_safe(self) -> StreamSafe<I> {
StreamSafe::new(self)
}
}
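
Because the trait is implemented for `&str`, for `char`, and for any `Iterator<Item = char>`, normalization composes with ordinary iterator pipelines. A small hedged example:

```rust
use unicode_normalization::UnicodeNormalization;

fn main() {
    // The char impl: U+212B ANGSTROM SIGN normalizes to U+00C5 under NFC.
    let nfc: String = '\u{212B}'.nfc().collect();
    assert_eq!(nfc, "\u{C5}");

    // The iterator impl: normalize while streaming, no intermediate String.
    let nfd: String = "\u{C5}".chars().nfd().collect();
    assert_eq!(nfd, "A\u{30A}");
}
```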

third_party/rust/unicode-normalization/src/lookups.rs (vendored, deleted)
@ -1,138 +0,0 @@
// Copyright 2019 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
//! Lookups of Unicode properties using minimal perfect hashing.
use crate::perfect_hash::mph_lookup;
use crate::tables::*;
/// Look up the canonical combining class for a codepoint.
///
/// The value returned is as defined in the Unicode Character Database.
pub fn canonical_combining_class(c: char) -> u8 {
mph_lookup(
c.into(),
CANONICAL_COMBINING_CLASS_SALT,
CANONICAL_COMBINING_CLASS_KV,
u8_lookup_fk,
u8_lookup_fv,
0,
)
}
pub(crate) fn composition_table(c1: char, c2: char) -> Option<char> {
if c1 < '\u{10000}' && c2 < '\u{10000}' {
mph_lookup(
(c1 as u32) << 16 | (c2 as u32),
COMPOSITION_TABLE_SALT,
COMPOSITION_TABLE_KV,
pair_lookup_fk,
pair_lookup_fv_opt,
None,
)
} else {
composition_table_astral(c1, c2)
}
}
pub(crate) fn canonical_fully_decomposed(c: char) -> Option<&'static [char]> {
mph_lookup(
c.into(),
CANONICAL_DECOMPOSED_SALT,
CANONICAL_DECOMPOSED_KV,
pair_lookup_fk,
pair_lookup_fv_opt,
None,
)
.map(|(start, len)| &CANONICAL_DECOMPOSED_CHARS[start as usize..][..len as usize])
}
pub(crate) fn compatibility_fully_decomposed(c: char) -> Option<&'static [char]> {
mph_lookup(
c.into(),
COMPATIBILITY_DECOMPOSED_SALT,
COMPATIBILITY_DECOMPOSED_KV,
pair_lookup_fk,
pair_lookup_fv_opt,
None,
)
.map(|(start, len)| &COMPATIBILITY_DECOMPOSED_CHARS[start as usize..][..len as usize])
}
pub(crate) fn cjk_compat_variants_fully_decomposed(c: char) -> Option<&'static [char]> {
mph_lookup(
c.into(),
CJK_COMPAT_VARIANTS_DECOMPOSED_SALT,
CJK_COMPAT_VARIANTS_DECOMPOSED_KV,
pair_lookup_fk,
pair_lookup_fv_opt,
None,
)
.map(|(start, len)| &CJK_COMPAT_VARIANTS_DECOMPOSED_CHARS[start as usize..][..len as usize])
}
/// Return whether the given character is a combining mark (`General_Category=Mark`)
pub fn is_combining_mark(c: char) -> bool {
mph_lookup(
c.into(),
COMBINING_MARK_SALT,
COMBINING_MARK_KV,
bool_lookup_fk,
bool_lookup_fv,
false,
)
}
pub fn stream_safe_trailing_nonstarters(c: char) -> usize {
mph_lookup(
c.into(),
TRAILING_NONSTARTERS_SALT,
TRAILING_NONSTARTERS_KV,
u8_lookup_fk,
u8_lookup_fv,
0,
) as usize
}
/// Extract the key in a 24 bit key and 8 bit value packed in a u32.
#[inline]
fn u8_lookup_fk(kv: u32) -> u32 {
kv >> 8
}
/// Extract the value in a 24 bit key and 8 bit value packed in a u32.
#[inline]
fn u8_lookup_fv(kv: u32) -> u8 {
(kv & 0xff) as u8
}
/// Extract the key for a boolean lookup.
#[inline]
fn bool_lookup_fk(kv: u32) -> u32 {
kv
}
/// Extract the value for a boolean lookup.
#[inline]
fn bool_lookup_fv(_kv: u32) -> bool {
true
}
/// Extract the key in a pair.
#[inline]
fn pair_lookup_fk<T>(kv: (u32, T)) -> u32 {
kv.0
}
/// Extract the value in a pair, returning an option.
#[inline]
fn pair_lookup_fv_opt<T>(kv: (u32, T)) -> Option<T> {
Some(kv.1)
}
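
The u8-valued lookups above depend on the packing chosen by gen_combining_class and gen_stream_safe in scripts/unicode.py: the codepoint key in the upper 24 bits and the value in the low 8. A hedged sketch of that round trip:

```rust
// Pack a 24-bit key and an 8-bit value into one u32, matching the
// `value | (k << 8)` layout emitted by scripts/unicode.py in this diff.
fn pack(key: u32, value: u8) -> u32 {
    debug_assert!(key < (1 << 24));
    (key << 8) | value as u32
}

// Inverse, matching u8_lookup_fk / u8_lookup_fv above.
fn unpack(kv: u32) -> (u32, u8) {
    (kv >> 8, (kv & 0xff) as u8)
}
```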

third_party/rust/unicode-normalization/src/no_std_prelude.rs (vendored, deleted)
@ -1,6 +0,0 @@
#[cfg(not(feature = "std"))]
pub use alloc::{
str::Chars,
string::{String, ToString},
vec::Vec,
};

third_party/rust/unicode-normalization/src/normalize.rs (vendored, deleted)
@ -1,201 +0,0 @@
// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
//! Functions for computing canonical and compatible decompositions for Unicode characters.
use crate::lookups::{
canonical_fully_decomposed, cjk_compat_variants_fully_decomposed,
compatibility_fully_decomposed, composition_table,
};
use core::{char, ops::FnMut};
/// Compute the canonical Unicode decomposition for a character.
/// See [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/)
/// for more information.
#[inline]
pub fn decompose_canonical<F>(c: char, emit_char: F)
where
F: FnMut(char),
{
decompose(c, canonical_fully_decomposed, emit_char)
}
/// Compute the canonical or compatible Unicode decomposition for a character.
/// See [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/)
/// for more information.
#[inline]
pub fn decompose_compatible<F: FnMut(char)>(c: char, emit_char: F) {
let decompose_char =
|c| compatibility_fully_decomposed(c).or_else(|| canonical_fully_decomposed(c));
decompose(c, decompose_char, emit_char)
}
/// Compute the standard-variation decomposition for a character.
///
/// [Standardized Variation Sequences] are used instead of the standard canonical
/// decompositions, notably for CJK codepoints with singleton canonical decompositions,
/// to avoid losing information. See the
/// [Unicode Variation Sequence FAQ](http://unicode.org/faq/vs.html) and the
/// "Other Enhancements" section of the
/// [Unicode 6.3 Release Summary](https://www.unicode.org/versions/Unicode6.3.0/#Summary)
/// for more information.
#[inline]
pub fn decompose_cjk_compat_variants<F>(c: char, mut emit_char: F)
where
F: FnMut(char),
{
// 7-bit ASCII never decomposes
if c <= '\x7f' {
emit_char(c);
return;
}
// Don't perform decomposition for Hangul
if let Some(decomposed) = cjk_compat_variants_fully_decomposed(c) {
for &d in decomposed {
emit_char(d);
}
return;
}
// Finally bottom out.
emit_char(c);
}
#[inline]
fn decompose<D, F>(c: char, decompose_char: D, mut emit_char: F)
where
D: Fn(char) -> Option<&'static [char]>,
F: FnMut(char),
{
// 7-bit ASCII never decomposes
if c <= '\x7f' {
emit_char(c);
return;
}
// Perform decomposition for Hangul
if is_hangul_syllable(c) {
decompose_hangul(c, emit_char);
return;
}
if let Some(decomposed) = decompose_char(c) {
for &d in decomposed {
emit_char(d);
}
return;
}
// Finally bottom out.
emit_char(c);
}
/// Compose two characters into a single character, if possible.
/// See [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/)
/// for more information.
pub fn compose(a: char, b: char) -> Option<char> {
compose_hangul(a, b).or_else(|| composition_table(a, b))
}
// Constants from Unicode 9.0.0 Section 3.12 Conjoining Jamo Behavior
// http://www.unicode.org/versions/Unicode9.0.0/ch03.pdf#M9.32468.Heading.310.Combining.Jamo.Behavior
const S_BASE: u32 = 0xAC00;
const L_BASE: u32 = 0x1100;
const V_BASE: u32 = 0x1161;
const T_BASE: u32 = 0x11A7;
const L_COUNT: u32 = 19;
const V_COUNT: u32 = 21;
const T_COUNT: u32 = 28;
const N_COUNT: u32 = V_COUNT * T_COUNT;
const S_COUNT: u32 = L_COUNT * N_COUNT;
const S_LAST: u32 = S_BASE + S_COUNT - 1;
const L_LAST: u32 = L_BASE + L_COUNT - 1;
const V_LAST: u32 = V_BASE + V_COUNT - 1;
const T_LAST: u32 = T_BASE + T_COUNT - 1;
// Composition only occurs for `TPart`s in `U+11A8 ... U+11C2`,
// i.e. `T_BASE + 1 ... T_LAST`.
const T_FIRST: u32 = T_BASE + 1;
pub(crate) fn is_hangul_syllable(c: char) -> bool {
(c as u32) >= S_BASE && (c as u32) < (S_BASE + S_COUNT)
}
// Decompose a precomposed Hangul syllable
#[allow(unsafe_code)]
#[inline(always)]
fn decompose_hangul<F>(s: char, mut emit_char: F)
where
F: FnMut(char),
{
let s_index = s as u32 - S_BASE;
let l_index = s_index / N_COUNT;
unsafe {
emit_char(char::from_u32_unchecked(L_BASE + l_index));
let v_index = (s_index % N_COUNT) / T_COUNT;
emit_char(char::from_u32_unchecked(V_BASE + v_index));
let t_index = s_index % T_COUNT;
if t_index > 0 {
emit_char(char::from_u32_unchecked(T_BASE + t_index));
}
}
}
#[inline]
pub(crate) fn hangul_decomposition_length(s: char) -> usize {
let si = s as u32 - S_BASE;
let ti = si % T_COUNT;
if ti > 0 {
3
} else {
2
}
}
// Compose a pair of Hangul Jamo
#[allow(unsafe_code)]
#[inline(always)]
#[allow(ellipsis_inclusive_range_patterns)]
fn compose_hangul(a: char, b: char) -> Option<char> {
let (a, b) = (a as u32, b as u32);
match (a, b) {
// Compose a leading consonant and a vowel together into an LV_Syllable
(L_BASE...L_LAST, V_BASE...V_LAST) => {
let l_index = a - L_BASE;
let v_index = b - V_BASE;
let lv_index = l_index * N_COUNT + v_index * T_COUNT;
let s = S_BASE + lv_index;
Some(unsafe { char::from_u32_unchecked(s) })
}
// Compose an LV_Syllable and a trailing consonant into an LVT_Syllable
(S_BASE...S_LAST, T_FIRST...T_LAST) if (a - S_BASE) % T_COUNT == 0 => {
Some(unsafe { char::from_u32_unchecked(a + (b - T_BASE)) })
}
_ => None,
}
}
#[cfg(test)]
mod tests {
use super::compose_hangul;
// Regression test from a bugfix where we were composing an LV_Syllable with
// T_BASE directly. (We should only compose an LV_Syllable with a character
// in the range `T_BASE + 1 ... T_LAST`.)
#[test]
fn test_hangul_composition() {
assert_eq!(compose_hangul('\u{c8e0}', '\u{11a7}'), None);
}
}
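
The jamo constants above reduce Hangul composition to pure arithmetic. A worked example composing 각 (U+AC01), restating only the constants it needs:

    // Values from Unicode Section 3.12, as in the file above.
    const S_BASE: u32 = 0xAC00;
    const L_BASE: u32 = 0x1100;
    const V_BASE: u32 = 0x1161;
    const T_BASE: u32 = 0x11A7;
    const T_COUNT: u32 = 28;
    const N_COUNT: u32 = 21 * 28; // V_COUNT * T_COUNT

    fn main() {
        // ᄀ (U+1100) + ᅡ (U+1161) compose to the LV syllable 가 (U+AC00):
        let lv = S_BASE + (0x1100 - L_BASE) * N_COUNT + (0x1161 - V_BASE) * T_COUNT;
        assert_eq!(lv, 0xAC00);
        // Adding the trailing consonant ᆨ (U+11A8) yields 각 (U+AC01):
        assert_eq!(lv + (0x11A8 - T_BASE), 0xAC01);
    }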


@ -1,50 +0,0 @@
// Copyright 2019 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
//! Support for lookups based on minimal perfect hashing.
// This function is based on multiplication being fast and is "good enough". Also
// it can share some work between the unsalted and salted versions.
#[inline]
fn my_hash(key: u32, salt: u32, n: usize) -> usize {
let y = key.wrapping_add(salt).wrapping_mul(2654435769);
let y = y ^ key.wrapping_mul(0x31415926);
(((y as u64) * (n as u64)) >> 32) as usize
}
/// Do a lookup using minimal perfect hashing.
///
/// The table is stored as a sequence of "salt" values, then a sequence of
/// values that contain packed key/value pairs. The strategy is to hash twice.
/// The first hash retrieves a salt value that makes the second hash unique.
/// The hash function doesn't have to be very good, just good enough that the
/// resulting map is unique.
#[inline]
pub(crate) fn mph_lookup<KV, V, FK, FV>(
x: u32,
salt: &[u16],
kv: &[KV],
fk: FK,
fv: FV,
default: V,
) -> V
where
KV: Copy,
FK: Fn(KV) -> u32,
FV: Fn(KV) -> V,
{
let s = salt[my_hash(x, 0, salt.len())] as u32;
let key_val = kv[my_hash(x, s, salt.len())];
if x == fk(key_val) {
fv(key_val)
} else {
default
}
}
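
A degenerate sketch of `mph_lookup`: with a single-slot salt table both hash probes land on index 0, which isolates the final key-comparison/default step. It assumes `my_hash` and `mph_lookup` from above are in scope and reuses the 24-bit/8-bit packing from lookups.rs:

    fn demo() {
        let salt: &[u16] = &[1]; // one slot, so my_hash(_, _, 1) is always 0
        let kv: &[u32] = &[(0x0301u32 << 8) | 230]; // key U+0301, value 230
        let fk = |kv: u32| kv >> 8;
        let fv = |kv: u32| (kv & 0xff) as u8;
        assert_eq!(mph_lookup(0x0301, salt, kv, fk, fv, 0), 230); // key matches
        assert_eq!(mph_lookup(0x0041, salt, kv, fk, fv, 0), 0); // miss: default
    }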


@ -1,187 +0,0 @@
use crate::lookups::canonical_combining_class;
use crate::stream_safe;
use crate::tables;
use crate::UnicodeNormalization;
/// The QuickCheck algorithm can quickly determine if a text is or isn't
/// normalized without any allocations in many cases, but it has to be able to
/// return `Maybe` when a full decomposition and recomposition is necessary.
#[derive(Debug, Eq, PartialEq)]
pub enum IsNormalized {
/// The text is definitely normalized.
Yes,
/// The text is definitely not normalized.
No,
/// The text may be normalized.
Maybe,
}
// https://unicode.org/reports/tr15/#Detecting_Normalization_Forms
#[inline]
fn quick_check<F, I>(s: I, is_allowed: F, stream_safe: bool) -> IsNormalized
where
I: Iterator<Item = char>,
F: Fn(char) -> IsNormalized,
{
let mut last_cc = 0u8;
let mut nonstarter_count = 0;
let mut result = IsNormalized::Yes;
for ch in s {
// For ASCII we know it's always allowed and a starter
if ch <= '\x7f' {
last_cc = 0;
nonstarter_count = 0;
continue;
}
// Otherwise, look up the combining class and QC property
let cc = canonical_combining_class(ch);
if last_cc > cc && cc != 0 {
return IsNormalized::No;
}
match is_allowed(ch) {
IsNormalized::Yes => (),
IsNormalized::No => return IsNormalized::No,
IsNormalized::Maybe => {
result = IsNormalized::Maybe;
}
}
if stream_safe {
let decomp = stream_safe::classify_nonstarters(ch);
// If we're above `MAX_NONSTARTERS`, we're definitely *not*
// stream-safe normalized.
if nonstarter_count + decomp.leading_nonstarters > stream_safe::MAX_NONSTARTERS {
return IsNormalized::No;
}
if decomp.leading_nonstarters == decomp.decomposition_len {
nonstarter_count += decomp.decomposition_len;
} else {
nonstarter_count = decomp.trailing_nonstarters;
}
}
last_cc = cc;
}
result
}
/// Quickly check if a string is in NFC, potentially returning
/// `IsNormalized::Maybe` if further checks are necessary. In this case a check
/// like `s.chars().nfc().eq(s.chars())` should suffice.
#[inline]
pub fn is_nfc_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized {
quick_check(s, tables::qc_nfc, false)
}
/// Quickly check if a string is in NFKC.
#[inline]
pub fn is_nfkc_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized {
quick_check(s, tables::qc_nfkc, false)
}
/// Quickly check if a string is in NFD.
#[inline]
pub fn is_nfd_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized {
quick_check(s, tables::qc_nfd, false)
}
/// Quickly check if a string is in NFKD.
#[inline]
pub fn is_nfkd_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized {
quick_check(s, tables::qc_nfkd, false)
}
/// Quickly check if a string is Stream-Safe NFC.
#[inline]
pub fn is_nfc_stream_safe_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized {
quick_check(s, tables::qc_nfc, true)
}
/// Quickly check if a string is Stream-Safe NFD.
#[inline]
pub fn is_nfd_stream_safe_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized {
quick_check(s, tables::qc_nfd, true)
}
/// Authoritatively check if a string is in NFC.
#[inline]
pub fn is_nfc(s: &str) -> bool {
match is_nfc_quick(s.chars()) {
IsNormalized::Yes => true,
IsNormalized::No => false,
IsNormalized::Maybe => s.chars().eq(s.chars().nfc()),
}
}
/// Authoritatively check if a string is in NFKC.
#[inline]
pub fn is_nfkc(s: &str) -> bool {
match is_nfkc_quick(s.chars()) {
IsNormalized::Yes => true,
IsNormalized::No => false,
IsNormalized::Maybe => s.chars().eq(s.chars().nfkc()),
}
}
/// Authoritatively check if a string is in NFD.
#[inline]
pub fn is_nfd(s: &str) -> bool {
match is_nfd_quick(s.chars()) {
IsNormalized::Yes => true,
IsNormalized::No => false,
IsNormalized::Maybe => s.chars().eq(s.chars().nfd()),
}
}
/// Authoritatively check if a string is in NFKD.
#[inline]
pub fn is_nfkd(s: &str) -> bool {
match is_nfkd_quick(s.chars()) {
IsNormalized::Yes => true,
IsNormalized::No => false,
IsNormalized::Maybe => s.chars().eq(s.chars().nfkd()),
}
}
/// Authoritatively check if a string is Stream-Safe NFC.
#[inline]
pub fn is_nfc_stream_safe(s: &str) -> bool {
match is_nfc_stream_safe_quick(s.chars()) {
IsNormalized::Yes => true,
IsNormalized::No => false,
IsNormalized::Maybe => s.chars().eq(s.chars().stream_safe().nfc()),
}
}
/// Authoritatively check if a string is Stream-Safe NFD.
#[inline]
pub fn is_nfd_stream_safe(s: &str) -> bool {
match is_nfd_stream_safe_quick(s.chars()) {
IsNormalized::Yes => true,
IsNormalized::No => false,
IsNormalized::Maybe => s.chars().eq(s.chars().stream_safe().nfd()),
}
}
#[cfg(test)]
mod tests {
use super::{is_nfc_stream_safe_quick, is_nfd_stream_safe_quick, IsNormalized};
#[test]
fn test_stream_safe_nfd() {
let okay = "Da\u{031b}\u{0316}\u{0317}\u{0318}\u{0319}\u{031c}\u{031d}\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{031a}ngerzone";
assert_eq!(is_nfd_stream_safe_quick(okay.chars()), IsNormalized::Yes);
let too_much = "Da\u{031b}\u{0316}\u{0317}\u{0318}\u{0319}\u{031c}\u{031d}\u{031e}\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{031a}ngerzone";
assert_eq!(is_nfd_stream_safe_quick(too_much.chars()), IsNormalized::No);
}
#[test]
fn test_stream_safe_nfc() {
let okay = "ok\u{e0}\u{031b}\u{0316}\u{0317}\u{0318}\u{0319}\u{031c}\u{031d}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{031a}y";
assert_eq!(is_nfc_stream_safe_quick(okay.chars()), IsNormalized::Maybe);
let too_much = "not ok\u{e0}\u{031b}\u{0316}\u{0317}\u{0318}\u{0319}\u{031c}\u{031d}\u{031e}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{031a}y";
assert_eq!(is_nfc_stream_safe_quick(too_much.chars()), IsNormalized::No);
}
}
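
A usage sketch of the quick-check entry points, assuming the crate-root re-exports of this module's API:

    use unicode_normalization::{is_nfc, is_nfc_quick, IsNormalized};

    fn main() {
        assert_eq!(is_nfc_quick("abc".chars()), IsNormalized::Yes);
        // U+0301 has NFC_Quick_Check=Maybe, so the cheap pass cannot decide:
        assert_eq!(is_nfc_quick("e\u{301}".chars()), IsNormalized::Maybe);
        // The authoritative check falls back to normalize-and-compare:
        assert!(!is_nfc("e\u{301}"));
    }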


@ -1,154 +0,0 @@
// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use crate::decompose::Decompositions;
use core::fmt::{self, Write};
use tinyvec::TinyVec;
#[derive(Clone)]
enum RecompositionState {
Composing,
Purging(usize),
Finished(usize),
}
/// External iterator over the characters of a recomposed string.
#[derive(Clone)]
pub struct Recompositions<I> {
iter: Decompositions<I>,
state: RecompositionState,
buffer: TinyVec<[char; 4]>,
composee: Option<char>,
last_ccc: Option<u8>,
}
#[inline]
pub fn new_canonical<I: Iterator<Item = char>>(iter: I) -> Recompositions<I> {
Recompositions {
iter: super::decompose::new_canonical(iter),
state: self::RecompositionState::Composing,
buffer: TinyVec::new(),
composee: None,
last_ccc: None,
}
}
#[inline]
pub fn new_compatible<I: Iterator<Item = char>>(iter: I) -> Recompositions<I> {
Recompositions {
iter: super::decompose::new_compatible(iter),
state: self::RecompositionState::Composing,
buffer: TinyVec::new(),
composee: None,
last_ccc: None,
}
}
impl<I: Iterator<Item = char>> Iterator for Recompositions<I> {
type Item = char;
#[inline]
fn next(&mut self) -> Option<char> {
use self::RecompositionState::*;
loop {
match self.state {
Composing => {
for ch in self.iter.by_ref() {
let ch_class = super::char::canonical_combining_class(ch);
let k = match self.composee {
None => {
if ch_class != 0 {
return Some(ch);
}
self.composee = Some(ch);
continue;
}
Some(k) => k,
};
match self.last_ccc {
None => match super::char::compose(k, ch) {
Some(r) => {
self.composee = Some(r);
continue;
}
None => {
if ch_class == 0 {
self.composee = Some(ch);
return Some(k);
}
self.buffer.push(ch);
self.last_ccc = Some(ch_class);
}
},
Some(l_class) => {
if l_class >= ch_class {
// `ch` is blocked from `composee`
if ch_class == 0 {
self.composee = Some(ch);
self.last_ccc = None;
self.state = Purging(0);
return Some(k);
}
self.buffer.push(ch);
self.last_ccc = Some(ch_class);
continue;
}
match super::char::compose(k, ch) {
Some(r) => {
self.composee = Some(r);
continue;
}
None => {
self.buffer.push(ch);
self.last_ccc = Some(ch_class);
}
}
}
}
}
self.state = Finished(0);
if self.composee.is_some() {
return self.composee.take();
}
}
Purging(next) => match self.buffer.get(next).cloned() {
None => {
self.buffer.clear();
self.state = Composing;
}
s => {
self.state = Purging(next + 1);
return s;
}
},
Finished(next) => match self.buffer.get(next).cloned() {
None => {
self.buffer.clear();
return self.composee.take();
}
s => {
self.state = Finished(next + 1);
return s;
}
},
}
}
}
}
impl<I: Iterator<Item = char> + Clone> fmt::Display for Recompositions<I> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
for c in self.clone() {
f.write_char(c)?;
}
Ok(())
}
}
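
The composee/buffer state machine above is what reorders blocked combining marks; its observable effect matches this crate's own NFC test for a blocked dot (sketch via the nfc() adapter, which wraps Recompositions):

    use unicode_normalization::UnicodeNormalization;

    fn main() {
        // U+1E0B (d with dot above) + U+0323 (combining dot below, ccc 220)
        // recomposes as U+1E0D (d with dot below) + U+0307 (dot above).
        let s = "\u{1e0b}\u{323}";
        assert_eq!(s.chars().nfc().collect::<String>(), "\u{1e0d}\u{307}");
    }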


@ -1,61 +0,0 @@
// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use core::fmt::{self, Write};
use tinyvec::ArrayVec;
/// External iterator over a string's characters, with CJK compatibility variants replaced.
#[derive(Clone)]
pub struct Replacements<I> {
iter: I,
// At this time, the longest replacement sequence has length 2, so we just
// need buffer space for 1 codepoint.
buffer: Option<char>,
}
#[inline]
pub fn new_cjk_compat_variants<I: Iterator<Item = char>>(iter: I) -> Replacements<I> {
Replacements { iter, buffer: None }
}
impl<I: Iterator<Item = char>> Iterator for Replacements<I> {
type Item = char;
#[inline]
fn next(&mut self) -> Option<char> {
if let Some(c) = self.buffer.take() {
return Some(c);
}
match self.iter.next() {
Some(ch) => {
// At this time, the longest replacement sequence has length 2.
let mut buffer = ArrayVec::<[char; 2]>::new();
super::char::decompose_cjk_compat_variants(ch, |d| buffer.push(d));
self.buffer = buffer.get(1).copied();
Some(buffer[0])
}
None => None,
}
}
fn size_hint(&self) -> (usize, Option<usize>) {
let (lower, _) = self.iter.size_hint();
(lower, None)
}
}
impl<I: Iterator<Item = char> + Clone> fmt::Display for Replacements<I> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
for c in self.clone() {
f.write_char(c)?;
}
Ok(())
}
}
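
A short sketch of the adapter via the trait method; which variation selector a given compatibility ideograph maps to is data-driven, so this only checks the shape of the output:

    use unicode_normalization::UnicodeNormalization;

    fn main() {
        // ASCII passes through untouched.
        let same: String = "abc".chars().cjk_compat_variants().collect();
        assert_eq!(same, "abc");
        // A CJK compatibility ideograph is expected to come out as its base
        // ideograph plus a variation selector, preserving a distinction that
        // the singleton canonical decomposition would erase.
        let replaced: String = "\u{FA10}".chars().cjk_compat_variants().collect();
        assert_ne!(replaced, "\u{FA10}");
        assert_eq!(replaced.chars().count(), 2); // base + variation selector
    }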


@ -1,170 +0,0 @@
use crate::lookups::{
canonical_combining_class, canonical_fully_decomposed, compatibility_fully_decomposed,
stream_safe_trailing_nonstarters,
};
use crate::normalize::{hangul_decomposition_length, is_hangul_syllable};
use crate::tables::stream_safe_leading_nonstarters;
pub(crate) const MAX_NONSTARTERS: usize = 30;
const COMBINING_GRAPHEME_JOINER: char = '\u{034F}';
/// UAX15-D4: This iterator keeps track of how many non-starters there have been
/// since the last starter in *NFKD* and will emit a Combining Grapheme Joiner
/// (U+034F) if the count exceeds 30.
pub struct StreamSafe<I> {
iter: I,
nonstarter_count: usize,
buffer: Option<char>,
}
impl<I> StreamSafe<I> {
pub(crate) fn new(iter: I) -> Self {
Self {
iter,
nonstarter_count: 0,
buffer: None,
}
}
}
impl<I: Iterator<Item = char>> Iterator for StreamSafe<I> {
type Item = char;
#[inline]
fn next(&mut self) -> Option<char> {
let next_ch = match self.buffer.take().or_else(|| self.iter.next()) {
None => return None,
Some(c) => c,
};
let d = classify_nonstarters(next_ch);
if self.nonstarter_count + d.leading_nonstarters > MAX_NONSTARTERS {
// Since we're emitting a CGJ, the suffix of the emitted string in NFKD has no trailing
// nonstarters, so we can reset the counter to zero. Put `next_ch` back into the
// iterator (via `self.buffer`), and we'll reclassify it next iteration.
self.nonstarter_count = 0;
self.buffer = Some(next_ch);
return Some(COMBINING_GRAPHEME_JOINER);
}
// Is the character all nonstarters in NFKD? If so, increment our counter of contiguous
// nonstarters in NFKD.
if d.leading_nonstarters == d.decomposition_len {
self.nonstarter_count += d.decomposition_len;
}
// Otherwise, reset the counter to the decomposition's number of trailing nonstarters.
else {
self.nonstarter_count = d.trailing_nonstarters;
}
Some(next_ch)
}
}
#[derive(Debug)]
pub(crate) struct Decomposition {
pub(crate) leading_nonstarters: usize,
pub(crate) trailing_nonstarters: usize,
pub(crate) decomposition_len: usize,
}
#[inline]
pub(crate) fn classify_nonstarters(c: char) -> Decomposition {
// As usual, fast path for ASCII (which is always a starter)
if c <= '\x7f' {
return Decomposition {
leading_nonstarters: 0,
trailing_nonstarters: 0,
decomposition_len: 1,
};
}
// Next, special case Hangul, since it's not handled by our tables.
if is_hangul_syllable(c) {
return Decomposition {
leading_nonstarters: 0,
trailing_nonstarters: 0,
decomposition_len: hangul_decomposition_length(c),
};
}
let decomp = compatibility_fully_decomposed(c).or_else(|| canonical_fully_decomposed(c));
match decomp {
Some(decomp) => Decomposition {
leading_nonstarters: stream_safe_leading_nonstarters(c),
trailing_nonstarters: stream_safe_trailing_nonstarters(c),
decomposition_len: decomp.len(),
},
None => {
let is_nonstarter = canonical_combining_class(c) != 0;
let nonstarter = if is_nonstarter { 1 } else { 0 };
Decomposition {
leading_nonstarters: nonstarter,
trailing_nonstarters: nonstarter,
decomposition_len: 1,
}
}
}
}
#[cfg(test)]
mod tests {
use super::{classify_nonstarters, StreamSafe};
use crate::lookups::canonical_combining_class;
use crate::normalize::decompose_compatible;
#[cfg(not(feature = "std"))]
use crate::no_std_prelude::*;
use core::char;
fn stream_safe(s: &str) -> String {
StreamSafe::new(s.chars()).collect()
}
#[test]
fn test_simple() {
let technically_okay = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}ngerzone";
assert_eq!(stream_safe(technically_okay), technically_okay);
let too_much = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{032e}ngerzone";
let fixed_it = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{034f}\u{032e}ngerzone";
assert_eq!(stream_safe(too_much), fixed_it);
let woah_nelly = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{032e}\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{032e}ngerzone";
let its_cool = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{034f}\u{032e}\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{034f}\u{031d}\u{032e}ngerzone";
assert_eq!(stream_safe(woah_nelly), its_cool);
}
#[test]
fn test_all_nonstarters() {
let s = "\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}";
let expected = "\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{034F}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}";
assert_eq!(stream_safe(s), expected);
}
#[test]
fn test_classify_nonstarters() {
// Highest character in the `compat_fully_decomp` table is 2FA1D
for ch in 0..0x2FA1E {
let ch = match char::from_u32(ch) {
Some(c) => c,
None => continue,
};
let c = classify_nonstarters(ch);
let mut s = Vec::new();
decompose_compatible(ch, |c| s.push(c));
assert_eq!(s.len(), c.decomposition_len);
let num_leading = s
.iter()
.take_while(|&c| canonical_combining_class(*c) != 0)
.count();
let num_trailing = s
.iter()
.rev()
.take_while(|&c| canonical_combining_class(*c) != 0)
.count();
assert_eq!(num_leading, c.leading_nonstarters);
assert_eq!(num_trailing, c.trailing_nonstarters);
}
}
}
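
The CGJ insertion is easiest to see end to end; this mirrors the test_all_nonstarters case above (via the stream_safe() trait adapter):

    use unicode_normalization::UnicodeNormalization;

    fn main() {
        // 40 consecutive U+0300 nonstarters: a CGJ is inserted after 30.
        let s: String = std::iter::repeat('\u{0300}').take(40).collect();
        let safe: String = s.chars().stream_safe().collect();
        assert_eq!(safe.chars().count(), 41); // exactly one U+034F added
        assert_eq!(safe.chars().nth(30), Some('\u{034F}'));
    }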

(The diff for this file is not shown because of its large size.)


@ -1,130 +0,0 @@
// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use super::char::is_combining_mark;
use super::UnicodeNormalization;
use core::char;
#[cfg(not(feature = "std"))]
use crate::no_std_prelude::*;
#[test]
fn test_nfd() {
macro_rules! t {
($input: expr, $expected: expr) => {
assert_eq!($input.nfd().to_string(), $expected);
// A dummy iterator that is not std::str::Chars directly;
// note that the identity `map(|c| c)` is used to ensure a `Clone` implementation
assert_eq!(
$input.chars().map(|c| c).nfd().collect::<String>(),
$expected
);
};
}
t!("abc", "abc");
t!("\u{1e0b}\u{1c4}", "d\u{307}\u{1c4}");
t!("\u{2026}", "\u{2026}");
t!("\u{2126}", "\u{3a9}");
t!("\u{1e0b}\u{323}", "d\u{323}\u{307}");
t!("\u{1e0d}\u{307}", "d\u{323}\u{307}");
t!("a\u{301}", "a\u{301}");
t!("\u{301}a", "\u{301}a");
t!("\u{d4db}", "\u{1111}\u{1171}\u{11b6}");
t!("\u{ac1c}", "\u{1100}\u{1162}");
}
#[test]
fn test_nfkd() {
macro_rules! t {
($input: expr, $expected: expr) => {
assert_eq!($input.nfkd().to_string(), $expected);
};
}
t!("abc", "abc");
t!("\u{1e0b}\u{1c4}", "d\u{307}DZ\u{30c}");
t!("\u{2026}", "...");
t!("\u{2126}", "\u{3a9}");
t!("\u{1e0b}\u{323}", "d\u{323}\u{307}");
t!("\u{1e0d}\u{307}", "d\u{323}\u{307}");
t!("a\u{301}", "a\u{301}");
t!("\u{301}a", "\u{301}a");
t!("\u{d4db}", "\u{1111}\u{1171}\u{11b6}");
t!("\u{ac1c}", "\u{1100}\u{1162}");
}
#[test]
fn test_nfc() {
macro_rules! t {
($input: expr, $expected: expr) => {
assert_eq!($input.nfc().to_string(), $expected);
};
}
t!("abc", "abc");
t!("\u{1e0b}\u{1c4}", "\u{1e0b}\u{1c4}");
t!("\u{2026}", "\u{2026}");
t!("\u{2126}", "\u{3a9}");
t!("\u{1e0b}\u{323}", "\u{1e0d}\u{307}");
t!("\u{1e0d}\u{307}", "\u{1e0d}\u{307}");
t!("a\u{301}", "\u{e1}");
t!("\u{301}a", "\u{301}a");
t!("\u{d4db}", "\u{d4db}");
t!("\u{ac1c}", "\u{ac1c}");
t!(
"a\u{300}\u{305}\u{315}\u{5ae}b",
"\u{e0}\u{5ae}\u{305}\u{315}b"
);
}
#[test]
fn test_nfkc() {
macro_rules! t {
($input: expr, $expected: expr) => {
assert_eq!($input.nfkc().to_string(), $expected);
};
}
t!("abc", "abc");
t!("\u{1e0b}\u{1c4}", "\u{1e0b}D\u{17d}");
t!("\u{2026}", "...");
t!("\u{2126}", "\u{3a9}");
t!("\u{1e0b}\u{323}", "\u{1e0d}\u{307}");
t!("\u{1e0d}\u{307}", "\u{1e0d}\u{307}");
t!("a\u{301}", "\u{e1}");
t!("\u{301}a", "\u{301}a");
t!("\u{d4db}", "\u{d4db}");
t!("\u{ac1c}", "\u{ac1c}");
t!(
"a\u{300}\u{305}\u{315}\u{5ae}b",
"\u{e0}\u{5ae}\u{305}\u{315}b"
);
}
#[test]
fn test_normalize_char() {
assert_eq!('\u{2126}'.nfd().to_string(), "\u{3a9}")
}
#[test]
fn test_is_combining_mark_ascii() {
for cp in 0..0x7f {
assert!(!is_combining_mark(char::from_u32(cp).unwrap()));
}
}
#[test]
fn test_is_combining_mark_misc() {
// https://github.com/unicode-rs/unicode-normalization/issues/16
// U+11C3A BHAIKSUKI VOWEL SIGN O
// Category: Mark, Nonspacing [Mn]
assert!(is_combining_mark('\u{11C3A}'));
// U+11C3F BHAIKSUKI SIGN VIRAMA
// Category: Mark, Nonspacing [Mn]
assert!(is_combining_mark('\u{11C3F}'));
}

2
third_party/rust/url/.cargo-checksum.json (vendored)

@ -1 +1 @@
{"files":{"Cargo.toml":"33e77fef5d9e5592daeff71b551f983f19ddd9d31a7c002e642a3c40d8b08123","LICENSE-APACHE":"a60eea817514531668d7e00765731449fe14d059d3249e0bc93b36de45f759f2","LICENSE-MIT":"76e972ac0f4ddb116e86e10100132a783931a596e7b9872eaa31be15cd4d751d","README.md":"71b01ec6f2f4ce47235ee430ba0c41afac563403a9dbcda23a584c3e915395ac","src/host.rs":"5e25476aaec0153b64d35b53940a72a1ec58e29a0e1fde36944f52eeb945c5f6","src/lib.rs":"e017abe9c33881a96b5daafeee65b1814b4418f5fb0c96d0aaea65a14c9292c9","src/origin.rs":"19a4b451e8615bfef7239d2fc719c489398fe5044edb0df7c84b54eef4ceba1b","src/parser.rs":"5427cd15caedc8e3c1418cc576a7263e96df26a51ad3ce88f8c32d3eb7d6dd2c","src/path_segments.rs":"29db87b6902da4ab1ae925b3874afdeff42b8ddfb46356af6a83b86f34e03b14","src/quirks.rs":"c9311e3dd6f701fb4b8e438b3e3960ff6f8c78a67ae763f3640b178f15c60e45","src/slicing.rs":"3b1aaad36ba7e89f50c90d1ceddda1f8ba52a364c153541ac5c9ce54dacb6724","tests/expected_failures.txt":"f222a5e2f7bdfbd724cf7fb8e35e71a0fe1f3ac9c2771919d7ff5ba9e51c5769","tests/setters_tests.json":"a3a4cbd7b798bc2c4d9656dc50be7397a5a5ed1f0b52daa1da1ad654d38c1dcd","tests/unit.rs":"1abe0a410c5078e1ad9de8c93f2f2ae660ddb47b7efaac9047e952457b068ded","tests/urltestdata.json":"58d67bea710d5f46324fe6841df5fd82090fe4ec2d882bc0fc7c1784d4771884","tests/wpt.rs":"6302c008cde6e7c0df8626701cc825731363722d02e35804bb370c385b455145"},"package":"31e6302e3bb753d46e83516cae55ae196fc0c309407cf11ab35cc51a4c2a4633"}
{"files":{"Cargo.toml":"4108358208f628a0e61af3ebe88aedbe585983c518a456644df398012781f136","LICENSE-APACHE":"a60eea817514531668d7e00765731449fe14d059d3249e0bc93b36de45f759f2","LICENSE-MIT":"76e972ac0f4ddb116e86e10100132a783931a596e7b9872eaa31be15cd4d751d","README.md":"6111161470aa4d5f2e3806936df0c874b8eca5f8c8cd2d71a60eb6c2cbb776ab","src/host.rs":"9de249e8af8fcd0caf673b37a66ba070dfa1b231ee06a981526a9f863c3acf13","src/lib.rs":"4b7ec6a4f2ee7a63ac332f4609c4f9f648861e7ea0967b80efdf27c52a07f154","src/origin.rs":"19a4b451e8615bfef7239d2fc719c489398fe5044edb0df7c84b54eef4ceba1b","src/parser.rs":"ca317fdf927628351991c73437aa91d36e26637574e6551200125e32f46e60cd","src/path_segments.rs":"29db87b6902da4ab1ae925b3874afdeff42b8ddfb46356af6a83b86f34e03b14","src/quirks.rs":"79818bd168b138e8edd30011033c1f6defb847fe96f8a57381cf9251c27e866b","src/slicing.rs":"3b1aaad36ba7e89f50c90d1ceddda1f8ba52a364c153541ac5c9ce54dacb6724","tests/expected_failures.txt":"fc4f619316f1fb117b01d8089c04b925b8db0652f46b8534a87e115c5544881b","tests/setters_tests.json":"a3a4cbd7b798bc2c4d9656dc50be7397a5a5ed1f0b52daa1da1ad654d38c1dcd","tests/unit.rs":"c895675581e737ad8e1536786f80385df0426495074ee6cc011830f45f16f6f7","tests/urltestdata.json":"58d67bea710d5f46324fe6841df5fd82090fe4ec2d882bc0fc7c1784d4771884","tests/wpt.rs":"8781251116a9de8169327ed40a0237ac6ff2f84e3d579d6fb6d7353362f9a48a"},"package":"f7c25da092f0a868cdf09e8674cd3b7ef3a7d92a24253e663a2fb85e2496de56"}

9
third_party/rust/url/Cargo.toml (vendored)

@ -11,9 +11,9 @@
[package]
edition = "2018"
rust-version = "1.56"
rust-version = "1.67"
name = "url"
version = "2.5.0"
version = "2.5.1"
authors = ["The rust-url developers"]
include = [
"src/**/*",
@ -57,7 +57,7 @@ harness = false
version = "1.2.1"
[dependencies.idna]
version = "0.5.0"
version = "1.0.0"
[dependencies.percent-encoding]
version = "2.3.1"
@ -81,3 +81,6 @@ version = "1.0"
debugger_visualizer = []
default = []
expose_internals = []
[target."cfg(all(target_arch = \"wasm32\", target_os = \"unknown\"))".dev-dependencies.wasm-bindgen-test]
version = "0.3"

4
third_party/rust/url/README.md (vendored)

@ -9,6 +9,6 @@ rust-url
URL library for Rust, based on the [URL Standard](https://url.spec.whatwg.org/).
[Documentation](https://docs.rs/url/)
[Documentation](https://docs.rs/url)
Please see [UPGRADING.md](https://github.com/servo/rust-url/blob/master/UPGRADING.md) if you are upgrading from a previous version.
Please see [UPGRADING.md](https://github.com/servo/rust-url/blob/main/UPGRADING.md) if you are upgrading from a previous version.

35
third_party/rust/url/src/host.rs (vendored)

@ -6,6 +6,7 @@
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use std::borrow::Cow;
use std::cmp;
use std::fmt::{self, Formatter};
use std::net::{Ipv4Addr, Ipv6Addr};
@ -81,7 +82,7 @@ impl Host<String> {
}
return parse_ipv6addr(&input[1..input.len() - 1]).map(Host::Ipv6);
}
let domain = percent_decode(input.as_bytes()).decode_utf8_lossy();
let domain: Cow<'_, [u8]> = percent_decode(input.as_bytes()).into();
let domain = Self::domain_to_ascii(&domain)?;
@ -89,35 +90,11 @@ impl Host<String> {
return Err(ParseError::EmptyHost);
}
let is_invalid_domain_char = |c| {
matches!(
c,
'\0'..='\u{001F}'
| ' '
| '#'
| '%'
| '/'
| ':'
| '<'
| '>'
| '?'
| '@'
| '['
| '\\'
| ']'
| '^'
| '\u{007F}'
| '|'
)
};
if domain.find(is_invalid_domain_char).is_some() {
Err(ParseError::InvalidDomainCharacter)
} else if ends_in_a_number(&domain) {
if ends_in_a_number(&domain) {
let address = parse_ipv4addr(&domain)?;
Ok(Host::Ipv4(address))
} else {
Ok(Host::Domain(domain))
Ok(Host::Domain(domain.to_string()))
}
}
@ -162,8 +139,8 @@ impl Host<String> {
}
/// convert domain with idna
fn domain_to_ascii(domain: &str) -> Result<String, ParseError> {
idna::domain_to_ascii(domain).map_err(Into::into)
fn domain_to_ascii(domain: &[u8]) -> Result<Cow<'_, str>, ParseError> {
idna::domain_to_ascii_cow(domain, idna::AsciiDenyList::URL).map_err(Into::into)
}
}

42
third_party/rust/url/src/lib.rs (vendored)

@ -134,7 +134,7 @@ url = { version = "2", features = ["debugger_visualizer"] }
*/
#![doc(html_root_url = "https://docs.rs/url/2.5.0")]
#![doc(html_root_url = "https://docs.rs/url/2.5.1")]
#![cfg_attr(
feature = "debugger_visualizer",
debugger_visualizer(natvis_file = "../../debug_metadata/url.natvis")
@ -146,15 +146,20 @@ pub use form_urlencoded;
extern crate serde;
use crate::host::HostInternal;
use crate::parser::{to_u32, Context, Parser, SchemeType, PATH_SEGMENT, USERINFO};
use crate::parser::{
to_u32, Context, Parser, SchemeType, PATH_SEGMENT, SPECIAL_PATH_SEGMENT, USERINFO,
};
use percent_encoding::{percent_decode, percent_encode, utf8_percent_encode};
use std::borrow::Borrow;
use std::cmp;
use std::fmt::{self, Write};
use std::hash;
#[cfg(any(unix, windows, target_os = "redox", target_os = "wasi"))]
use std::io;
use std::mem;
use std::net::{IpAddr, SocketAddr, ToSocketAddrs};
use std::net::IpAddr;
#[cfg(any(unix, windows, target_os = "redox", target_os = "wasi"))]
use std::net::{SocketAddr, ToSocketAddrs};
use std::ops::{Range, RangeFrom, RangeTo};
use std::path::{Path, PathBuf};
use std::str;
@ -214,6 +219,9 @@ pub struct ParseOptions<'a> {
impl<'a> ParseOptions<'a> {
/// Change the base URL
///
/// See the notes of [`Url::join`] for more details about how this base is considered
/// when parsing.
pub fn base_url(mut self, new: Option<&'a Url>) -> Self {
self.base_url = new;
self
@ -365,9 +373,14 @@ impl Url {
///
/// The inverse of this is [`make_relative`].
///
/// Note: a trailing slash is significant.
/// # Notes
///
/// - A trailing slash is significant.
/// Without it, the last path component is considered to be a “file” name
/// to be removed to get at the “directory” that is used as the base:
/// to be removed to get at the “directory” that is used as the base.
/// - A [scheme relative special URL](https://url.spec.whatwg.org/#scheme-relative-special-url-string)
/// as input replaces everything in the base URL after the scheme.
/// - An absolute URL (with a scheme) as input replaces the whole base URL (even the scheme).
///
/// # Examples
///
@ -375,14 +388,27 @@ impl Url {
/// use url::Url;
/// # use url::ParseError;
///
/// // Base without a trailing slash
/// # fn run() -> Result<(), ParseError> {
/// let base = Url::parse("https://example.net/a/b.html")?;
/// let url = base.join("c.png")?;
/// assert_eq!(url.as_str(), "https://example.net/a/c.png"); // Not /a/b.html/c.png
///
/// // Base with a trailing slash
/// let base = Url::parse("https://example.net/a/b/")?;
/// let url = base.join("c.png")?;
/// assert_eq!(url.as_str(), "https://example.net/a/b/c.png");
///
/// // Input as scheme relative special URL
/// let base = Url::parse("https://alice.com/a")?;
/// let url = base.join("//eve.com/b")?;
/// assert_eq!(url.as_str(), "https://eve.com/b");
///
/// // Input as absolute URL
/// let base = Url::parse("https://alice.com/a")?;
/// let url = base.join("http://eve.com/b")?;
/// assert_eq!(url.as_str(), "http://eve.com/b"); // http instead of https
/// # Ok(())
/// # }
/// # run().unwrap();
@ -1250,6 +1276,7 @@ impl Url {
/// })
/// }
/// ```
#[cfg(any(unix, windows, target_os = "redox", target_os = "wasi"))]
pub fn socket_addrs(
&self,
default_port_number: impl Fn() -> Option<u16>,
@ -1524,7 +1551,8 @@ impl Url {
}
}
/// Change this URL's query string.
/// Change this URL's query string. If `query` is `None`, this URL's
/// query string will be cleared.
///
/// # Examples
///
@ -2816,7 +2844,7 @@ fn path_to_file_url_segments(
serialization.push('/');
serialization.extend(percent_encode(
component.as_os_str().as_bytes(),
PATH_SEGMENT,
SPECIAL_PATH_SEGMENT,
));
}
if empty {

7
third_party/rust/url/src/parser.rs (vendored)

@ -94,15 +94,18 @@ impl From<::idna::Errors> for ParseError {
}
macro_rules! syntax_violation_enum {
($($name: ident => $description: expr,)+) => {
($($name: ident => $description: literal,)+) => {
/// Non-fatal syntax violations that can occur during parsing.
///
/// This may be extended in the future so exhaustive matching is
/// discouraged with an unused variant.
/// forbidden.
#[derive(PartialEq, Eq, Clone, Copy, Debug)]
#[non_exhaustive]
pub enum SyntaxViolation {
$(
/// ```text
#[doc = $description]
/// ```
$name,
)+
}

(Some files were not shown because too many files changed in this diff.)