Mirror of https://github.com/mozilla/gecko-dev.git

Bug 1889536 - Vendor idna 1.0.2 and icu_normalizer by updating the url crate. r=glandium,supply-chain-reviewers

Differential Revision: https://phabricator.services.mozilla.com/D206578

Parent: 1ac67d31a5
Commit: ce58d7f51e
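For context, a minimal sketch (not part of this patch) of the idna 1.x entry points that the updated url crate builds on; it assumes idna's public `domain_to_ascii`/`domain_to_unicode` functions, which the 1.0 rewrite keeps while replacing the internals with icu_normalizer and icu_properties:

```rust
// Hypothetical usage sketch: exercising the idna 1.x public API that
// url 2.5.1 depends on. UTS 46 processing is now backed by ICU4X crates
// instead of unicode-normalization and unicode-bidi.
fn main() {
    // ToASCII: Unicode domain -> Punycode-encoded ASCII form.
    let ascii = idna::domain_to_ascii("bücher.example").unwrap();
    assert_eq!(ascii, "xn--bcher-kva.example");

    // ToUnicode: ASCII form back to Unicode; errors are reported separately.
    let (unicode, result) = idna::domain_to_unicode("xn--bcher-kva.example");
    assert!(result.is_ok());
    assert_eq!(unicode, "bücher.example");
}
```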
@@ -2864,6 +2864,30 @@ version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fdc8ff3388f852bede6b579ad4e978ab004f139284d7b28715f773507b946f6e"

[[package]]
name = "icu_normalizer"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "19ce3e0da2ec68599d193c93d088142efd7f9c5d6fc9b803774855747dc6a84f"
dependencies = [
 "displaydoc",
 "icu_collections",
 "icu_normalizer_data",
 "icu_properties",
 "icu_provider",
 "smallvec",
 "utf16_iter",
 "utf8_iter",
 "write16",
 "zerovec",
]

[[package]]
name = "icu_normalizer_data"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f8cafbf7aa791e9b22bec55a167906f9e1215fd475cd22adfcf660e03e989516"

[[package]]
name = "icu_properties"
version = "1.5.0"

@@ -2961,12 +2985,14 @@ checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39"

[[package]]
name = "idna"
version = "0.5.0"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "634d9b1461af396cad843f47fdba5597a4f9e6ddd4bfb6ff5d85028c25cb12f6"
checksum = "bd69211b9b519e98303c015e21a007e293db403b6c85b9b124e133d25e242cdd"
dependencies = [
 "unicode-bidi",
 "unicode-normalization",
 "icu_normalizer",
 "icu_properties",
 "smallvec",
 "utf8_iter",
]

[[package]]

@@ -5857,13 +5883,6 @@ dependencies = [
 "zerovec",
]

[[package]]
name = "tinyvec"
version = "1.999.999"
dependencies = [
 "smallvec",
]

[[package]]
name = "to_shmem"
version = "0.0.1"

@@ -6115,15 +6134,6 @@ version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "84a22b9f218b40614adcb3f4ff08b703773ad44fa9423e4e0d346d5db86e4ebc"

[[package]]
name = "unicode-normalization"
version = "0.1.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5c5713f0fc4b5db668a2ac63cdb7bb4469d8c9fed047b1d0292cc7b0ce2ba921"
dependencies = [
 "tinyvec",
]

[[package]]
name = "unicode-width"
version = "0.1.10"

@@ -6371,9 +6381,9 @@ checksum = "2ace0b4755d0a2959962769239d56267f8a024fef2d9b32666b3dcd0946b0906"

[[package]]
name = "url"
version = "2.5.0"
version = "2.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "31e6302e3bb753d46e83516cae55ae196fc0c309407cf11ab35cc51a4c2a4633"
checksum = "f7c25da092f0a868cdf09e8674cd3b7ef3a7d92a24253e663a2fb85e2496de56"
dependencies = [
 "form_urlencoded",
 "idna",

@@ -6382,10 +6392,16 @@ dependencies = [
]

[[package]]
name = "utf8_iter"
version = "1.0.3"
name = "utf16_iter"
version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "64a8922555b9500e3d865caed19330172cd67cbf82203f1a3311d8c305cc9f33"
checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246"

[[package]]
name = "utf8_iter"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be"

[[package]]
name = "uuid"

@@ -6959,6 +6975,12 @@ dependencies = [
 "euclid",
]

[[package]]
name = "write16"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936"

[[package]]
name = "writeable"
version = "0.5.5"
@@ -136,9 +136,6 @@ redox_users = { path = "build/rust/redox_users" }
# Patch redox_syscall to an empty crate
redox_syscall = { path = "build/rust/redox_syscall" }

# Override tinyvec with smallvec
tinyvec = { path = "build/rust/tinyvec" }

# Patch base64 0.13 to 0.21
base64 = { path = "build/rust/base64" }
@@ -1,16 +0,0 @@
[package]
name = "tinyvec"
version = "1.999.999"
edition = "2018"
license = "MPL-2.0"

[lib]
path = "lib.rs"

[dependencies]
smallvec = "1"

[features]
alloc = []
default = []
std = ["alloc"]
@@ -1,6 +0,0 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

pub use smallvec::SmallVec as ArrayVec;
pub use smallvec::SmallVec as TinyVec;
@@ -2462,6 +2462,17 @@ who = "Makoto Kato <m_kato@ga2.so-net.ne.jp>"
criteria = "safe-to-deploy"
delta = "1.4.0 -> 1.5.0"

[[audits.icu_normalizer]]
who = "Henri Sivonen <hsivonen@hsivonen.fi>"
criteria = "safe-to-deploy"
version = "1.5.0"
notes = "I, Henri Sivonen, am the principal author of this crate."

[[audits.icu_normalizer_data]]
who = "Henri Sivonen <hsivonen@hsivonen.fi>"
criteria = "safe-to-deploy"
version = "1.5.0"

[[audits.icu_properties]]
who = "Jonathan Kew <jkew@mozilla.com>"
criteria = "safe-to-deploy"

@@ -2575,6 +2586,12 @@ who = "Valentin Gosu <valentin.gosu@gmail.com>"
criteria = "safe-to-deploy"
delta = "0.4.0 -> 0.5.0"

[[audits.idna]]
who = "Henri Sivonen <hsivonen@hsivonen.fi>"
criteria = "safe-to-deploy"
delta = "0.5.0 -> 1.0.2"
notes = "In the 0.5.0 to 1.0.2 delta, I, Henri Sivonen, rewrote the non-Punycode internals of the crate and made the changes to the Punycode code."

[[audits.indexmap]]
who = "Mike Hommey <mh+mozilla@glandium.org>"
criteria = "safe-to-deploy"

@@ -4758,6 +4775,17 @@ who = "Valentin Gosu <valentin.gosu@gmail.com>"
criteria = "safe-to-deploy"
delta = "2.4.1 -> 2.5.0"

[[audits.url]]
who = "Henri Sivonen <hsivonen@hsivonen.fi>"
criteria = "safe-to-deploy"
delta = "2.5.0 -> 2.5.1"

[[audits.utf16_iter]]
who = "Henri Sivonen <hsivonen@hsivonen.fi>"
criteria = "safe-to-deploy"
version = "1.0.5"
notes = "I, Henri Sivonen, wrote this crate."

[[audits.uuid]]
who = "Gabriele Svelto <gsvelto@mozilla.com>"
criteria = "safe-to-deploy"

@@ -5149,6 +5177,12 @@ criteria = "safe-to-deploy"
version = "0.1.0"
notes = "Written and maintained by Gfx team at Mozilla."

[[audits.write16]]
who = "Henri Sivonen <hsivonen@hsivonen.fi>"
criteria = "safe-to-deploy"
version = "1.0.0"
notes = "I, Henri Sivonen, wrote this (safe-code-only) crate."

[[audits.writeable]]
who = "Makoto Kato <m_kato@ga2.so-net.ne.jp>"
criteria = "safe-to-deploy"
@@ -809,6 +809,13 @@ user-id = 4484
user-login = "hsivonen"
user-name = "Henri Sivonen"

[[publisher.utf8_iter]]
version = "1.0.4"
when = "2023-12-01"
user-id = 4484
user-login = "hsivonen"
user-name = "Henri Sivonen"

[[publisher.walkdir]]
version = "2.3.2"
when = "2021-03-22"
@@ -0,0 +1 @@
{"files":{"Cargo.toml":"83b30ee0282024b826e1ef2d28519c230f663d2d882e64017bbe692d62c58741","LICENSE":"f367c1b8e1aa262435251e442901da4607b4650e0e63a026f5044473ecfb90f2","README.md":"7c238039ae55d7dc74c6fe4d0db071db103c2740a1637943ded2a8c504c58b86","benches/bench.rs":"9cd781e3d0e8d772860cd332b4f403910f3ca52fd69a459f5ac95d28f0e25ac2","benches/canonical_composition.rs":"78c6a077a26efd61586386e4eb39b3fc5b1875c73fab26e86292bf2eeaa93709","benches/canonical_decomposition.rs":"c57ab476622ec5e42b65556fc76313b6755714e539847012074eaad79bc72794","benches/composing_normalizer_nfc.rs":"8c80e55ebbab2c93f4c01140de69eba94ab25777401fd68e69e45638268ffd23","benches/composing_normalizer_nfkc.rs":"64244b5e94adb859469311c4cfc72835aafd7c58cf0aee319aeee47220dd0c63","benches/data/README.md":"fa79b84815a228c3fbfa5d4c6d12885036994ca8ad61e683b2113cf2b428bb85","benches/data/TestNames_Japanese_h.txt":"6522f8ed794ad348c904079082ec3aa303ae7acf3f68bbc49fa0ee90eebf31e0","benches/data/TestNames_Japanese_k.txt":"e4e18804fe742ecd27ae48bc3564c6bc653180a3c649d43a2ab4d8b7f2607627","benches/data/TestNames_Korean.txt":"9cbf54d5ee16726c0fc9477366e273ba1b82e651c9e88e6c7532df5344f03920","benches/data/TestNames_Latin.txt":"3a30d450d259a6be4a6aee8eeef08d3767d11fcc047b8f58060c542efe1182d1","benches/data/TestNames_Thai.txt":"28d76ddb62d6f47646232860fce7440544f402158443889393fd7e8bf10e9c3d","benches/data/TestRandomWordsUDHR_ar.txt":"02a775153e9746ae938a9db0b60244f2c00d911bb72b611a3593b0991fd95723","benches/data/TestRandomWordsUDHR_de.txt":"100b9502e7ddcb2fcbd055cb7ec9113245105bd1c606cace5e5bc147cc18727b","benches/data/TestRandomWordsUDHR_el.txt":"d1a2f0f9efc9ce663026ca7c285177391937c90008479a8c5b909c300dc86972","benches/data/TestRandomWordsUDHR_es.txt":"deeebda09e0ce0f80dd805317e96d1a630908601ff2a4d1ccb0021b00b55814b","benches/data/TestRandomWordsUDHR_fr.txt":"5931edc9f1af2c27a0b35c9624732e70b87b0fd72ab486710f3aa6367c7ad35f","benches/data/TestRandomWordsUDHR_he.txt":"dc77a89ffb9803e5c574d87f4789cb17624df73e40a8a92961df8ea8be103425","benches/data/TestRandomWordsUDHR_pl.txt":"26c378295ee2ef75ccacea691df0456394184a9a5c9ce48b2bada169b2402bbb","benches/data/TestRandomWordsUDHR_ru.txt":"a1c339f6d7b69cf9154e855c290ab09eeaf167ebcdf6d4bcb917de039fba10ee","benches/data/TestRandomWordsUDHR_th.txt":"3ba518be9863c85c3ac80cbb12299e3594e6f5afed3406d910d948007adaaf4e","benches/data/TestRandomWordsUDHR_tr.txt":"815c7babbc7228ef89b56f29638aeb63013aeca0003a49e58994e26b41cba01c","benches/data/wotw.txt":"8f28e68041ce75bbf75e72e186a6145e4c2de9e7e62b9b86ce0621c527a23669","benches/decomposing_normalizer_nfd.rs":"9caf896987e509af1e37488592022a62e8960692909745c4d08a539e7f283146","benches/decomposing_normalizer_nfkd.rs":"ce1c64b789baa9b4c5fb6511a187014f913e99126f1c932a4a12dc9a29367508","src/error.rs":"c1d7089ec5b1d124e701dd41704a0b7c685a1d7f8ed7d9eed0faaf093d2485f2","src/lib.rs":"cf85240373c7bc9896444c599450bba7351339748b0349fccfdec6a65cce8c30","src/properties.rs":"e3314a9801cc64f64c79740faed8495cb828bfcf4b1e34c9f8251ea7ecebd4e5","src/provider.rs":"4fdca8144102c7775debead6b50e758bf9382743630bf14de8a9b12a79fc6fed","src/uts46.rs":"d2f3d36ea5cd365631cfbe83b855bfc533d14e17d50c1e1b33da4df1de25563e","tests/data/NormalizationTest.txt":"1b04c22b82064adf871e76fd2148cd749129163f7d05bd7ace923516a65afe02","tests/data/README.md":"521fcd44a1f10f21629df88113fa29ca9f4e1dfbeea79fda19a7dc8ba435e24b","tests/tests.rs":"ba36cb3e89d2ea5c0312ab7a8d46c8c36ea9f01d35f2842e4778c2dee30cba54"},"package":"19ce3e0da2ec68599d193c93d088142efd7f9c5d6fc9b803774855747dc6a84f"}
@@ -0,0 +1,144 @@
# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
#
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
# to registry (e.g., crates.io) dependencies.
#
# If you are reading this file be aware that the original Cargo.toml
# will likely look very different (and much more reasonable).
# See Cargo.toml.orig for the original contents.

[package]
edition = "2021"
rust-version = "1.67"
name = "icu_normalizer"
version = "1.5.0"
authors = ["The ICU4X Project Developers"]
include = [
    "data/**/*",
    "src/**/*",
    "examples/**/*",
    "benches/**/*",
    "tests/**/*",
    "Cargo.toml",
    "LICENSE",
    "README.md",
]
description = "API for normalizing text into Unicode Normalization Forms"
homepage = "https://icu4x.unicode.org"
readme = "README.md"
categories = ["internationalization"]
license = "Unicode-3.0"
repository = "https://github.com/unicode-org/icu4x"

[package.metadata.docs.rs]
all-features = true

[[bench]]
name = "bench"
harness = false

[dependencies.databake]
version = "0.1.8"
features = ["derive"]
optional = true
default-features = false

[dependencies.displaydoc]
version = "0.2.3"
default-features = false

[dependencies.icu_collections]
version = "~1.5.0"
default-features = false

[dependencies.icu_normalizer_data]
version = "~1.5.0"
optional = true
default-features = false

[dependencies.icu_properties]
version = "~1.5.0"
default-features = false

[dependencies.icu_provider]
version = "~1.5.0"
features = ["macros"]
default-features = false

[dependencies.serde]
version = "1.0.110"
features = [
    "derive",
    "alloc",
]
optional = true
default-features = false

[dependencies.smallvec]
version = "1.10.0"
default-features = false

[dependencies.utf16_iter]
version = "1.0.2"
default-features = false

[dependencies.utf8_iter]
version = "1.0.2"
default-features = false

[dependencies.write16]
version = "1.0.0"
features = ["alloc"]
default-features = false

[dependencies.zerovec]
version = "0.10.2"
default-features = false

[dev-dependencies.arraystring]
version = "0.3.0"

[dev-dependencies.arrayvec]
version = "0.7.2"

[dev-dependencies.atoi]
version = "1.0.0"

[dev-dependencies.detone]
version = "1.0.0"

[dev-dependencies.write16]
version = "1.0.0"
features = ["arrayvec"]
default-features = false

[features]
compiled_data = [
    "dep:icu_normalizer_data",
    "icu_properties/compiled_data",
]
datagen = [
    "serde",
    "dep:databake",
    "icu_collections/databake",
    "zerovec/databake",
    "icu_properties/datagen",
]
default = ["compiled_data"]
experimental = []
serde = [
    "dep:serde",
    "icu_collections/serde",
    "zerovec/serde",
    "icu_properties/serde",
]
std = [
    "icu_collections/std",
    "icu_properties/std",
    "icu_provider/std",
]

[target."cfg(not(target_arch = \"wasm32\"))".dev-dependencies.criterion]
version = "0.5.0"
@@ -0,0 +1,46 @@
UNICODE LICENSE V3

COPYRIGHT AND PERMISSION NOTICE

Copyright © 2020-2024 Unicode, Inc.

NOTICE TO USER: Carefully read the following legal agreement. BY
DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING DATA FILES, AND/OR
SOFTWARE, YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
TERMS AND CONDITIONS OF THIS AGREEMENT. IF YOU DO NOT AGREE, DO NOT
DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE THE DATA FILES OR SOFTWARE.

Permission is hereby granted, free of charge, to any person obtaining a
copy of data files and any associated documentation (the "Data Files") or
software and any associated documentation (the "Software") to deal in the
Data Files or Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, and/or sell
copies of the Data Files or Software, and to permit persons to whom the
Data Files or Software are furnished to do so, provided that either (a)
this copyright and permission notice appear with all copies of the Data
Files or Software, or (b) this copyright and permission notice appear in
associated Documentation.

THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
THIRD PARTY RIGHTS.

IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE
BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES,
OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE DATA
FILES OR SOFTWARE.

Except as contained in this notice, the name of a copyright holder shall
not be used in advertising or otherwise to promote the sale, use or other
dealings in these Data Files or Software without prior written
authorization of the copyright holder.

SPDX-License-Identifier: Unicode-3.0

—

Portions of ICU4X may have been adapted from ICU4C and/or ICU4J.
ICU 1.8.1 to ICU 57.1 © 1995-2016 International Business Machines Corporation and others.
@@ -0,0 +1,56 @@
# icu_normalizer [![crates.io](https://img.shields.io/crates/v/icu_normalizer)](https://crates.io/crates/icu_normalizer)

<!-- cargo-rdme start -->

Normalizing text into Unicode Normalization Forms.

This module is published as its own crate ([`icu_normalizer`](https://docs.rs/icu_normalizer/latest/icu_normalizer/))
and as part of the [`icu`](https://docs.rs/icu/latest/icu/) crate. See the latter for more details on the ICU4X project.

## Implementation notes

The normalizer operates on a lazy iterator over Unicode scalar values (Rust `char`) internally,
and iterating over guaranteed-valid UTF-8, potentially-invalid UTF-8, and potentially-invalid
UTF-16 is a step that doesn't leak into the normalizer internals. Ill-formed byte sequences are
treated as U+FFFD.

The normalizer data layout is not based on the ICU4C design at all. Instead, the normalization
data layout is a clean-slate design optimized for the concept of fusing the NFD decomposition
into the collator. That is, the decomposing normalizer is a by-product of the collator-motivated
data layout.

Notably, the decomposition data structure is optimized for a starter decomposing to itself,
which is the most common case, and for a starter decomposing to a starter and a non-starter
on the Basic Multilingual Plane. Notably, in this case, the collator makes use of the
knowledge that the second character of such a decomposition is a non-starter. Therefore,
decomposition into two starters is handled by a generic fallback path that looks up the
decomposition from an array by offset and length instead of baking a BMP starter pair directly
into a trie value.

The decompositions into non-starters are hard-coded. At present in Unicode, these appear
to be special cases falling into three categories:

1. Deprecated combining marks.
2. Particular Tibetan vowel signs.
3. NFKD only: half-width kana voicing marks.

Hopefully Unicode never adds more decompositions into non-starters (other than a character
decomposing to itself), but if it does, a code update is needed instead of a mere data update.

The composing normalizer builds on the decomposing normalizer by performing the canonical
composition post-processing per spec. As an optimization, though, the composing normalizer
attempts to pass through already-normalized text consisting of starters that never combine
backwards and that map to themselves if followed by a character whose decomposition starts
with a starter that never combines backwards.

As a difference with ICU4C, the composing normalizer has only the simplest possible
passthrough (only one inversion list lookup per character in the best case) and the full
decompose-then-canonically-compose behavior, whereas ICU4C has other paths between these
extremes. The ICU4X collator doesn't make use of the FCD concept at all in order to avoid
doing the work of checking whether the FCD condition holds.

<!-- cargo-rdme end -->

## More Information

For more information on development, authorship, contributing etc. please visit [`ICU4X home page`](https://github.com/unicode-org/icu4x).
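To make the README's description concrete, a minimal sketch (not part of the vendored sources) of the crate's high-level API, assuming the default `compiled_data` constructors:

```rust
use icu_normalizer::{ComposingNormalizer, DecomposingNormalizer};

fn main() {
    // NFC: U+0065 'e' followed by U+0301 COMBINING ACUTE ACCENT composes
    // to U+00E9 'é'.
    let nfc = ComposingNormalizer::new_nfc();
    assert_eq!(nfc.normalize("e\u{0301}"), "é");
    assert!(nfc.is_normalized("é"));

    // NFD: the decomposing normalizer performs the inverse mapping.
    let nfd = DecomposingNormalizer::new_nfd();
    assert_eq!(nfd.normalize("é"), "e\u{0301}");
}
```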
@@ -0,0 +1,24 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

use criterion::{criterion_group, criterion_main};

mod canonical_composition;
mod canonical_decomposition;
mod composing_normalizer_nfc;
mod composing_normalizer_nfkc;
mod decomposing_normalizer_nfd;
mod decomposing_normalizer_nfkd;

criterion_group!(
    benches,
    canonical_composition::criterion_benchmark,
    canonical_decomposition::criterion_benchmark,
    composing_normalizer_nfc::criterion_benchmark,
    composing_normalizer_nfkc::criterion_benchmark,
    decomposing_normalizer_nfd::criterion_benchmark,
    decomposing_normalizer_nfkd::criterion_benchmark,
);

criterion_main!(benches);
@@ -0,0 +1,186 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

use criterion::{black_box, BenchmarkId, Criterion};
use detone::IterDecomposeVietnamese;

use icu_normalizer::properties::{CanonicalComposition, CanonicalDecomposition, Decomposed};
use icu_normalizer::ComposingNormalizer;

struct BenchDataContent {
    pub file_name: String,
    pub pairs: Vec<(char, char)>,
}

fn strip_headers(content: &str) -> String {
    content
        .lines()
        .filter(|&s| !s.starts_with('#'))
        .map(|s| s.to_owned())
        .collect::<Vec<String>>()
        .join("\n")
}

fn normalizer_bench_data() -> [BenchDataContent; 16] {
    let nfc_normalizer: ComposingNormalizer = ComposingNormalizer::new_nfc();

    return [
        BenchDataContent {
            file_name: "TestNames_Latin".to_owned(),
            pairs: decompose_data(
                &nfc_normalizer
                    .normalize(&strip_headers(include_str!("./data/TestNames_Latin.txt"))),
            ),
        },
        BenchDataContent {
            file_name: "TestNames_Japanese_h".to_owned(),
            pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!(
                "./data/TestNames_Japanese_h.txt"
            )))),
        },
        BenchDataContent {
            file_name: "TestNames_Japanese_k".to_owned(),
            pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!(
                "./data/TestNames_Japanese_k.txt"
            )))),
        },
        BenchDataContent {
            file_name: "TestNames_Korean".to_owned(),
            pairs: decompose_data(
                &nfc_normalizer
                    .normalize(&strip_headers(include_str!("./data/TestNames_Korean.txt"))),
            ),
        },
        BenchDataContent {
            file_name: "TestRandomWordsUDHR_ar".to_owned(),
            #[cfg(debug_assertions)]
            pairs: Vec::new(),
            #[cfg(not(debug_assertions))]
            pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!(
                "./data/TestRandomWordsUDHR_ar.txt"
            )))),
        },
        BenchDataContent {
            file_name: "TestRandomWordsUDHR_de".to_owned(),
            pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!(
                "./data/TestRandomWordsUDHR_de.txt"
            )))),
        },
        BenchDataContent {
            file_name: "TestRandomWordsUDHR_el".to_owned(),
            pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!(
                "./data/TestRandomWordsUDHR_el.txt"
            )))),
        },
        BenchDataContent {
            file_name: "TestRandomWordsUDHR_es".to_owned(),
            pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!(
                "./data/TestRandomWordsUDHR_es.txt"
            )))),
        },
        BenchDataContent {
            file_name: "TestRandomWordsUDHR_fr".to_owned(),
            pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!(
                "./data/TestRandomWordsUDHR_fr.txt"
            )))),
        },
        BenchDataContent {
            file_name: "TestRandomWordsUDHR_he".to_owned(),
            pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!(
                "./data/TestRandomWordsUDHR_he.txt"
            )))),
        },
        BenchDataContent {
            file_name: "TestRandomWordsUDHR_pl".to_owned(),
            pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!(
                "./data/TestRandomWordsUDHR_pl.txt"
            )))),
        },
        BenchDataContent {
            file_name: "TestRandomWordsUDHR_ru".to_owned(),
            pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!(
                "./data/TestRandomWordsUDHR_ru.txt"
            )))),
        },
        BenchDataContent {
            file_name: "TestRandomWordsUDHR_th".to_owned(),
            #[cfg(debug_assertions)]
            pairs: Vec::new(),
            #[cfg(not(debug_assertions))]
            pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!(
                "./data/TestRandomWordsUDHR_th.txt"
            )))),
        },
        BenchDataContent {
            file_name: "TestRandomWordsUDHR_tr".to_owned(),
            pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!(
                "./data/TestRandomWordsUDHR_tr.txt"
            )))),
        },
        BenchDataContent {
            file_name: "udhr_vie".to_owned(),
            pairs: decompose_data(
                &nfc_normalizer.normalize(&strip_headers(include_str!("data/wotw.txt"))),
            ),
        },
        BenchDataContent {
            file_name: "udhr_vie_detone".to_owned(),
            pairs: {
                let result: Vec<(char, char)> = nfc_normalizer
                    .normalize(&strip_headers(include_str!("data/wotw.txt")))
                    .chars()
                    .filter_map(|c| {
                        let mut iter = std::iter::once(c).decompose_vietnamese_tones(true);
                        if let Some(base) = iter.next() {
                            iter.next().map(|tone| (base, tone))
                        } else {
                            None
                        }
                    })
                    .collect();
                assert!(!result.is_empty());
                result
            },
        },
    ];
}

fn function_under_bench(
    canonical_composer: &CanonicalComposition,
    composable_points: &[(char, char)],
) {
    for pair in composable_points.iter() {
        canonical_composer.compose(pair.0, pair.1);
    }
}

pub fn criterion_benchmark(criterion: &mut Criterion) {
    let group_name = "canonical_composition";
    let mut group = criterion.benchmark_group(group_name);

    let composer = CanonicalComposition::new();

    for bench_data_content in black_box(normalizer_bench_data()) {
        group.bench_function(
            BenchmarkId::from_parameter(format!("from_nfc_{}", bench_data_content.file_name)),
            |bencher| bencher.iter(|| function_under_bench(&composer, &bench_data_content.pairs)),
        );
    }

    group.finish();
}

fn decompose_data(nfc: &str) -> Vec<(char, char)> {
    let decomposer = CanonicalDecomposition::new();
    nfc.chars()
        .map(|c| decomposer.decompose(c))
        .filter_map(|decomposed| {
            if let Decomposed::Expansion(a, b) = decomposed {
                Some((a, b))
            } else {
                None
            }
        })
        .collect()
}
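For context on the property APIs this benchmark exercises, a minimal sketch (not part of the vendored sources), assuming `Decomposed` supports equality comparison as in the 1.x releases:

```rust
use icu_normalizer::properties::{CanonicalComposition, CanonicalDecomposition, Decomposed};

fn main() {
    let composer = CanonicalComposition::new();
    let decomposer = CanonicalDecomposition::new();

    // 'é' canonically decomposes to the pair ('e', U+0301)...
    assert_eq!(
        decomposer.decompose('é'),
        Decomposed::Expansion('e', '\u{0301}')
    );
    // ...and composing that pair yields 'é' again; pairs that don't
    // canonically compose return None.
    assert_eq!(composer.compose('e', '\u{0301}'), Some('é'));
    assert_eq!(composer.compose('a', 'b'), None);
}
```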
@@ -0,0 +1,159 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

use criterion::{black_box, BenchmarkId, Criterion};

use icu_normalizer::properties::CanonicalDecomposition;
use icu_normalizer::{ComposingNormalizer, DecomposingNormalizer};

struct BenchDataContent {
    pub file_name: String,
    pub nfc: String,
    pub nfd: String,
    pub nfkc: String,
    pub nfkd: String,
}

fn strip_headers(content: &str) -> String {
    content
        .lines()
        .filter(|&s| !s.starts_with('#'))
        .map(|s| s.to_owned())
        .collect::<Vec<String>>()
        .join("\n")
}

fn normalizer_bench_data() -> [BenchDataContent; 15] {
    let nfc_normalizer: ComposingNormalizer = ComposingNormalizer::new_nfc();
    let nfd_normalizer: DecomposingNormalizer = DecomposingNormalizer::new_nfd();
    let nfkc_normalizer: ComposingNormalizer = ComposingNormalizer::new_nfkc();
    let nfkd_normalizer: DecomposingNormalizer = DecomposingNormalizer::new_nfkd();

    let content_latin: (&str, &str) = (
        "TestNames_Latin",
        &strip_headers(include_str!("./data/TestNames_Latin.txt")),
    );
    let content_jp_h: (&str, &str) = (
        "TestNames_Japanese_h",
        &strip_headers(include_str!("./data/TestNames_Japanese_h.txt")),
    );
    let content_jp_k: (&str, &str) = (
        "TestNames_Japanese_k",
        &strip_headers(include_str!("./data/TestNames_Japanese_k.txt")),
    );
    let content_korean: (&str, &str) = (
        "TestNames_Korean",
        &strip_headers(include_str!("./data/TestNames_Korean.txt")),
    );
    let content_random_words_ar: (&str, &str) = (
        "TestRandomWordsUDHR_ar",
        &strip_headers(include_str!("./data/TestRandomWordsUDHR_ar.txt")),
    );
    let content_random_words_de: (&str, &str) = (
        "TestRandomWordsUDHR_de",
        &strip_headers(include_str!("./data/TestRandomWordsUDHR_de.txt")),
    );
    let content_random_words_el: (&str, &str) = (
        "TestRandomWordsUDHR_el",
        &strip_headers(include_str!("./data/TestRandomWordsUDHR_el.txt")),
    );
    let content_random_words_es: (&str, &str) = (
        "TestRandomWordsUDHR_es",
        &strip_headers(include_str!("./data/TestRandomWordsUDHR_es.txt")),
    );
    let content_random_words_fr: (&str, &str) = (
        "TestRandomWordsUDHR_fr",
        &strip_headers(include_str!("./data/TestRandomWordsUDHR_fr.txt")),
    );
    let content_random_words_he: (&str, &str) = (
        "TestRandomWordsUDHR_he",
        &strip_headers(include_str!("./data/TestRandomWordsUDHR_he.txt")),
    );
    let content_random_words_pl: (&str, &str) = (
        "TestRandomWordsUDHR_pl",
        &strip_headers(include_str!("./data/TestRandomWordsUDHR_pl.txt")),
    );
    let content_random_words_ru: (&str, &str) = (
        "TestRandomWordsUDHR_ru",
        &strip_headers(include_str!("./data/TestRandomWordsUDHR_ru.txt")),
    );
    let content_random_words_th: (&str, &str) = (
        "TestRandomWordsUDHR_th",
        &strip_headers(include_str!("./data/TestRandomWordsUDHR_th.txt")),
    );
    let content_random_words_tr: (&str, &str) = (
        "TestRandomWordsUDHR_tr",
        &strip_headers(include_str!("./data/TestRandomWordsUDHR_tr.txt")),
    );
    let content_viet: (&str, &str) = ("udhr_vie", &strip_headers(include_str!("data/wotw.txt")));

    [
        content_latin,
        content_viet,
        content_jp_k,
        content_jp_h,
        content_korean,
        content_random_words_ru,
        content_random_words_ar,
        content_random_words_el,
        content_random_words_es,
        content_random_words_fr,
        content_random_words_tr,
        content_random_words_th,
        content_random_words_pl,
        content_random_words_he,
        content_random_words_de,
    ]
    .map(|(file_name, raw_content)| BenchDataContent {
        file_name: file_name.to_owned(),
        nfc: nfc_normalizer.normalize(raw_content),
        nfd: nfd_normalizer.normalize(raw_content),
        nfkc: nfkc_normalizer.normalize(raw_content),
        nfkd: nfkd_normalizer.normalize(raw_content),
    })
}

#[cfg(debug_assertions)]
fn function_under_bench(
    _canonical_decomposer: &CanonicalDecomposition,
    _decomposable_points: &str,
) {
    // using debug assertion fails some test.
    // "cargo test --bench bench" will pass
    // "cargo bench" will work as expected, because the profile doesn't include debug assertions.
}

#[cfg(not(debug_assertions))]
fn function_under_bench(canonical_decomposer: &CanonicalDecomposition, decomposable_points: &str) {
    decomposable_points.chars().for_each(|point| {
        canonical_decomposer.decompose(point);
    });
}

pub fn criterion_benchmark(criterion: &mut Criterion) {
    let group_name = "canonical_decomposition";
    let mut group = criterion.benchmark_group(group_name);

    let decomposer = CanonicalDecomposition::new();

    for bench_data_content in black_box(normalizer_bench_data()) {
        group.bench_function(
            BenchmarkId::from_parameter(format!("from_nfc_{}", bench_data_content.file_name)),
            |bencher| bencher.iter(|| function_under_bench(&decomposer, &bench_data_content.nfc)),
        );
        group.bench_function(
            BenchmarkId::from_parameter(format!("from_nfd_{}", bench_data_content.file_name)),
            |bencher| bencher.iter(|| function_under_bench(&decomposer, &bench_data_content.nfd)),
        );
        group.bench_function(
            BenchmarkId::from_parameter(format!("from_nfkc_{}", bench_data_content.file_name)),
            |bencher| bencher.iter(|| function_under_bench(&decomposer, &bench_data_content.nfkc)),
        );
        group.bench_function(
            BenchmarkId::from_parameter(format!("from_nfkd_{}", bench_data_content.file_name)),
            |bencher| bencher.iter(|| function_under_bench(&decomposer, &bench_data_content.nfkd)),
        );
    }
    group.finish();
}
@@ -0,0 +1,230 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

use criterion::{black_box, BenchmarkId, Criterion};

use icu_normalizer::{ComposingNormalizer, DecomposingNormalizer};

struct BenchDataContent {
    pub file_name: String,
    pub nfc: String,
    pub nfd: String,
    pub nfkc: String,
    pub nfkd: String,
    pub nfc_u16: Vec<u16>,
    pub nfd_u16: Vec<u16>,
    pub nfkc_u16: Vec<u16>,
    pub nfkd_u16: Vec<u16>,
}

fn strip_headers(content: &str) -> String {
    content
        .lines()
        .filter(|&s| !s.starts_with('#'))
        .map(|s| s.to_owned())
        .collect::<Vec<String>>()
        .join("\n")
}

fn normalizer_bench_data() -> [BenchDataContent; 15] {
    let nfc_normalizer: ComposingNormalizer = ComposingNormalizer::new_nfc();
    let nfd_normalizer: DecomposingNormalizer = DecomposingNormalizer::new_nfd();
    let nfkc_normalizer: ComposingNormalizer = ComposingNormalizer::new_nfkc();
    let nfkd_normalizer: DecomposingNormalizer = DecomposingNormalizer::new_nfkd();

    let content_latin: (&str, &str) = (
        "TestNames_Latin",
        &strip_headers(include_str!("./data/TestNames_Latin.txt")),
    );
    let content_jp_h: (&str, &str) = (
        "TestNames_Japanese_h",
        &strip_headers(include_str!("./data/TestNames_Japanese_h.txt")),
    );
    let content_jp_k: (&str, &str) = (
        "TestNames_Japanese_k",
        &strip_headers(include_str!("./data/TestNames_Japanese_k.txt")),
    );
    let content_korean: (&str, &str) = (
        "TestNames_Korean",
        &strip_headers(include_str!("./data/TestNames_Korean.txt")),
    );
    let content_random_words_ar: (&str, &str) = (
        "TestRandomWordsUDHR_ar",
        &strip_headers(include_str!("./data/TestRandomWordsUDHR_ar.txt")),
    );
    let content_random_words_de: (&str, &str) = (
        "TestRandomWordsUDHR_de",
        &strip_headers(include_str!("./data/TestRandomWordsUDHR_de.txt")),
    );
    let content_random_words_el: (&str, &str) = (
        "TestRandomWordsUDHR_el",
        &strip_headers(include_str!("./data/TestRandomWordsUDHR_el.txt")),
    );
    let content_random_words_es: (&str, &str) = (
        "TestRandomWordsUDHR_es",
        &strip_headers(include_str!("./data/TestRandomWordsUDHR_es.txt")),
    );
    let content_random_words_fr: (&str, &str) = (
        "TestRandomWordsUDHR_fr",
        &strip_headers(include_str!("./data/TestRandomWordsUDHR_fr.txt")),
    );
    let content_random_words_he: (&str, &str) = (
        "TestRandomWordsUDHR_he",
        &strip_headers(include_str!("./data/TestRandomWordsUDHR_he.txt")),
    );
    let content_random_words_pl: (&str, &str) = (
        "TestRandomWordsUDHR_pl",
        &strip_headers(include_str!("./data/TestRandomWordsUDHR_pl.txt")),
    );
    let content_random_words_ru: (&str, &str) = (
        "TestRandomWordsUDHR_ru",
        &strip_headers(include_str!("./data/TestRandomWordsUDHR_ru.txt")),
    );
    let content_random_words_th: (&str, &str) = (
        "TestRandomWordsUDHR_th",
        &strip_headers(include_str!("./data/TestRandomWordsUDHR_th.txt")),
    );
    let content_random_words_tr: (&str, &str) = (
        "TestRandomWordsUDHR_tr",
        &strip_headers(include_str!("./data/TestRandomWordsUDHR_tr.txt")),
    );
    let content_viet: (&str, &str) = ("wotw", &strip_headers(include_str!("./data/wotw.txt")));

    [
        content_latin,
        content_viet,
        content_jp_k,
        content_jp_h,
        content_korean,
        content_random_words_ru,
        content_random_words_ar,
        content_random_words_el,
        content_random_words_es,
        content_random_words_fr,
        content_random_words_tr,
        content_random_words_th,
        content_random_words_pl,
        content_random_words_he,
        content_random_words_de,
    ]
    .map(|(file_name, raw_content)| {
        let nfc = &nfc_normalizer.normalize(raw_content);
        let nfd = &nfd_normalizer.normalize(raw_content);
        let nfkc = &nfkc_normalizer.normalize(raw_content);
        let nfkd = &nfkd_normalizer.normalize(raw_content);
        BenchDataContent {
            file_name: file_name.to_owned(),
            nfc: nfc.to_owned(),
            nfd: nfd.to_owned(),
            nfkc: nfkc.to_owned(),
            nfkd: nfkd.to_owned(),
            nfc_u16: nfc.encode_utf16().collect(),
            nfd_u16: nfd.encode_utf16().collect(),
            nfkc_u16: nfkc.encode_utf16().collect(),
            nfkd_u16: nfkd.encode_utf16().collect(),
        }
    })
}

fn function_under_bench(normalizer: &ComposingNormalizer, text: &str) {
    normalizer.normalize(text);
}

fn function_under_bench_utf16(normalizer: &ComposingNormalizer, text: &[u16]) {
    normalizer.normalize_utf16(text);
}

pub fn criterion_benchmark(criterion: &mut Criterion) {
    let group_name = "composing_normalizer_nfc";

    let normalizer_under_bench: ComposingNormalizer = ComposingNormalizer::new_nfc();

    let mut group = criterion.benchmark_group(group_name);

    for bench_data_content in black_box(normalizer_bench_data()) {
        group.bench_function(
            BenchmarkId::from_parameter(format!("from_nfc_{}", bench_data_content.file_name)),
            |bencher| {
                bencher
                    .iter(|| function_under_bench(&normalizer_under_bench, &bench_data_content.nfc))
            },
        );
        group.bench_function(
            BenchmarkId::from_parameter(format!("from_nfd_{}", bench_data_content.file_name)),
            |bencher| {
                bencher
                    .iter(|| function_under_bench(&normalizer_under_bench, &bench_data_content.nfd))
            },
        );
        group.bench_function(
            BenchmarkId::from_parameter(format!("from_nfkc_{}", bench_data_content.file_name)),
            |bencher| {
                bencher.iter(|| {
                    function_under_bench(&normalizer_under_bench, &bench_data_content.nfkc)
                })
            },
        );
        group.bench_function(
            BenchmarkId::from_parameter(format!("from_nfkd_{}", bench_data_content.file_name)),
            |bencher| {
                bencher.iter(|| {
                    function_under_bench(&normalizer_under_bench, &bench_data_content.nfkd)
                })
            },
        );

        // UTF_16
        group.bench_function(
            BenchmarkId::from_parameter(format!(
                "from_nfc_{}_utf_16",
                bench_data_content.file_name
            )),
            |bencher| {
                bencher.iter(|| {
                    function_under_bench_utf16(&normalizer_under_bench, &bench_data_content.nfc_u16)
                })
            },
        );
        group.bench_function(
            BenchmarkId::from_parameter(format!(
                "from_nfd_{}_utf_16",
                bench_data_content.file_name
            )),
            |bencher| {
                bencher.iter(|| {
                    function_under_bench_utf16(&normalizer_under_bench, &bench_data_content.nfd_u16)
                })
            },
        );
        group.bench_function(
            BenchmarkId::from_parameter(format!(
                "from_nfkc_{}_utf_16",
                bench_data_content.file_name
            )),
            |bencher| {
                bencher.iter(|| {
                    function_under_bench_utf16(
                        &normalizer_under_bench,
                        &bench_data_content.nfkc_u16,
                    )
                })
            },
        );
        group.bench_function(
            BenchmarkId::from_parameter(format!(
                "from_nfkd_{}_utf_16",
                bench_data_content.file_name
            )),
            |bencher| {
                bencher.iter(|| {
                    function_under_bench_utf16(
                        &normalizer_under_bench,
                        &bench_data_content.nfkd_u16,
                    )
                })
            },
        );
    }
    group.finish();
}
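The UTF-16 variants above go through `normalize_utf16`; a minimal sketch (not part of the vendored sources) of that entry point:

```rust
use icu_normalizer::ComposingNormalizer;

fn main() {
    let nfc = ComposingNormalizer::new_nfc();

    // The same text as UTF-16 code units; per the README, ill-formed
    // sequences are treated as U+FFFD rather than causing an error.
    let input: Vec<u16> = "e\u{0301}".encode_utf16().collect();
    let output: Vec<u16> = nfc.normalize_utf16(&input);
    assert_eq!(String::from_utf16(&output).unwrap(), "é");
}
```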
@@ -0,0 +1,211 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

use criterion::{black_box, BenchmarkId, Criterion};

use icu_normalizer::{ComposingNormalizer, DecomposingNormalizer};

struct BenchDataContent {
    pub file_name: String,
    pub nfc: String,
    pub nfd: String,
    pub nfkc: String,
    pub nfkd: String,
    pub nfc_u16: Vec<u16>,
    pub nfd_u16: Vec<u16>,
    pub nfkc_u16: Vec<u16>,
    pub nfkd_u16: Vec<u16>,
}

fn strip_headers(content: &str) -> String {
    content
        .lines()
        .filter(|&s| !s.starts_with('#'))
        .map(|s| s.to_owned())
        .collect::<Vec<String>>()
        .join("\n")
}

fn normalizer_bench_data() -> [BenchDataContent; 15] {
    let nfc_normalizer: ComposingNormalizer = ComposingNormalizer::new_nfc();
    let nfd_normalizer: DecomposingNormalizer = DecomposingNormalizer::new_nfd();
    let nfkc_normalizer: ComposingNormalizer = ComposingNormalizer::new_nfkc();
    let nfkd_normalizer: DecomposingNormalizer = DecomposingNormalizer::new_nfkd();

    let content_latin: (&str, &str) = (
        "TestNames_Latin",
        &strip_headers(include_str!("./data/TestNames_Latin.txt")),
    );
    let content_jp_h: (&str, &str) = (
        "TestNames_Japanese_h",
        &strip_headers(include_str!("./data/TestNames_Japanese_h.txt")),
    );
    let content_jp_k: (&str, &str) = (
        "TestNames_Japanese_k",
        &strip_headers(include_str!("./data/TestNames_Japanese_k.txt")),
    );
    let content_korean: (&str, &str) = (
        "TestNames_Korean",
        &strip_headers(include_str!("./data/TestNames_Korean.txt")),
    );
    let content_random_words_ar: (&str, &str) = (
        "TestRandomWordsUDHR_ar",
        &strip_headers(include_str!("./data/TestRandomWordsUDHR_ar.txt")),
    );
    let content_random_words_de: (&str, &str) = (
        "TestRandomWordsUDHR_de",
        &strip_headers(include_str!("./data/TestRandomWordsUDHR_de.txt")),
    );
    let content_random_words_el: (&str, &str) = (
        "TestRandomWordsUDHR_el",
        &strip_headers(include_str!("./data/TestRandomWordsUDHR_el.txt")),
    );
    let content_random_words_es: (&str, &str) = (
        "TestRandomWordsUDHR_es",
        &strip_headers(include_str!("./data/TestRandomWordsUDHR_es.txt")),
    );
    let content_random_words_fr: (&str, &str) = (
        "TestRandomWordsUDHR_fr",
        &strip_headers(include_str!("./data/TestRandomWordsUDHR_fr.txt")),
    );
    let content_random_words_he: (&str, &str) = (
        "TestRandomWordsUDHR_he",
        &strip_headers(include_str!("./data/TestRandomWordsUDHR_he.txt")),
    );
    let content_random_words_pl: (&str, &str) = (
        "TestRandomWordsUDHR_pl",
        &strip_headers(include_str!("./data/TestRandomWordsUDHR_pl.txt")),
    );
    let content_random_words_ru: (&str, &str) = (
        "TestRandomWordsUDHR_ru",
        &strip_headers(include_str!("./data/TestRandomWordsUDHR_ru.txt")),
    );
    let content_random_words_th: (&str, &str) = (
        "TestRandomWordsUDHR_th",
        &strip_headers(include_str!("./data/TestRandomWordsUDHR_th.txt")),
    );
    let content_random_words_tr: (&str, &str) = (
        "TestRandomWordsUDHR_tr",
        &strip_headers(include_str!("./data/TestRandomWordsUDHR_tr.txt")),
    );
    let content_viet: (&str, &str) = ("wotw", &strip_headers(include_str!("./data/wotw.txt")));

    [
        content_latin,
        content_viet,
        content_jp_k,
        content_jp_h,
        content_korean,
        content_random_words_ru,
        content_random_words_ar,
        content_random_words_el,
        content_random_words_es,
        content_random_words_fr,
        content_random_words_tr,
        content_random_words_th,
        content_random_words_pl,
        content_random_words_he,
        content_random_words_de,
    ]
    .map(|(file_name, raw_content)| {
        let nfc = &nfc_normalizer.normalize(raw_content);
        let nfd = &nfd_normalizer.normalize(raw_content);
        let nfkc = &nfkc_normalizer.normalize(raw_content);
        let nfkd = &nfkd_normalizer.normalize(raw_content);
        BenchDataContent {
            file_name: file_name.to_owned(),
            nfc: nfc.to_owned(),
            nfd: nfd.to_owned(),
            nfkc: nfkc.to_owned(),
            nfkd: nfkd.to_owned(),
            nfc_u16: nfc.encode_utf16().collect(),
            nfd_u16: nfd.encode_utf16().collect(),
            nfkc_u16: nfkc.encode_utf16().collect(),
            nfkd_u16: nfkd.encode_utf16().collect(),
        }
    })
}

fn function_under_bench(normalizer: &ComposingNormalizer, text: &str) {
    normalizer.normalize(text);
}

fn function_under_bench_u16(normalizer: &ComposingNormalizer, text: &[u16]) {
    normalizer.normalize_utf16(text);
}

pub fn criterion_benchmark(criterion: &mut Criterion) {
    let group_name = "composing_normalizer_nfkc";

    let normalizer_under_bench: ComposingNormalizer = ComposingNormalizer::new_nfkc();

    let mut group = criterion.benchmark_group(group_name);

    for bench_data_content in black_box(normalizer_bench_data()) {
        group.bench_function(
            BenchmarkId::from_parameter(format!("from_nfc_{}", bench_data_content.file_name)),
            |bencher| {
                bencher
                    .iter(|| function_under_bench(&normalizer_under_bench, &bench_data_content.nfc))
            },
        );
        group.bench_function(
            BenchmarkId::from_parameter(format!("from_nfd_{}", bench_data_content.file_name)),
            |bencher| {
                bencher
                    .iter(|| function_under_bench(&normalizer_under_bench, &bench_data_content.nfd))
            },
        );
        group.bench_function(
            BenchmarkId::from_parameter(format!("from_nfkc_{}", bench_data_content.file_name)),
            |bencher| {
                bencher.iter(|| {
                    function_under_bench(&normalizer_under_bench, &bench_data_content.nfkc)
                })
            },
        );
        group.bench_function(
            BenchmarkId::from_parameter(format!("from_nfkd_{}", bench_data_content.file_name)),
            |bencher| {
                bencher.iter(|| {
                    function_under_bench(&normalizer_under_bench, &bench_data_content.nfkd)
                })
            },
        );
        // UTF 16
        group.bench_function(
            BenchmarkId::from_parameter(format!("from_nfc_{}_u16", bench_data_content.file_name)),
            |bencher| {
                bencher.iter(|| {
                    function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfc_u16)
                })
            },
        );
        group.bench_function(
            BenchmarkId::from_parameter(format!("from_nfd_{}_u16", bench_data_content.file_name)),
            |bencher| {
                bencher.iter(|| {
                    function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfd_u16)
                })
            },
        );
        group.bench_function(
            BenchmarkId::from_parameter(format!("from_nfkc_{}_u16", bench_data_content.file_name)),
            |bencher| {
                bencher.iter(|| {
                    function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfkc_u16)
                })
            },
        );
        group.bench_function(
            BenchmarkId::from_parameter(format!("from_nfkd_{}_u16", bench_data_content.file_name)),
            |bencher| {
                bencher.iter(|| {
                    function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfkd_u16)
                })
            },
        );
    }
    group.finish();
}
@@ -0,0 +1,25 @@
# Generating microbench data

The full versions of these files are located
[in another part of the repository](https://github.com/unicode-org/icu/tree/main/icu4j/perf-tests/data).

## Sanitizing the file

```shell
sed -i '/^#/d' ${filename}
sed -i '/^$/d' ${filename}
```

## Shuffling the file

```shell
shuf -n 20 ${filename} -o ${filename}
```

## Add back the header (if you plan on submitting the files)

```
# This file is part of ICU4X. For terms of use, please see the file
# called LICENSE at the top level of the ICU4X source tree
# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
```
@@ -0,0 +1,54 @@
# This file is part of ICU4X. For terms of use, please see the file
# called LICENSE at the top level of the ICU4X source tree
# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

かげやま,みのる
むらかみ,とおる
つじさわ,けい
やすい,たかゆき
むらさき,としお
はせがわ,ひであき
うるしばら,よしひこ
ままだ,ひろし
おおぼら,えいじろう
おおば,まさひで
きたばたけ,たかひこ
はまさき,あつし
ほりい,つねお
もり,だいいち
いとう,しんいち
くにもと,じゅんじ
おか,のりひと
たに,よしあき
しらがき,ひろあき
しらはま,たけひろ
むらかみ,やすひろ
うめはら,たかし
いわた,ひろし
すぎえ,かつとし
てらにし,ひろみつ
まつおか,だいすけ
もろほし,すすむ
いしはら,たかし
おしま,ひろお
なかお,ゆうじ
いかり,はるお
きまち,まさき
ふるかわ,みちお
かねこ,しゅうへい
なかがわ,ともみ
ささき,しんご
うちだ,たくじ
うめだ,さかえ
しばた,いくこ
まきした,けいこ
まつもと,しんいちろう
たかの,かずよし
いしわた,なおひさ
いうち,まこと
いまい,りほ
みずた,のりあき
かくたに,まなぶ
わだ,ほまれ
わかまつ,かずき
かわぐち,ひろき
@@ -0,0 +1,54 @@
# This file is part of ICU4X. For terms of use, please see the file
# called LICENSE at the top level of the ICU4X source tree
# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

ホリモト,ユウジ
ハナミ,ヤスヒデ
イシザカ,タカユキ
ゼンケ,トシオ
ハトリ,ユウコ
ナガオカ,トモユキ
コウダ,ケンイチ
イシダ,ヒロシ
ミワ,シゲユキ
イシカワ,ヒロシ
スズキ,ユウスケ
オクダ,ヨシノリ
シムラ,サカエ
エビシマ,ヤスユキ
イブカ,ヨシテル
タノ,マコト
ドウゾノ,セイヤ
ヤマナカ,サツミ
トミイエ,ハヤト
アザミ,ツトム
タナカ,キョウコ
コジマ,アツシ
フミハラ,カオリ
スズキ,マサユキ
ナトリ,ケンヤ
スズキ,ユウコ
スズキ,ヒサエ
ナカガワ,カツヨシ
スズキ,マサフミ
マツヤマ,トシオ
ヨシナガ,チカエ
キタムラ,リカコ
アオキ,タクオ
ヤマグチ,ヤスヒロ
スギムラ,シゲオ
ウエスギ,マサミ
マツムラ,シンイチ
クバ,タカシ
スドウ,タカトシ
フジモト,ヒロシ
イトウ,シュウイチ
コバヤシ,カズミ
タナカ,ヒロカツ
イシダ,ツカサ
ヤマダ,マサコ
カミヤ,トミエ
タケモト,ユウジ
スミノ,コウジ
ヒロハタ,タクヤ
ミヒラ,リョウヘイ
@@ -0,0 +1,54 @@
# This file is part of ICU4X. For terms of use, please see the file
# called LICENSE at the top level of the ICU4X source tree
# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

김명희
홍차수
허순재
강영휘
김운주
이종환
이은국
강태호
강일래
김동현
곽기자
차재수
표봉기
문대원
이형기
최교표
박식현
홍종립
서창수
김쌍건
서말도
이병훈
김희수
박학태
강태종
조문란
신범균
백두진
이철정
김태중
이성현
김주조
김강행
이정길
김완일
권수자
이춘철
김판근
김곡리
이경형
이운만
손상철
유기숙
박정한
조윤래
유신호
이두수
김재률
김성홍
김혜경
@@ -0,0 +1,54 @@
# This file is part of ICU4X. For terms of use, please see the file
# called LICENSE at the top level of the ICU4X source tree
# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

González, Joan
Reinders, Jim
Applebroog, Ida
Kidd, Joseph Bartholomew
Gulácsy, Lajos
Letendre, Rita
Zuccaro, Federico
Apt the Elder, Ulrich
Drummond, Arthur
Manley, Thomas
Broc, Jean
Ramunno, Tony
Simone dei Crocifissi
Lane, Theodore
Symonds, William Robert
Johnson, Frank Tenney
Cox, Gardner
Bunbury, Charles
Pedro de la Cuadra
Payne, William
Lucas, John Seymour
Holsman, Elizabeth T.
de Vries, Auke
Laszlo, Philip Alexius de
Shigemasa
Wolfe, Ruth Mitchell
Buck, John
Baselitz, Georg
Hook, Walter
Segall, Lasar
Brush, George deForest
Master of Jánosrét
Sutherland, Elizabeth Leveson-Gower, Countess of
Tuckerman, Jane
Varley, F.H.
Fosso, Samuel
Gardner, Daniel
Sadler, Walter Dendy
Clausen, Franciska
Coman, Charlotte Buell
Wakelin, Roland
Payne, Jon, CML
Campagna, Girolamo
Wiener, Phyllis
Sallee, Charles
Fitzgerald, John Anster
Gribbroek, Robert
Laporte, John
Lévy-Dhurmer, Lucien
Young, Stephen Scott
@@ -0,0 +1,54 @@
# This file is part of ICU4X. For terms of use, please see the file
# called LICENSE at the top level of the ICU4X source tree
# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

ณรงค์ โต๊ะเงิน
กิตติ บุญวันต์
สมหมาย ดาบทองดี
ธวัชชัย อิสระนิมิตร
วรรณา โสภณนรินทร์
วินัย หมู่มิ่ง
พัชรี ชูจิรวงศ์
สมปอง จิวไพโรจน์กิจ
บุญส่ง กวยรักษา
นิพนธ์ นิ่มใหม่
พัชรี สุวพรศิลป์
เจริญ นววัฒนทรัพย์
อรพินท์ แซ่เจี่ย
ชัยพร สมใจนึก
ประนอม โคศิลา
ฉวีวรรณ ศรสังข์ทอง
วัชรา เจริญรัตนพร
สุภัท นกศิริ
อู๋ มาลาเล็ก
ประยูร ไชโย
ละออ อยู่ยืนยง
สมใจ วิวัฒน์วานิช
จุมพล จันทรศรีเกษร
พุฒ ดอกไม้จีน
บุญชัย วรกิจพรสิน
สมาน ธูปเทียน
พงศ์ศักดิ์ แซ่แต้
อำนาจ ไวจงเจริญ
พรทิพย์ แซ่ลี้
อุไรวรรณ สาครสินธุ์
อำพล วีระตะนนท์
สมจิตร ใจวังโลก
สุเทพ ตันวินิจ
สวาท ทรัพย์มาก
สมศักดิ์ เจือจันทร์
ดัสซันซิงห์ กุลาตี
ธีร ศรแก้ว
พรรณยุพา ฮ่อสกุล
สำราญ จันทร์เอี่ยม
พจน์ มั่นกันนาน
สุธี บุณยเกียรติ
บุญโชติ ทิพย์ประเสริฐสิน
ประดิษฐ์ ทองพสิฐสมบัติ
จำเนียร เพ็งเจริญ
สมศักดิ์ อรุณรัตน์
อนุชา จารุหิรัญสกุล
พิกุล มโนภิญโญภิญญะ
ผ่องศรี นกแก้ว
อารี วิไลวรรณ
ณรงค์วิทย์ วิทสัทธาวรกุล
@@ -0,0 +1,54 @@
# This file is part of ICU4X. For terms of use, please see the file
# called LICENSE at the top level of the ICU4X source tree
# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

ممارسة مراعاة
العنصرية
حدود والشيخوخة
بالحكم كهذا ينتفع
البلاد
تربية
الغير التقدم والعدل
نحو بالتعليم والحرية
تأمين متساو
للتعليم فيها
آذت اعتداء للتعليم
ليس المتأصلة
والمساهمة الضروري تتناقض
وتأسيس
رضى
شرعي الطبية
لكيلا الجمعية والحرية
للرجال التزوج
بالكرامة
حرية بين
هذه العيش تنظر
قيد
يقررها والصداقة
اعتُمد وينبغي اجتماعي
حرمان
للإدراك بأجر إنتاجه
التربية القانون
لإنصافه وتأسيس وسمعته
أساسه للرجال
كافة
المجهود دولي أينما
وإلى
بنشاط تجري
والأمم مثل لحقوق
الإنسان بشروط بحماية
شرفه
كما الوظائف
حياته ديسمبر
ولما
هذه
غاية جديد إنسان
حرية
متهم الوطنية قدمًا
التملك وضع
شرعية ويعبر تأدية
بنظام عمل والأخلاق
التملك لشخصيته يلجأ
بحال يضطر ولا
الانضمام بالكرامة
عضوا
@@ -0,0 +1,54 @@
# This file is part of ICU4X. For terms of use, please see the file
# called LICENSE at the top level of the ICU4X source tree
# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

Herrschaft Freiheiten Not
Gewalt
stets anderer begründet
erhobenen innerstaatliche
Heiratsfähige freie
offenstehen Begrenzung grausamer
Maßnahmen höchste
unentbehrlich privat
erniedrigender
Verachtung freie
innezuhaben innerstaatlichen
kommen
werden gleichgültig
Würde überall höchste
Schutzmaßnahmen den Pflichten
Wille Bestimmung
Leibeigenschaft einschließlich für
gleiche bekräftigt Gewissens
Wohles
Generalversammlung
Volkes
Völkern gegenwärtig Zusammenarbeit
Heiratsfähige sowie Jeder
Stellung
Lebensstandard
seinem
Rede strafbaren Sicherheit
mit
Kulthandlungen Grund
ärztlicher
Auflösung Anforderungen anzugehören
Furcht
keine Geburt
Wohles Furcht genügen
befriedigende Medien
anzugehören Urlaub Vereinigungen
hinzuwirken verboten Resolution
kommen
sozialer vor irgendein
Bestimmung Bestimmung
Fall natürliche kein
Geschlecht Aufhetzung eigenen
seinen
über
Unterlassung Berücksichtigung
war
Rufes stets
Volkes anderer Beschränkungen
Handlungen dessen
Die
@@ -0,0 +1,54 @@
# This file is part of ICU4X. For terms of use, please see the file
# called LICENSE at the top level of the ICU4X source tree
# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

προάγει αλληλογραφία
λογική έχει
ιδρύει ζωή τεχνική
δυνατότητες
περιορισμό συνόλου
ασκεί παραγνώριση συναφθεί
αναγνωρίζουν ποινικής εκδηλώνει
κοινότητας διακυβέρνηση στα
απέναντι υψηλή
περιστάσεων αξιόποινη
σεβασμό
συντήρησής κατά εξασφαλίσουν
παραβιάζουν συμπληρώνεται νόμο
άμεσα
σημαίνει καθεστώς
ΑΝΘΡΩΠΙΝΑ θέλησης ανθρωπίνων
ΔΙΑΚΗΡΥΞΗ αθλιότητα ασφάλιση
μέσο
ίση Εχει
ειρήνης Κάθε
μέλη μορφή
όσο
κρατείται Στο Διακηρύσσει
οικονομικών έκφρασης εξασφαλίζεται
κάθε
περίπτωση απολαμβάνουν
ποινικό γεροντική
είναι μαζί δικαστήρια
μαζί προοπτική
δική
βαρβαρότητας
οικονομικών εξασφαλίσει
υποχρεώσεις οδήγησαν
Οικουμενική Διακήρυξης γονείς
στις μυστική αντιπροσώπους
Διακήρυξης άδειες βιοτικό
αναπηρία ομάδα
πραγματικό
καλύτερες
ανάπαυση
δίκαιες ένα δικαίου
μετέχει στους
θρησκευτικών ποινικής
Κανείς ίσα
πεποιθήσεις
πολιτικές ανάλογα δουλεία
πολιτικές ιατρική ωσότου
ηθικής χωρίς
ανδρών ικανό
καθώς
@@ -0,0 +1,54 @@
# This file is part of ICU4X. For terms of use, please see the file
# called LICENSE at the top level of the ICU4X source tree
# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

duración común
delito reconocimiento alimentación
inalienables
entre seguridad escogidos
comportarse dignidad
autónomo gobierno tiempo
omisiones
comisión
Derechos territorios
debe
han
regresar inalienables
regresar
desempleo científico
arbitrariamente proclamada
están contraerse esposos
cualesquiera
salir carácter desarrollo
solamente justas
personalidad una
cuanto
garantice resolución
concepción
tomar impondrá
cualquier reconocimiento
obligatoria obligatoria satisfactoria
acusación sin
artísticas penal culturales
pagadas examen
Además Organización dignidad
opresión esposos ejercidos
barbarie están mientras
por
idioma
recursos pagadas
materia Nada ella
con injerencias
inspirándose
organización
gozar jurisdicción
que
asegurar
humana libertad
nadie equivalente
escoger remuneración
torturas
individuos poder
disfruten seres Preámbulo
desempleo
liberados
@@ -0,0 +1,54 @@
# This file is part of ICU4X. For terms of use, please see the file
# called LICENSE at the top level of the ICU4X source tree
# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

conforme êtres fonctions
non tout généralisé
premier lui
faire hommes d’égalité
peuple volonté bénéficier
générale nationales
cruels plus
d’encourager opinions
genre l’esprit
d’origine effectif
exigences auront
résultent situation recevoir
peuples Chacun
sont d’égalité
jouissent
auront l’esprit
pays telle
publiquement
mariage foi
travail démocratique religieux
rémunération
omissions telles
L’éducation
raison complétée donner
invoqué auront arbitraires
l’amitié suffisant affaires
travaille l’accomplissement l’intermédiaire
race
opinions celles
assurer par privée
valeur
violant traite premier
inhérente
bienfaits l’avènement
Unies s’il actions
inquiété l’esclavage
inquiété
esclaves lieu
salaire
par
toute
innocente procédure membres
arts l’idéal envers
suffrage territoires inhumains
d’immixtions l’organisation progrès
comme égalité Unies
maternité
violerait suprême sécurité
impliquant eux loisirs
nationalité
@@ -0,0 +1,54 @@
# This file is part of ICU4X. For terms of use, please see the file
# called LICENSE at the top level of the ICU4X source tree
# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

זקנה משפילים
ינתן חברתי עניניו
הפוב
ולהיות זכויות הישגים
יאסרו מטעמי וללא
ספרותית השלם
למנוחה חינם
וההתאגדות
לטפח
באלה במלואן
יהנו
ולרווחתם לגבר האדם
בכבודו שבארצות כבוד
ובינלאומיים
בכך לתנאי אישי
שאינן
שרירותי
במשפט
ולעקרונותיהן מטעם
שרירותית האשמה יהיה
החינוך ולבטחון
סובלנות אשמתו במגילה
המאוחדות חיוני
חשוב במקרה
כלתי העולם
שמקורה כציבור
לשויון
לתקנה
תלוי ההתאספות
הדיבור שהוא
והבלתי והבסיסית
ולעקרונותיהן יהא וישאף
ביתנ הבינלאומי
והזלזול להקנות
בגלל כולם שיושלם
לחיים
בדבר
לשירות
זכויות
לפני
אדם ולא מזזמנות
קנינו שהיה ההתאספות
בינלאומי חיוניות לבקש
תהיינה
ובזכות בכורה מהגנה
מתוך
ובמצפון מזומנות לאגד
והחמריים סוציאלי
אנושיים ובהצבעה
פראיים
@@ -0,0 +1,54 @@
# This file is part of ICU4X. For terms of use, please see the file
# called LICENSE at the top level of the ICU4X source tree
# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

uciskowi posiadania prawo
społecznego największych skazany
czy
potrzeby samodzielnie przystępowania
Krzewi też dokonania
pełną prawo
buntu
moralności
zapewnienia znaczenie
nieludzki wypadek Nikt
zasadności jakikolwiek Każdy
samowolnie krajem
międzynarodowego
członek wielu
rozwój wynikających obalenia
rasy
grudnia która
jedynie urlopu ani
małżeńskie stanowi ustaniu
człowieka postępowych
prześladowania
politycznej które zawarcia
Deklaracja
ingerować wyłącznie
studia Nikt
innego uprawianie zrozumienie
wybranych swobodę wyznania
wolni osobowości
ograniczenie Nie
równej społecznego uciekać
będącą POWSZECHNA
niezdolności poszukiwania międzynarodowej
konieczne potrzeby posiada
opinii wychowywania 1948
międzynarodowej zatrzymać
przedstawicieli
przeciw
wynikających organy pracę
człowiek grupami
niezbędnych
wolności podstawowym
opinii małżonków wolność
postępować zdecydowanie komórką
odniesieniu
pokoju azyl
zawodowych powrócić człowiek
konstytucję
takiej postaciach powszechnego
wygnać wygnać
wspólny poszanowania
@@ -0,0 +1,54 @@
# This file is part of ICU4X. For terms of use, please see the file
# called LICENSE at the top level of the ICU4X source tree
# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

областях
будут должен
обеспечиваются нежели
котором Уставе
социального моральных
совершеннолетия предоставление
том независимо
существование
вмешательства какому ограниченной
распространять
находить помощь
искусством
унижающим положения искать
изгнанию член совершеннолетия
обществом имуществом государственной
идеи братства
наслаждаться значение социальной
осуществления юрисдикцией наказанию
достойное свою III
жизнь расторжения инвалидности
терпимости этого
целях равны
обеспечиваются законным
принуждаем правосубъектности
пыткам доступа неприкосновенность
Брак против
прибегать независимой
человека человеческой
быть независимо религии
публичным
членам против
разумом результатом семью
Принята участие
беспристрастным тем
частным основной
правового
страной обслуживание
было свободу полное
рабочего свободны
состоянии помощь религиозными
полное
владеть власти морали
меньшей
братства социальному убежища
государств
равны который дети
терпимости
получать бесплатным полного
богослужении
отдельным
@@ -0,0 +1,54 @@
# This file is part of ICU4X. For terms of use, please see the file
# called LICENSE at the top level of the ICU4X source tree
# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

คิด ใตัอำ เคลื่อนไหว
บังคับ บาก
สิ่ง สิ้น
วัตถุ
ชาย อาศัย เท่านั้น
สิน
เกา
ดูแล พิธีกรรม
ภายใน
เพศ
หนัก ประสงค์
เหตุ
งาน รักษา
เพศ ภาษา
นี้
คู่ สัญชาติ ต้องการ
วิธี ระหว่าง ตกลง
ทำนอง
สืบ กับ ศิลปกรรม
เหนือ วรรณกรรม
คิด การก หน้าที่
ชาติ ศิลปกรรม แต่
สามัญ สอด
เหยียด วิธี จุด
หน้า ถ้า เบื้อง
ประชุม
ศิลปกรรม
เสรีภาพ โหด ก่อ
เกียรติศักดิ์ ป่วย เอกราช
ประหัต มโนธรรม การ
แทน
ขัดขืน เวลา เสียง
กฎบัตร พยายาม
สิน หน้า
จำเป็น
ประชาธิปไตย หน่วย
กรณี จริงจัง
ทำนอง
ทาษ
เพิ่ม
บรรดา ขวาง
กักขัง
มนุษย์
ชาย ประกัน มนุษยธรรม
จะบัน มูลฐาน เถื่อน
พฤติ
มิได้
หญิง คู่
สมา ปฏิบัติ อนึ่ง
สิ่ง ทาษ
@@ -0,0 +1,54 @@
# This file is part of ICU4X. For terms of use, please see the file
# called LICENSE at the top level of the ICU4X source tree
# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

mecburidir ilim
isnadın sınırları suç
tutuklanamaz diğer
memleket korunmasi kullanılamaz
İnsanlık ilerlemeyi
bir mülk menfaatlerinin
usul zümreye herhangi
mahkeme vicdana ilerleyişe
zulüm zalimane
ilim öncelikle çocuk
mevzubahis ancak
muamelesi dinlenmeye
eşitlikle prensiplerine ülkenin
öğretim bulunmalarına yardım
memleketler amacıyla
birbirlerine
olmalıdır
bırakılamaz serbestisine
hürriyetin iyi
hükmü işbu zalimane
evlenme memleketi tedbirlerle
evlenmek ahalisi işini
hürriyetler
belirlenmiş kere
elde cürüme
tanınan dünyaca yüksek
müddetinin ailesine
vicdan kırıcı itibariyle
geniş inanma
kendi görevleri Teşkilatı
yaymak
öğretim vesayet
renk kişiliğinin
tamamlanan
haklara bulunma
hükmü uygulanabilecek
etmiş geliştirilmesini hoşgörü
sahiptir temel
giyim
Bundan temeli
icaplarını
mülk karışma tekmil
vicdana hürriyetine işini
Herkesin vahşiliklere
dolaşma dünyanın
davasının Uluslararasında idamesi
eşittir
haklardan hakkı
kovuşturmalar hürriyetlerden gözönünde
Evrensel fiilli beyannamesi
@@ -0,0 +1,58 @@
# This file is part of ICU4X. For terms of use, please see the file
# called LICENSE at the top level of the ICU4X source tree
# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

# The contents of this file have been translated by "Google Translate".

Vào những năm cuối của thế kỷ 19, không ai có thể tin rằng thế giới này
đang được theo dõi một cách sâu sắc và chặt chẽ bởi những trí thông minh
lớn hơn con người nhưng cũng nguy hiểm như chính con người; rằng khi con
người bận rộn với những mối quan tâm khác nhau của họ, họ bị xem xét và
nghiên cứu kỹ lưỡng, có lẽ gần như một người đàn ông với kính hiển vi có thể
xem xét kỹ lưỡng những sinh vật nhất thời tụ tập và sinh sôi nảy nở trong
một giọt nước. Với sự tự mãn vô hạn, con người đi đi lại lại khắp thế giới
này chỉ vì những công việc nhỏ nhặt của họ, thanh thản với niềm tin chắc
chắn về đế chế của họ đối với vật chất. Có thể là infusoria dưới kính hiển
vi cũng làm như vậy. Không ai coi các thế giới cũ hơn trong không gian là
nguồn gây nguy hiểm cho con người, hoặc nghĩ về chúng chỉ để bác bỏ ý
tưởng về sự sống đối với chúng là không thể hoặc không thể xảy ra.
Thật tò mò khi nhớ lại một số thói quen tinh thần của những ngày đã
qua. Hầu hết những người trên trái đất đều tưởng tượng rằng có thể có
những người khác trên sao Hỏa, có lẽ thấp kém hơn họ và sẵn sàng chào
đón một doanh nghiệp truyền giáo. Tuy nhiên, bên kia vịnh không gian,
những bộ óc đối với tâm trí của chúng ta cũng như tâm trí của chúng ta đối
với những con thú bị diệt vong, những bộ óc rộng lớn, lạnh lùng và vô cảm,
nhìn trái đất này với con mắt ghen tị, và dần dần và chắc chắn vạch ra
những kế hoạch chống lại chúng ta. Và đầu thế kỷ 20 đã xảy ra sự vỡ mộng
lớn. Hành tinh sao Hỏa, tôi không cần nhắc độc giả, quay xung quanh mặt
trời ở khoảng cách trung bình 140.000.000 dặm, và ánh sáng và nhiệt mà
nó nhận được từ mặt trời chỉ bằng một nửa so với thế giới này nhận được.
Nếu giả thuyết về tinh vân có bất kỳ sự thật nào, nó phải tồn tại lâu
đời hơn thế giới của chúng ta; và rất lâu trước khi trái đất này ngừng
nóng chảy, sự sống trên bề mặt của nó hẳn đã bắt đầu quá trình của nó.
Thực tế là nó chỉ chiếm một phần bảy thể tích của trái đất đã làm tăng
tốc độ nguội đi của nó đến nhiệt độ mà sự sống có thể bắt đầu. Nó có
không khí và nước và tất cả những gì cần thiết để hỗ trợ sự tồn tại
sinh động. Tuy nhiên, con người quá hão huyền và bị mù quáng bởi sự phù
phiếm của mình, đến nỗi cho đến tận cuối thế kỷ 19, không có nhà văn nào
bày tỏ bất kỳ ý tưởng nào rằng sự sống thông minh có thể đã phát triển ở đó xa,
hoặc thực sự là ở tất cả, vượt ra ngoài mức độ trần gian của nó. Người ta
cũng không hiểu một cách tổng quát rằng vì sao Hỏa già hơn trái đất của chúng
ta, chỉ bằng một phần tư diện tích bề mặt và ở xa mặt trời hơn, nên điều tất
yếu dẫn đến là nó không chỉ xa hơn so với thời điểm bắt đầu mà còn gần ngày kết
thúc hơn. Sự nguội lạnh thế tục mà một ngày nào đó phải vượt qua hành tinh của chúng
ta đã thực sự đi xa với người hàng xóm của chúng ta. Tình trạng vật lý của nó phần lớn
vẫn còn là một bí ẩn, nhưng giờ đây chúng ta biết rằng ngay cả ở vùng xích đạo của nó,
nhiệt độ giữa trưa hầu như không bằng nhiệt độ của mùa đông lạnh nhất của chúng ta.
Không khí của nó loãng hơn nhiều so với không khí của chúng ta, các đại dương của nó đã
thu hẹp lại cho đến khi chỉ bao phủ một phần ba bề mặt của nó, và khi các mùa chậm chạp
của nó thay đổi, các chỏm tuyết khổng lồ tụ lại và tan chảy ở hai cực và định kỳ làm ngập các vùng ôn đới của nó.
Giai đoạn cuối cùng của sự kiệt sức, mà đối với chúng ta vẫn còn quá xa vời, đã trở thành
một vấn đề ngày nay đối với các cư dân trên sao Hỏa. Áp lực trước mắt của sự cần
thiết đã làm sáng tỏ trí tuệ của họ, mở rộng sức mạnh của họ và làm chai đá trái
tim họ. Và nhìn xuyên qua không gian với các công cụ, và trí thông minh như chúng
ta hiếm khi mơ tới, họ thấy, ở khoảng cách gần nhất chỉ cách họ 35.000.000 dặm
về phía mặt trời, một ngôi sao buổi sáng của hy vọng, hành tinh ấm áp hơn của chúng
ta, màu xanh lục của thảm thực vật và màu xám của nước , với bầu không khí nhiều
mây hùng hồn của sự màu mỡ, với những cái nhìn thoáng qua qua những đám mây
trôi dạt của nó là những dải đất rộng lớn đông dân và những vùng biển chật hẹp đông đúc hải quân.
@@ -0,0 +1,213 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

use criterion::{black_box, BenchmarkId, Criterion};

use icu_normalizer::{ComposingNormalizer, DecomposingNormalizer};

struct BenchDataContent {
    pub file_name: String,
    pub nfc: String,
    pub nfd: String,
    pub nfkc: String,
    pub nfkd: String,
    pub nfc_u16: Vec<u16>,
    pub nfd_u16: Vec<u16>,
    pub nfkc_u16: Vec<u16>,
    pub nfkd_u16: Vec<u16>,
}

fn strip_headers(content: &str) -> String {
    content
        .lines()
        .filter(|&s| !s.starts_with('#'))
        .map(|s| s.to_owned())
        .collect::<Vec<String>>()
        .join("\n")
}

fn normalizer_bench_data() -> [BenchDataContent; 15] {
    let nfc_normalizer: ComposingNormalizer = ComposingNormalizer::new_nfc();
    let nfd_normalizer: DecomposingNormalizer = DecomposingNormalizer::new_nfd();
    let nfkc_normalizer: ComposingNormalizer = ComposingNormalizer::new_nfkc();
    let nfkd_normalizer: DecomposingNormalizer = DecomposingNormalizer::new_nfkd();

    let content_latin: (&str, &str) = (
        "TestNames_Latin",
        &strip_headers(include_str!("./data/TestNames_Latin.txt")),
    );
    let content_jp_h: (&str, &str) = (
        "TestNames_Japanese_h",
        &strip_headers(include_str!("./data/TestNames_Japanese_h.txt")),
    );
    let content_jp_k: (&str, &str) = (
        "TestNames_Japanese_k",
        &strip_headers(include_str!("./data/TestNames_Japanese_k.txt")),
    );
    let content_korean: (&str, &str) = (
        "TestNames_Korean",
        &strip_headers(include_str!("./data/TestNames_Korean.txt")),
    );
    let content_random_words_ar: (&str, &str) = (
        "TestRandomWordsUDHR_ar",
        &strip_headers(include_str!("./data/TestRandomWordsUDHR_ar.txt")),
    );
    let content_random_words_de: (&str, &str) = (
        "TestRandomWordsUDHR_de",
        &strip_headers(include_str!("./data/TestRandomWordsUDHR_de.txt")),
    );
    let content_random_words_el: (&str, &str) = (
        "TestRandomWordsUDHR_el",
        &strip_headers(include_str!("./data/TestRandomWordsUDHR_el.txt")),
    );
    let content_random_words_es: (&str, &str) = (
        "TestRandomWordsUDHR_es",
        &strip_headers(include_str!("./data/TestRandomWordsUDHR_es.txt")),
    );
    let content_random_words_fr: (&str, &str) = (
        "TestRandomWordsUDHR_fr",
        &strip_headers(include_str!("./data/TestRandomWordsUDHR_fr.txt")),
    );
    let content_random_words_he: (&str, &str) = (
        "TestRandomWordsUDHR_he",
        &strip_headers(include_str!("./data/TestRandomWordsUDHR_he.txt")),
    );
    let content_random_words_pl: (&str, &str) = (
        "TestRandomWordsUDHR_pl",
        &strip_headers(include_str!("./data/TestRandomWordsUDHR_pl.txt")),
    );
    let content_random_words_ru: (&str, &str) = (
        "TestRandomWordsUDHR_ru",
        &strip_headers(include_str!("./data/TestRandomWordsUDHR_ru.txt")),
    );
    let content_random_words_th: (&str, &str) = (
        "TestRandomWordsUDHR_th",
        &strip_headers(include_str!("./data/TestRandomWordsUDHR_th.txt")),
    );
    let content_random_words_tr: (&str, &str) = (
        "TestRandomWordsUDHR_tr",
        &strip_headers(include_str!("./data/TestRandomWordsUDHR_tr.txt")),
    );
    let content_viet: (&str, &str) = ("wotw", &strip_headers(include_str!("./data/wotw.txt")));

    [
        content_latin,
        content_viet,
        content_jp_k,
        content_jp_h,
        content_korean,
        content_random_words_ru,
        content_random_words_ar,
        content_random_words_el,
        content_random_words_es,
        content_random_words_fr,
        content_random_words_tr,
        content_random_words_th,
        content_random_words_pl,
        content_random_words_he,
        content_random_words_de,
    ]
    .map(|(file_name, raw_content)| {
        let nfc = &nfc_normalizer.normalize(raw_content);
        let nfd = &nfd_normalizer.normalize(raw_content);
        let nfkc = &nfkc_normalizer.normalize(raw_content);
        let nfkd = &nfkd_normalizer.normalize(raw_content);
        BenchDataContent {
            file_name: file_name.to_owned(),
            nfc: nfc.to_owned(),
            nfd: nfd.to_owned(),
            nfkc: nfkc.to_owned(),
            nfkd: nfkd.to_owned(),
            nfc_u16: nfc.encode_utf16().collect(),
            nfd_u16: nfd.encode_utf16().collect(),
            nfkc_u16: nfkc.encode_utf16().collect(),
            nfkd_u16: nfkd.encode_utf16().collect(),
        }
    })
}

fn function_under_bench(normalizer: &DecomposingNormalizer, text: &str) {
    normalizer.normalize(text);
}

fn function_under_bench_u16(normalizer: &DecomposingNormalizer, text: &[u16]) {
    normalizer.normalize_utf16(text);
}

pub fn criterion_benchmark(criterion: &mut Criterion) {
    let group_name = "decomposing_normalizer_nfd";

    let normalizer_under_bench: DecomposingNormalizer = DecomposingNormalizer::new_nfd();

    let mut group = criterion.benchmark_group(group_name);

    for bench_data_content in black_box(normalizer_bench_data()) {
        group.bench_function(
            BenchmarkId::from_parameter(format!("from_nfc_{}", bench_data_content.file_name)),
            |bencher| {
                bencher
                    .iter(|| function_under_bench(&normalizer_under_bench, &bench_data_content.nfc))
            },
        );
        group.bench_function(
            BenchmarkId::from_parameter(format!("from_nfd_{}", bench_data_content.file_name)),
            |bencher| {
                bencher
                    .iter(|| function_under_bench(&normalizer_under_bench, &bench_data_content.nfd))
            },
        );
        group.bench_function(
            BenchmarkId::from_parameter(format!("from_nfkc_{}", bench_data_content.file_name)),
            |bencher| {
                bencher.iter(|| {
                    function_under_bench(&normalizer_under_bench, &bench_data_content.nfkc)
                })
            },
        );
        group.bench_function(
            BenchmarkId::from_parameter(format!("from_nfkd_{}", bench_data_content.file_name)),
            |bencher| {
                bencher.iter(|| {
                    function_under_bench(&normalizer_under_bench, &bench_data_content.nfkd)
                })
            },
        );

        // UTF 16
        group.bench_function(
            BenchmarkId::from_parameter(format!("from_nfc_{}_u16", bench_data_content.file_name)),
            |bencher| {
                bencher.iter(|| {
                    function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfc_u16)
                })
            },
        );
        group.bench_function(
            BenchmarkId::from_parameter(format!("from_nfd_{}_u16", bench_data_content.file_name)),
            |bencher| {
                bencher.iter(|| {
                    function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfd_u16)
                })
            },
        );
        group.bench_function(
            BenchmarkId::from_parameter(format!("from_nfkc_{}_u16", bench_data_content.file_name)),
            |bencher| {
                bencher.iter(|| {
                    function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfkc_u16)
                })
            },
        );
        group.bench_function(
            BenchmarkId::from_parameter(format!("from_nfkd_{}_u16", bench_data_content.file_name)),
            |bencher| {
                bencher.iter(|| {
                    function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfkd_u16)
                })
            },
        );
    }

    group.finish();
}
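The `*_u16` benches above feed pre-encoded UTF-16 buffers to the normalizer's `normalize_utf16` entry point. A minimal sketch of that call pattern outside the Criterion harness (assuming the default `compiled_data` feature of `icu_normalizer`; this sketch is not part of the diff):

fn nfd_utf16_example() {
    use icu_normalizer::DecomposingNormalizer;

    let nfd = DecomposingNormalizer::new_nfd();
    // Prepare UTF-16 input the same way normalizer_bench_data() does.
    let utf16: Vec<u16> = "é".encode_utf16().collect();
    // normalize_utf16 returns a freshly allocated, normalized UTF-16 buffer.
    let normalized: Vec<u16> = nfd.normalize_utf16(&utf16);
    let as_string: String = char::decode_utf16(normalized).map(|r| r.unwrap()).collect();
    assert_eq!(as_string, "e\u{0301}"); // U+00E9 decomposes to 'e' + combining acute
}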
@@ -0,0 +1,211 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

use criterion::{black_box, BenchmarkId, Criterion};

use icu_normalizer::{ComposingNormalizer, DecomposingNormalizer};

struct BenchDataContent {
    pub file_name: String,
    pub nfc: String,
    pub nfd: String,
    pub nfkc: String,
    pub nfkd: String,
    pub nfc_u16: Vec<u16>,
    pub nfd_u16: Vec<u16>,
    pub nfkc_u16: Vec<u16>,
    pub nfkd_u16: Vec<u16>,
}

fn strip_headers(content: &str) -> String {
    content
        .lines()
        .filter(|&s| !s.starts_with('#'))
        .map(|s| s.to_owned())
        .collect::<Vec<String>>()
        .join("\n")
}

fn normalizer_bench_data() -> [BenchDataContent; 15] {
    let nfc_normalizer: ComposingNormalizer = ComposingNormalizer::new_nfc();
    let nfd_normalizer: DecomposingNormalizer = DecomposingNormalizer::new_nfd();
    let nfkc_normalizer: ComposingNormalizer = ComposingNormalizer::new_nfkc();
    let nfkd_normalizer: DecomposingNormalizer = DecomposingNormalizer::new_nfkd();

    let content_latin: (&str, &str) = (
        "TestNames_Latin",
        &strip_headers(include_str!("./data/TestNames_Latin.txt")),
    );
    let content_jp_h: (&str, &str) = (
        "TestNames_Japanese_h",
        &strip_headers(include_str!("./data/TestNames_Japanese_h.txt")),
    );
    let content_jp_k: (&str, &str) = (
        "TestNames_Japanese_k",
        &strip_headers(include_str!("./data/TestNames_Japanese_k.txt")),
    );
    let content_korean: (&str, &str) = (
        "TestNames_Korean",
        &strip_headers(include_str!("./data/TestNames_Korean.txt")),
    );
    let content_random_words_ar: (&str, &str) = (
        "TestRandomWordsUDHR_ar",
        &strip_headers(include_str!("./data/TestRandomWordsUDHR_ar.txt")),
    );
    let content_random_words_de: (&str, &str) = (
        "TestRandomWordsUDHR_de",
        &strip_headers(include_str!("./data/TestRandomWordsUDHR_de.txt")),
    );
    let content_random_words_el: (&str, &str) = (
        "TestRandomWordsUDHR_el",
        &strip_headers(include_str!("./data/TestRandomWordsUDHR_el.txt")),
    );
    let content_random_words_es: (&str, &str) = (
        "TestRandomWordsUDHR_es",
        &strip_headers(include_str!("./data/TestRandomWordsUDHR_es.txt")),
    );
    let content_random_words_fr: (&str, &str) = (
        "TestRandomWordsUDHR_fr",
        &strip_headers(include_str!("./data/TestRandomWordsUDHR_fr.txt")),
    );
    let content_random_words_he: (&str, &str) = (
        "TestRandomWordsUDHR_he",
        &strip_headers(include_str!("./data/TestRandomWordsUDHR_he.txt")),
    );
    let content_random_words_pl: (&str, &str) = (
        "TestRandomWordsUDHR_pl",
        &strip_headers(include_str!("./data/TestRandomWordsUDHR_pl.txt")),
    );
    let content_random_words_ru: (&str, &str) = (
        "TestRandomWordsUDHR_ru",
        &strip_headers(include_str!("./data/TestRandomWordsUDHR_ru.txt")),
    );
    let content_random_words_th: (&str, &str) = (
        "TestRandomWordsUDHR_th",
        &strip_headers(include_str!("./data/TestRandomWordsUDHR_th.txt")),
    );
    let content_random_words_tr: (&str, &str) = (
        "TestRandomWordsUDHR_tr",
        &strip_headers(include_str!("./data/TestRandomWordsUDHR_tr.txt")),
    );
    let content_viet: (&str, &str) = ("wotw", &strip_headers(include_str!("./data/wotw.txt")));

    [
        content_latin,
        content_viet,
        content_jp_k,
        content_jp_h,
        content_korean,
        content_random_words_ru,
        content_random_words_ar,
        content_random_words_el,
        content_random_words_es,
        content_random_words_fr,
        content_random_words_tr,
        content_random_words_th,
        content_random_words_pl,
        content_random_words_he,
        content_random_words_de,
    ]
    .map(|(file_name, raw_content)| {
        let nfc = &nfc_normalizer.normalize(raw_content);
        let nfd = &nfd_normalizer.normalize(raw_content);
        let nfkc = &nfkc_normalizer.normalize(raw_content);
        let nfkd = &nfkd_normalizer.normalize(raw_content);
        BenchDataContent {
            file_name: file_name.to_owned(),
            nfc: nfc.to_owned(),
            nfd: nfd.to_owned(),
            nfkc: nfkc.to_owned(),
            nfkd: nfkd.to_owned(),
            nfc_u16: nfc.encode_utf16().collect(),
            nfd_u16: nfd.encode_utf16().collect(),
            nfkc_u16: nfkc.encode_utf16().collect(),
            nfkd_u16: nfkd.encode_utf16().collect(),
        }
    })
}

fn function_under_bench(normalizer: &DecomposingNormalizer, text: &str) {
    normalizer.normalize(text);
}

fn function_under_bench_u16(normalizer: &DecomposingNormalizer, text: &[u16]) {
    normalizer.normalize_utf16(text);
}

pub fn criterion_benchmark(criterion: &mut Criterion) {
    let group_name = "decomposing_normalizer_nfkd";

    let normalizer_under_bench: DecomposingNormalizer = DecomposingNormalizer::new_nfkd();

    let mut group = criterion.benchmark_group(group_name);
    for bench_data_content in black_box(normalizer_bench_data()) {
        group.bench_function(
            BenchmarkId::from_parameter(format!("from_nfc_{}", bench_data_content.file_name)),
            |bencher| {
                bencher
                    .iter(|| function_under_bench(&normalizer_under_bench, &bench_data_content.nfc))
            },
        );
        group.bench_function(
            BenchmarkId::from_parameter(format!("from_nfd_{}", bench_data_content.file_name)),
            |bencher| {
                bencher
                    .iter(|| function_under_bench(&normalizer_under_bench, &bench_data_content.nfd))
            },
        );
        group.bench_function(
            BenchmarkId::from_parameter(format!("from_nfkc_{}", bench_data_content.file_name)),
            |bencher| {
                bencher.iter(|| {
                    function_under_bench(&normalizer_under_bench, &bench_data_content.nfkc)
                })
            },
        );
        group.bench_function(
            BenchmarkId::from_parameter(format!("from_nfkd_{}", bench_data_content.file_name)),
            |bencher| {
                bencher.iter(|| {
                    function_under_bench(&normalizer_under_bench, &bench_data_content.nfkd)
                })
            },
        );

        // UTF 16
        group.bench_function(
            BenchmarkId::from_parameter(format!("from_nfc_{}_u16", bench_data_content.file_name)),
            |bencher| {
                bencher.iter(|| {
                    function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfc_u16)
                })
            },
        );
        group.bench_function(
            BenchmarkId::from_parameter(format!("from_nfd_{}_u16", bench_data_content.file_name)),
            |bencher| {
                bencher.iter(|| {
                    function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfd_u16)
                })
            },
        );
        group.bench_function(
            BenchmarkId::from_parameter(format!("from_nfkc_{}_u16", bench_data_content.file_name)),
            |bencher| {
                bencher.iter(|| {
                    function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfkc_u16)
                })
            },
        );
        group.bench_function(
            BenchmarkId::from_parameter(format!("from_nfkd_{}_u16", bench_data_content.file_name)),
            |bencher| {
                bencher.iter(|| {
                    function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfkd_u16)
                })
            },
        );
    }
    group.finish();
}
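Neither bench listing shows how `criterion_benchmark` is driven; a Criterion bench target normally registers it with the `criterion_group!` and `criterion_main!` macros. A sketch of the footer such a target would need (the group name `benches` is an arbitrary placeholder, not taken from this diff):

use criterion::{criterion_group, criterion_main};

// Register criterion_benchmark and generate the bench entry point.
criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);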
@@ -0,0 +1,42 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

//! Normalizer-specific error

use displaydoc::Display;
use icu_properties::PropertiesError;
use icu_provider::DataError;

/// A list of error outcomes for various operations in this module.
///
/// Re-exported as [`Error`](crate::Error).
#[derive(Display, Debug)]
#[non_exhaustive]
pub enum NormalizerError {
    /// Error coming from the data provider
    #[displaydoc("{0}")]
    Data(DataError),
    /// The data uses a planned but unsupported feature.
    FutureExtension,
    /// Data failed manual validation
    ValidationError,
}

#[cfg(feature = "std")]
impl std::error::Error for NormalizerError {}

impl From<DataError> for NormalizerError {
    fn from(e: DataError) -> Self {
        NormalizerError::Data(e)
    }
}

impl From<PropertiesError> for NormalizerError {
    fn from(e: PropertiesError) -> Self {
        match e {
            PropertiesError::PropDataLoad(d) => NormalizerError::Data(d),
            _ => unreachable!("Shouldn't have non-Data PropertiesError"),
        }
    }
}
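Because the enum is `#[non_exhaustive]`, downstream matches must carry a wildcard arm. A minimal sketch of consuming the error through its documented crate-root re-export (`icu_normalizer::Error`; the `describe` helper is hypothetical, not part of the crate):

// Hypothetical helper, not part of icu_normalizer.
fn describe(e: &icu_normalizer::Error) -> &'static str {
    use icu_normalizer::Error;
    match e {
        Error::Data(_) => "data provider error",
        Error::FutureExtension => "data uses a planned but unsupported feature",
        Error::ValidationError => "data failed manual validation",
        // Required by #[non_exhaustive]: variants added later land here.
        _ => "unrecognized normalizer error",
    }
}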
[Diff for one file not shown because of its size.]
@ -0,0 +1,520 @@
|
|||
// This file is part of ICU4X. For terms of use, please see the file
|
||||
// called LICENSE at the top level of the ICU4X source tree
|
||||
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
|
||||
|
||||
//! Access to the Unicode properties or property-based operations that
|
||||
//! are required for NFC and NFD.
|
||||
//!
|
||||
//! Applications should generally use the full normalizers that are
|
||||
//! provided at the top level of this crate. However, the APIs in this
|
||||
//! module are provided for callers such as HarfBuzz that specifically
|
||||
//! want access to the raw canonical composition operation e.g. for use in a
|
||||
//! glyph-availability-guided custom normalizer.
|
||||
|
||||
use crate::char_from_u16;
|
||||
use crate::error::NormalizerError;
|
||||
use crate::in_inclusive_range;
|
||||
use crate::provider::CanonicalCompositionsV1Marker;
|
||||
use crate::provider::CanonicalDecompositionDataV1Marker;
|
||||
use crate::provider::CanonicalDecompositionTablesV1Marker;
|
||||
use crate::provider::NonRecursiveDecompositionSupplementV1Marker;
|
||||
use crate::trie_value_has_ccc;
|
||||
use crate::trie_value_indicates_special_non_starter_decomposition;
|
||||
use crate::BACKWARD_COMBINING_STARTER_MARKER;
|
||||
use crate::FDFA_MARKER;
|
||||
use crate::HANGUL_L_BASE;
|
||||
use crate::HANGUL_N_COUNT;
|
||||
use crate::HANGUL_S_BASE;
|
||||
use crate::HANGUL_S_COUNT;
|
||||
use crate::HANGUL_T_BASE;
|
||||
use crate::HANGUL_T_COUNT;
|
||||
use crate::HANGUL_V_BASE;
|
||||
use crate::NON_ROUND_TRIP_MARKER;
|
||||
use crate::SPECIAL_NON_STARTER_DECOMPOSITION_MARKER_U16;
|
||||
/// want access to the underlying properties e.g. for use in a
|
||||
/// glyph-availability-guided custom normalizer.
|
||||
use icu_properties::CanonicalCombiningClass;
|
||||
use icu_provider::prelude::*;
|
||||
|
||||
/// The raw canonical composition operation.
|
||||
///
|
||||
/// Callers should generally use `ComposingNormalizer` instead of this API.
|
||||
/// However, this API is provided for callers such as HarfBuzz that specifically
|
||||
/// want access to the raw canonical composition operation e.g. for use in a
|
||||
/// glyph-availability-guided custom normalizer.
|
||||
#[derive(Debug)]
|
||||
pub struct CanonicalComposition {
|
||||
canonical_compositions: DataPayload<CanonicalCompositionsV1Marker>,
|
||||
}
|
||||
|
||||
#[cfg(feature = "compiled_data")]
|
||||
impl Default for CanonicalComposition {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl CanonicalComposition {
|
||||
/// Performs canonical composition (including Hangul) on a pair of
|
||||
/// characters or returns `None` if these characters don't compose.
|
||||
/// Composition exclusions are taken into account.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// ```
|
||||
/// let comp = icu::normalizer::properties::CanonicalComposition::new();
|
||||
///
|
||||
/// assert_eq!(comp.compose('a', 'b'), None); // Just two non-composing starters
|
||||
/// assert_eq!(comp.compose('a', '\u{0308}'), Some('ä'));
|
||||
/// assert_eq!(comp.compose('ẹ', '\u{0302}'), Some('ệ'));
|
||||
/// assert_eq!(comp.compose('𝅗', '𝅥'), None); // Composition exclusion
|
||||
/// assert_eq!(comp.compose('ে', 'া'), Some('ো')); // Second is starter
|
||||
/// assert_eq!(comp.compose('ᄀ', 'ᅡ'), Some('가')); // Hangul LV
|
||||
/// assert_eq!(comp.compose('가', 'ᆨ'), Some('각')); // Hangul LVT
|
||||
/// ```
|
||||
#[inline(always)]
|
||||
pub fn compose(&self, starter: char, second: char) -> Option<char> {
|
||||
crate::compose(
|
||||
self.canonical_compositions
|
||||
.get()
|
||||
.canonical_compositions
|
||||
.iter(),
|
||||
starter,
|
||||
second,
|
||||
)
|
||||
}
|
||||
|
||||
/// Constructs a new `CanonicalComposition` using compiled data.
|
||||
///
|
||||
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
|
||||
///
|
||||
/// [📚 Help choosing a constructor](icu_provider::constructors)
|
||||
#[cfg(feature = "compiled_data")]
|
||||
pub const fn new() -> Self {
|
||||
Self {
|
||||
canonical_compositions: DataPayload::from_static_ref(
|
||||
crate::provider::Baked::SINGLETON_NORMALIZER_COMP_V1,
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
icu_provider::gen_any_buffer_data_constructors!(locale: skip, options: skip, error: NormalizerError,
|
||||
#[cfg(skip)]
|
||||
functions: [
|
||||
new,
|
||||
try_new_with_any_provider,
|
||||
try_new_with_buffer_provider,
|
||||
try_new_unstable,
|
||||
Self,
|
||||
]
|
||||
);
|
||||
|
||||
#[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new)]
|
||||
pub fn try_new_unstable<D>(provider: &D) -> Result<Self, NormalizerError>
|
||||
where
|
||||
D: DataProvider<CanonicalCompositionsV1Marker> + ?Sized,
|
||||
{
|
||||
let canonical_compositions: DataPayload<CanonicalCompositionsV1Marker> =
|
||||
provider.load(Default::default())?.take_payload()?;
|
||||
Ok(CanonicalComposition {
|
||||
canonical_compositions,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// The outcome of non-recursive canonical decomposition of a character.
|
||||
#[allow(clippy::exhaustive_enums)]
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
pub enum Decomposed {
|
||||
/// The character is its own canonical decomposition.
|
||||
Default,
|
||||
/// The character decomposes to a single different character.
|
||||
Singleton(char),
|
||||
/// The character decomposes to two characters.
|
||||
Expansion(char, char),
|
||||
}
|
||||
|
||||
/// The raw (non-recursive) canonical decomposition operation.
|
||||
///
|
||||
/// Callers should generally use `DecomposingNormalizer` instead of this API.
|
||||
/// However, this API is provided for callers such as HarfBuzz that specifically
|
||||
/// want access to non-recursive canonical decomposition e.g. for use in a
|
||||
/// glyph-availability-guided custom normalizer.
|
||||
#[derive(Debug)]
|
||||
pub struct CanonicalDecomposition {
|
||||
decompositions: DataPayload<CanonicalDecompositionDataV1Marker>,
|
||||
tables: DataPayload<CanonicalDecompositionTablesV1Marker>,
|
||||
non_recursive: DataPayload<NonRecursiveDecompositionSupplementV1Marker>,
|
||||
}
|
||||
|
||||
#[cfg(feature = "compiled_data")]
|
||||
impl Default for CanonicalDecomposition {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl CanonicalDecomposition {
|
||||
/// Performs non-recursive canonical decomposition (including for Hangul).
|
||||
///
|
||||
/// ```
|
||||
/// use icu::normalizer::properties::Decomposed;
|
||||
/// let decomp = icu::normalizer::properties::CanonicalDecomposition::new();
|
||||
///
|
||||
/// assert_eq!(decomp.decompose('e'), Decomposed::Default);
|
||||
/// assert_eq!(
|
||||
/// decomp.decompose('ệ'),
|
||||
/// Decomposed::Expansion('ẹ', '\u{0302}')
|
||||
/// );
|
||||
/// assert_eq!(decomp.decompose('각'), Decomposed::Expansion('가', 'ᆨ'));
|
||||
/// assert_eq!(decomp.decompose('\u{212B}'), Decomposed::Singleton('Å')); // ANGSTROM SIGN
|
||||
/// assert_eq!(decomp.decompose('\u{2126}'), Decomposed::Singleton('Ω')); // OHM SIGN
|
||||
/// assert_eq!(decomp.decompose('\u{1F71}'), Decomposed::Singleton('ά')); // oxia
|
||||
/// ```
|
||||
#[inline]
|
||||
pub fn decompose(&self, c: char) -> Decomposed {
|
||||
let lvt = u32::from(c).wrapping_sub(HANGUL_S_BASE);
|
||||
if lvt >= HANGUL_S_COUNT {
|
||||
return self.decompose_non_hangul(c);
|
||||
}
|
||||
let t = lvt % HANGUL_T_COUNT;
|
||||
if t == 0 {
|
||||
let l = lvt / HANGUL_N_COUNT;
|
||||
let v = (lvt % HANGUL_N_COUNT) / HANGUL_T_COUNT;
|
||||
// Safe because values known to be in range
|
||||
return Decomposed::Expansion(
|
||||
unsafe { char::from_u32_unchecked(HANGUL_L_BASE + l) },
|
||||
unsafe { char::from_u32_unchecked(HANGUL_V_BASE + v) },
|
||||
);
|
||||
}
|
||||
let lv = lvt - t;
|
||||
// Safe because values known to be in range
|
||||
Decomposed::Expansion(
|
||||
unsafe { char::from_u32_unchecked(HANGUL_S_BASE + lv) },
|
||||
unsafe { char::from_u32_unchecked(HANGUL_T_BASE + t) },
|
||||
)
|
||||
}
|
||||
|
||||
/// Performs non-recursive canonical decomposition except Hangul syllables
|
||||
/// are reported as `Decomposed::Default`.
|
||||
#[inline(always)]
|
||||
fn decompose_non_hangul(&self, c: char) -> Decomposed {
|
||||
let decomposition = self.decompositions.get().trie.get(c);
|
||||
if decomposition <= BACKWARD_COMBINING_STARTER_MARKER {
|
||||
return Decomposed::Default;
|
||||
}
|
||||
// The loop is only broken out of as goto forward
|
||||
#[allow(clippy::never_loop)]
|
||||
loop {
|
||||
let trail_or_complex = (decomposition >> 16) as u16;
|
||||
let lead = decomposition as u16;
|
||||
if lead > NON_ROUND_TRIP_MARKER && trail_or_complex != 0 {
|
||||
// Decomposition into two BMP characters: starter and non-starter
|
||||
if in_inclusive_range(c, '\u{1F71}', '\u{1FFB}') {
|
||||
// Look in the other trie due to oxia singleton
|
||||
// mappings to corresponding character with tonos.
|
||||
break;
|
||||
}
|
||||
return Decomposed::Expansion(char_from_u16(lead), char_from_u16(trail_or_complex));
|
||||
}
|
||||
if lead > NON_ROUND_TRIP_MARKER {
|
||||
// Decomposition into one BMP character or non-starter
|
||||
debug_assert_ne!(
|
||||
lead, FDFA_MARKER,
|
||||
"How come we got the U+FDFA NFKD marker here?"
|
||||
);
|
||||
if lead == SPECIAL_NON_STARTER_DECOMPOSITION_MARKER_U16 {
|
||||
// Non-starter
|
||||
if !in_inclusive_range(c, '\u{0340}', '\u{0F81}') {
|
||||
return Decomposed::Default;
|
||||
}
|
||||
return match c {
|
||||
'\u{0340}' => {
|
||||
// COMBINING GRAVE TONE MARK
|
||||
Decomposed::Singleton('\u{0300}')
|
||||
}
|
||||
'\u{0341}' => {
|
||||
// COMBINING ACUTE TONE MARK
|
||||
Decomposed::Singleton('\u{0301}')
|
||||
}
|
||||
'\u{0343}' => {
|
||||
// COMBINING GREEK KORONIS
|
||||
Decomposed::Singleton('\u{0313}')
|
||||
}
|
||||
'\u{0344}' => {
|
||||
// COMBINING GREEK DIALYTIKA TONOS
|
||||
Decomposed::Expansion('\u{0308}', '\u{0301}')
|
||||
}
|
||||
'\u{0F73}' => {
|
||||
// TIBETAN VOWEL SIGN II
|
||||
Decomposed::Expansion('\u{0F71}', '\u{0F72}')
|
||||
}
|
||||
'\u{0F75}' => {
|
||||
// TIBETAN VOWEL SIGN UU
|
||||
Decomposed::Expansion('\u{0F71}', '\u{0F74}')
|
||||
}
|
||||
'\u{0F81}' => {
|
||||
// TIBETAN VOWEL SIGN REVERSED II
|
||||
Decomposed::Expansion('\u{0F71}', '\u{0F80}')
|
||||
}
|
||||
_ => Decomposed::Default,
|
||||
};
|
||||
}
|
||||
return Decomposed::Singleton(char_from_u16(lead));
|
||||
}
|
||||
// The recursive decomposition of ANGSTROM SIGN is in the complex
|
||||
// decomposition structure to avoid a branch in `potential_passthrough`
|
||||
// for the BMP case.
|
||||
if c == '\u{212B}' {
|
||||
// ANGSTROM SIGN
|
||||
return Decomposed::Singleton('\u{00C5}');
|
||||
}
|
||||
// Complex decomposition
|
||||
// Format for 16-bit value:
|
||||
// 15..13: length minus two for 16-bit case and length minus one for
|
||||
// the 32-bit case. Length 8 needs to fit in three bits in
|
||||
// the 16-bit case, and this way the value is future-proofed
|
||||
// up to 9 in the 16-bit case. Zero is unused and length one
|
||||
// in the 16-bit case goes directly into the trie.
|
||||
// 12: 1 if all trailing characters are guaranteed non-starters,
|
||||
// 0 if no guarantees about non-starterness.
|
||||
// Note: The bit choice is this way around to allow for
|
||||
// dynamically falling back to not having this but instead
|
||||
// having one more bit for length by merely choosing
|
||||
// different masks.
|
||||
// 11..0: Start offset in storage. The offset is to the logical
|
||||
// sequence of scalars16, scalars32, supplementary_scalars16,
|
||||
// supplementary_scalars32.
|
||||
let offset = usize::from(trail_or_complex & 0xFFF);
|
||||
let tables = self.tables.get();
|
||||
if offset < tables.scalars16.len() {
|
||||
if usize::from(trail_or_complex >> 13) != 0 {
|
||||
// i.e. logical len isn't 2
|
||||
break;
|
||||
}
|
||||
if let Some(first) = tables.scalars16.get(offset) {
|
||||
if let Some(second) = tables.scalars16.get(offset + 1) {
|
||||
// Two BMP starters
|
||||
return Decomposed::Expansion(char_from_u16(first), char_from_u16(second));
|
||||
}
|
||||
}
|
||||
// GIGO case
|
||||
debug_assert!(false);
|
||||
return Decomposed::Default;
|
||||
}
|
||||
let len = usize::from(trail_or_complex >> 13) + 1;
|
||||
if len > 2 {
|
||||
break;
|
||||
}
|
||||
let offset24 = offset - tables.scalars16.len();
|
||||
if let Some(first_c) = tables.scalars24.get(offset24) {
|
||||
if len == 1 {
|
||||
if c != first_c {
|
||||
return Decomposed::Singleton(first_c);
|
||||
} else {
|
||||
// Singleton representation used to avoid
|
||||
// NFC passthrough of characters that combine
|
||||
// with starters that can occur as the first
|
||||
// character of an expansion decomposition.
|
||||
// See section 5 of
|
||||
// https://www.unicode.org/L2/L2024/24009-utc178-properties-recs.pdf
|
||||
return Decomposed::Default;
|
||||
}
|
||||
}
|
||||
if let Some(second_c) = tables.scalars24.get(offset24 + 1) {
|
||||
return Decomposed::Expansion(first_c, second_c);
|
||||
}
|
||||
}
|
||||
// GIGO case
|
||||
debug_assert!(false);
|
||||
return Decomposed::Default;
|
||||
}
|
||||
let non_recursive = self.non_recursive.get();
|
||||
let non_recursive_decomposition = non_recursive.trie.get(c);
|
||||
if non_recursive_decomposition == 0 {
|
||||
// GIGO case
|
||||
debug_assert!(false);
|
||||
return Decomposed::Default;
|
||||
}
|
||||
let trail_or_complex = (non_recursive_decomposition >> 16) as u16;
|
||||
let lead = non_recursive_decomposition as u16;
|
||||
if lead != 0 && trail_or_complex != 0 {
|
||||
// Decomposition into two BMP characters
|
||||
return Decomposed::Expansion(char_from_u16(lead), char_from_u16(trail_or_complex));
|
||||
}
|
||||
if lead != 0 {
|
||||
// Decomposition into one BMP character
|
||||
return Decomposed::Singleton(char_from_u16(lead));
|
||||
}
|
||||
// Decomposition into two non-BMP characters
|
||||
// Low is offset into a table plus one to keep it non-zero.
|
||||
let offset = usize::from(trail_or_complex - 1);
|
||||
if let Some(first) = non_recursive.scalars24.get(offset) {
|
||||
if let Some(second) = non_recursive.scalars24.get(offset + 1) {
|
||||
return Decomposed::Expansion(first, second);
|
||||
}
|
||||
}
|
||||
// GIGO case
|
||||
debug_assert!(false);
|
||||
Decomposed::Default
|
||||
}
|
||||
|
||||
/// Construct from compiled data.
|
||||
///
|
||||
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
|
||||
///
|
||||
/// [📚 Help choosing a constructor](icu_provider::constructors)
|
||||
#[cfg(feature = "compiled_data")]
|
||||
pub const fn new() -> Self {
|
||||
const _: () = assert!(
|
||||
crate::provider::Baked::SINGLETON_NORMALIZER_NFDEX_V1
|
||||
.scalars16
|
||||
.const_len()
|
||||
+ crate::provider::Baked::SINGLETON_NORMALIZER_NFDEX_V1
|
||||
.scalars24
|
||||
.const_len()
|
||||
<= 0xFFF,
|
||||
"NormalizerError::FutureExtension"
|
||||
);
|
||||
|
||||
Self {
|
||||
decompositions: DataPayload::from_static_ref(
|
||||
crate::provider::Baked::SINGLETON_NORMALIZER_NFD_V1,
|
||||
),
|
||||
tables: DataPayload::from_static_ref(
|
||||
crate::provider::Baked::SINGLETON_NORMALIZER_NFDEX_V1,
|
||||
),
|
||||
non_recursive: DataPayload::from_static_ref(
|
||||
crate::provider::Baked::SINGLETON_NORMALIZER_DECOMP_V1,
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
icu_provider::gen_any_buffer_data_constructors!(locale: skip, options: skip, error: NormalizerError,
|
||||
#[cfg(skip)]
|
||||
functions: [
|
||||
new,
|
||||
try_new_with_any_provider,
|
||||
try_new_with_buffer_provider,
|
||||
try_new_unstable,
|
||||
Self,
|
||||
]
|
||||
);
|
||||
|
||||
#[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new)]
|
||||
pub fn try_new_unstable<D>(provider: &D) -> Result<Self, NormalizerError>
|
||||
where
|
||||
D: DataProvider<CanonicalDecompositionDataV1Marker>
|
||||
+ DataProvider<CanonicalDecompositionTablesV1Marker>
|
||||
+ DataProvider<NonRecursiveDecompositionSupplementV1Marker>
|
||||
+ ?Sized,
|
||||
{
|
||||
let decompositions: DataPayload<CanonicalDecompositionDataV1Marker> =
|
||||
provider.load(Default::default())?.take_payload()?;
|
||||
let tables: DataPayload<CanonicalDecompositionTablesV1Marker> =
|
||||
provider.load(Default::default())?.take_payload()?;
|
||||
|
||||
if tables.get().scalars16.len() + tables.get().scalars24.len() > 0xFFF {
|
||||
            // The data is from a future where there exists a normalization flavor whose
            // complex decompositions take more than 0xFFF but fewer than 0x1FFF code points
            // of space. If a good use case from such a decomposition flavor arises, we can
            // dynamically change the bit masks so that the length mask becomes 0x1FFF instead
            // of 0xFFF and the all-non-starters mask becomes 0 instead of 0x1000. However,
            // since for now the masks are hard-coded, error out.
            return Err(NormalizerError::FutureExtension);
        }

        let non_recursive: DataPayload<NonRecursiveDecompositionSupplementV1Marker> =
            provider.load(Default::default())?.take_payload()?;

        Ok(CanonicalDecomposition {
            decompositions,
            tables,
            non_recursive,
        })
    }
}

/// Lookup of the Canonical_Combining_Class Unicode property.
///
/// # Example
///
/// ```
/// use icu::properties::CanonicalCombiningClass;
/// use icu::normalizer::properties::CanonicalCombiningClassMap;
///
/// let map = CanonicalCombiningClassMap::new();
/// assert_eq!(map.get('a'), CanonicalCombiningClass::NotReordered); // U+0061: LATIN SMALL LETTER A
/// assert_eq!(map.get32(0x0301), CanonicalCombiningClass::Above); // U+0301: COMBINING ACUTE ACCENT
/// ```
#[derive(Debug)]
pub struct CanonicalCombiningClassMap {
    /// The data trie
    decompositions: DataPayload<CanonicalDecompositionDataV1Marker>,
}

#[cfg(feature = "compiled_data")]
impl Default for CanonicalCombiningClassMap {
    fn default() -> Self {
        Self::new()
    }
}

impl CanonicalCombiningClassMap {
    /// Look up the canonical combining class for a scalar value
    #[inline(always)]
    pub fn get(&self, c: char) -> CanonicalCombiningClass {
        self.get32(u32::from(c))
    }

    /// Look up the canonical combining class for a scalar value
    /// represented as `u32`. If the argument is outside the scalar
    /// value range, `CanonicalCombiningClass::NotReordered` is returned.
    pub fn get32(&self, c: u32) -> CanonicalCombiningClass {
        let trie_value = self.decompositions.get().trie.get32(c);
        if trie_value_has_ccc(trie_value) {
            CanonicalCombiningClass(trie_value as u8)
        } else if trie_value_indicates_special_non_starter_decomposition(trie_value) {
            match c {
                0x0340 | 0x0341 | 0x0343 | 0x0344 => CanonicalCombiningClass::Above,
                _ => CanonicalCombiningClass::NotReordered,
            }
        } else {
            CanonicalCombiningClass::NotReordered
        }
    }

    /// Construct from compiled data.
    ///
    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
    ///
    /// [📚 Help choosing a constructor](icu_provider::constructors)
    #[cfg(feature = "compiled_data")]
    pub const fn new() -> Self {
        CanonicalCombiningClassMap {
            decompositions: DataPayload::from_static_ref(
                crate::provider::Baked::SINGLETON_NORMALIZER_NFD_V1,
            ),
        }
    }

    icu_provider::gen_any_buffer_data_constructors!(locale: skip, options: skip, error: NormalizerError,
        #[cfg(skip)]
        functions: [
            new,
            try_new_with_any_provider,
            try_new_with_buffer_provider,
            try_new_unstable,
            Self,
    ]);

    #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new)]
    pub fn try_new_unstable<D>(provider: &D) -> Result<Self, NormalizerError>
    where
        D: DataProvider<CanonicalDecompositionDataV1Marker> + ?Sized,
    {
        let decompositions: DataPayload<CanonicalDecompositionDataV1Marker> =
            provider.load(Default::default())?.take_payload()?;
        Ok(CanonicalCombiningClassMap { decompositions })
    }
}
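A minimal usage sketch (not part of the vendored sources), assuming the `compiled_data` feature and the `icu` meta-crate paths from the doc example above; it assumes the inner `u8` of `CanonicalCombiningClass` is public and uses it as a sort key:

use icu::normalizer::properties::CanonicalCombiningClassMap;

fn sort_marks_canonically() {
    let map = CanonicalCombiningClassMap::new();
    // U+0316 (ccc=220, Below) must precede U+0301 (ccc=230, Above) in canonical order.
    let mut marks = ['\u{0301}', '\u{0316}'];
    marks.sort_by_key(|&c| map.get(c).0);
    assert_eq!(marks, ['\u{0316}', '\u{0301}']);
}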
@ -0,0 +1,208 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

//! 🚧 \[Unstable\] Data provider struct definitions for this ICU4X component.
//!
//! <div class="stab unstable">
//! 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
//! including in SemVer minor releases. While the serde representation of data structs is guaranteed
//! to be stable, their Rust representation might not be. Use with caution.
//! </div>
//!
//! Read more about data providers: [`icu_provider`]

// Provider structs must be stable
#![allow(clippy::exhaustive_structs, clippy::exhaustive_enums)]

use icu_collections::char16trie::Char16Trie;
use icu_collections::codepointtrie::CodePointTrie;
use icu_provider::prelude::*;
use zerovec::ZeroVec;

#[cfg(feature = "compiled_data")]
#[derive(Debug)]
/// Baked data
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. In particular, the `DataProvider` implementations are only
/// guaranteed to match with this version's `*_unstable` providers. Use with caution.
/// </div>
pub struct Baked;

#[cfg(feature = "compiled_data")]
const _: () = {
    pub mod icu {
        pub use crate as normalizer;
        pub use icu_collections as collections;
    }
    icu_normalizer_data::make_provider!(Baked);
    icu_normalizer_data::impl_normalizer_comp_v1!(Baked);
    icu_normalizer_data::impl_normalizer_decomp_v1!(Baked);
    icu_normalizer_data::impl_normalizer_nfd_v1!(Baked);
    icu_normalizer_data::impl_normalizer_nfdex_v1!(Baked);
    icu_normalizer_data::impl_normalizer_nfkd_v1!(Baked);
    icu_normalizer_data::impl_normalizer_nfkdex_v1!(Baked);
    icu_normalizer_data::impl_normalizer_uts46d_v1!(Baked);
};

#[cfg(feature = "datagen")]
/// The latest minimum set of keys required by this component.
pub const KEYS: &[DataKey] = &[
    CanonicalCompositionsV1Marker::KEY,
    CanonicalDecompositionDataV1Marker::KEY,
    CanonicalDecompositionTablesV1Marker::KEY,
    CompatibilityDecompositionSupplementV1Marker::KEY,
    CompatibilityDecompositionTablesV1Marker::KEY,
    NonRecursiveDecompositionSupplementV1Marker::KEY,
    Uts46DecompositionSupplementV1Marker::KEY,
];

/// Main data for NFD
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[icu_provider::data_struct(marker(
    CanonicalDecompositionDataV1Marker,
    "normalizer/nfd@1",
    singleton
))]
#[derive(Debug, PartialEq, Clone)]
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake), databake(path = icu_normalizer::provider))]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
pub struct DecompositionDataV1<'data> {
    /// Trie for NFD decomposition.
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub trie: CodePointTrie<'data, u32>,
}

/// Data that either NFKD or the decomposed form of UTS 46 needs
/// _in addition to_ the NFD data.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[icu_provider::data_struct(
    marker(
        CompatibilityDecompositionSupplementV1Marker,
        "normalizer/nfkd@1",
        singleton
    ),
    marker(Uts46DecompositionSupplementV1Marker, "normalizer/uts46d@1", singleton)
)]
#[derive(Debug, PartialEq, Clone)]
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake), databake(path = icu_normalizer::provider))]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
pub struct DecompositionSupplementV1<'data> {
    /// Trie for the decompositions that differ from NFD.
    /// Getting a zero from this trie means that you need
    /// to make another lookup from `DecompositionDataV1::trie`.
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub trie: CodePointTrie<'data, u32>,
    /// Flags that indicate how the set of characters whose
    /// decompositions start with a non-starter differs from
    /// the set for NFD.
    ///
    /// Bit 0: Whether half-width kana voicing marks decompose
    ///        into non-starters (their full-width combining
    ///        counterparts).
    /// Bit 1: Whether U+0345 COMBINING GREEK YPOGEGRAMMENI
    ///        decomposes into a starter (U+03B9 GREEK SMALL
    ///        LETTER IOTA).
    /// (Other bits unused.)
    pub flags: u8,
    /// The passthrough bounds of NFD/NFC are lowered to this
    /// maximum instead. (16-bit, because cannot be higher
    /// than 0x0300, which is the bound for NFC.)
    pub passthrough_cap: u16,
}

impl DecompositionSupplementV1<'_> {
    const HALF_WIDTH_VOICING_MARK_MASK: u8 = 1;

    /// Whether half-width kana voicing marks decompose into non-starters
    /// (their full-width combining counterparts).
    pub fn half_width_voicing_marks_become_non_starters(&self) -> bool {
        (self.flags & DecompositionSupplementV1::HALF_WIDTH_VOICING_MARK_MASK) != 0
    }
}
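Only the bit-0 accessor appears in this file; an accessor for bit 1 would follow the same pattern. A hypothetical sketch (the mask constant and method below are assumptions, not part of the vendored source):

impl DecompositionSupplementV1<'_> {
    /// Hypothetical mask for bit 1; not in the vendored code.
    const IOTA_SUBSCRIPT_STARTER_MASK: u8 = 1 << 1;

    /// Whether U+0345 COMBINING GREEK YPOGEGRAMMENI decomposes into
    /// a starter (U+03B9 GREEK SMALL LETTER IOTA).
    pub fn iota_subscript_becomes_starter(&self) -> bool {
        (self.flags & Self::IOTA_SUBSCRIPT_STARTER_MASK) != 0
    }
}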
/// The expansion tables for cases where the decomposition isn't
/// contained in the trie value
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[icu_provider::data_struct(
    marker(CanonicalDecompositionTablesV1Marker, "normalizer/nfdex@1", singleton),
    marker(
        CompatibilityDecompositionTablesV1Marker,
        "normalizer/nfkdex@1",
        singleton
    )
)]
#[derive(Debug, PartialEq, Clone)]
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake), databake(path = icu_normalizer::provider))]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
pub struct DecompositionTablesV1<'data> {
    /// Decompositions that are fully within the BMP
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub scalars16: ZeroVec<'data, u16>,
    /// Decompositions with at least one character outside
    /// the BMP
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub scalars24: ZeroVec<'data, char>,
}

/// Non-Hangul canonical compositions
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[icu_provider::data_struct(marker(CanonicalCompositionsV1Marker, "normalizer/comp@1", singleton))]
#[derive(Debug, PartialEq, Clone)]
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake), databake(path = icu_normalizer::provider))]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
pub struct CanonicalCompositionsV1<'data> {
    /// Trie keys are two-`char` strings with the second
    /// character coming first. The value, if any, is the
    /// (non-Hangul) canonical composition.
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub canonical_compositions: Char16Trie<'data>,
}

/// Non-recursive canonical decompositions that differ from
/// `DecompositionDataV1`.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[icu_provider::data_struct(marker(
    NonRecursiveDecompositionSupplementV1Marker,
    "normalizer/decomp@1",
    singleton
))]
#[derive(Debug, PartialEq, Clone)]
#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake), databake(path = icu_normalizer::provider))]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
pub struct NonRecursiveDecompositionSupplementV1<'data> {
    /// Trie for the supplementary non-recursive decompositions
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub trie: CodePointTrie<'data, u32>,
    /// Decompositions with at least one character outside
    /// the BMP
    #[cfg_attr(feature = "serde", serde(borrow))]
    pub scalars24: ZeroVec<'data, char>,
}
@ -0,0 +1,136 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

//! Bundles the part of UTS 46 that makes sense to implement as a
//! normalization.
//!
//! This is meant to be used as a building block of a UTS 46
//! implementation, such as the `idna` crate.

use crate::CanonicalCompositionsV1Marker;
use crate::CanonicalDecompositionDataV1Marker;
use crate::CanonicalDecompositionTablesV1Marker;
use crate::CompatibilityDecompositionTablesV1Marker;
use crate::ComposingNormalizer;
use crate::NormalizerError;
use crate::Uts46DecompositionSupplementV1Marker;
use icu_provider::DataProvider;

// Implementation note: Despite merely wrapping a `ComposingNormalizer`,
// having a `Uts46Mapper` serves two purposes:
//
// 1. Denying public access to parts of the `ComposingNormalizer` API
//    that don't work when the data contains markers for ignorables.
// 2. Providing a place where additional iterator pre-processing or
//    post-processing can take place if needed in the future. (When
//    writing this, it looked like such processing was needed but
//    now isn't needed after all.)

/// A mapper that knows how to perform the subsets of UTS 46 processing
/// documented on the methods.
#[derive(Debug)]
pub struct Uts46Mapper {
    normalizer: ComposingNormalizer,
}

#[cfg(feature = "compiled_data")]
impl Default for Uts46Mapper {
    fn default() -> Self {
        Self::new()
    }
}

impl Uts46Mapper {
    /// Construct with compiled data.
    #[cfg(feature = "compiled_data")]
    pub const fn new() -> Self {
        Uts46Mapper {
            normalizer: ComposingNormalizer::new_uts46(),
        }
    }

    /// Construct with provider.
    #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new)]
    pub fn try_new<D>(provider: &D) -> Result<Self, NormalizerError>
    where
        D: DataProvider<CanonicalDecompositionDataV1Marker>
            + DataProvider<Uts46DecompositionSupplementV1Marker>
            + DataProvider<CanonicalDecompositionTablesV1Marker>
            + DataProvider<CompatibilityDecompositionTablesV1Marker>
            // UTS 46 tables merged into CompatibilityDecompositionTablesV1Marker
            + DataProvider<CanonicalCompositionsV1Marker>
            + ?Sized,
    {
        let normalizer = ComposingNormalizer::try_new_uts46_unstable(provider)?;

        Ok(Uts46Mapper { normalizer })
    }

    /// Returns an iterator adaptor that turns an `Iterator` over `char`
    /// into an iterator yielding a `char` sequence that gets the following
    /// operations from the "Map" and "Normalize" steps of the "Processing"
    /// section of UTS 46 lazily applied to it:
    ///
    /// 1. The _ignored_ characters are ignored.
    /// 2. The _mapped_ characters are mapped.
    /// 3. The _disallowed_ characters are replaced with U+FFFD,
    ///    which itself is a disallowed character.
    /// 4. The _deviation_ characters are treated as _mapped_ or _valid_
    ///    as appropriate.
    /// 5. The _disallowed_STD3_valid_ characters are treated as allowed.
    /// 6. The _disallowed_STD3_mapped_ characters are treated as
    ///    _mapped_.
    /// 7. The result is normalized to NFC.
    ///
    /// Notably:
    ///
    /// * The STD3 or WHATWG ASCII deny list should be implemented as a
    ///   post-processing step.
    /// * Transitional processing is not performed. Transitional mapping
    ///   would be a pre-processing step, but transitional processing is
    ///   deprecated, and none of Firefox, Safari, or Chrome use it.
    pub fn map_normalize<'delegate, I: Iterator<Item = char> + 'delegate>(
        &'delegate self,
        iter: I,
    ) -> impl Iterator<Item = char> + 'delegate {
        self.normalizer
            .normalize_iter_private(iter, crate::IgnorableBehavior::Ignored)
    }

    /// Returns an iterator adaptor that turns an `Iterator` over `char`
    /// into an iterator yielding a `char` sequence that gets the following
    /// operations from the NFC check and status steps of the "Validity
    /// Criteria" section of UTS 46 lazily applied to it:
    ///
    /// 1. The _ignored_ characters are treated as _disallowed_.
    /// 2. The _mapped_ characters are mapped.
    /// 3. The _disallowed_ characters are replaced with U+FFFD,
    ///    which itself is a disallowed character.
    /// 4. The _deviation_ characters are treated as _mapped_ or _valid_
    ///    as appropriate.
    /// 5. The _disallowed_STD3_valid_ characters are treated as allowed.
    /// 6. The _disallowed_STD3_mapped_ characters are treated as
    ///    _mapped_.
    /// 7. The result is normalized to NFC.
    ///
    /// Notably:
    ///
    /// * The STD3 or WHATWG ASCII deny list should be implemented as a
    ///   post-processing step.
    /// * Transitional processing is not performed. Transitional mapping
    ///   would be a pre-processing step, but transitional processing is
    ///   deprecated, and none of Firefox, Safari, or Chrome use it.
    /// * The output needs to be compared with the input to see if anything
    ///   changed. This check catches failures to adhere to the normalization
    ///   and status requirements. In particular, this comparison results
    ///   in _mapped_ characters resulting in an error, as "Validity Criteria"
    ///   requires.
    pub fn normalize_validate<'delegate, I: Iterator<Item = char> + 'delegate>(
        &'delegate self,
        iter: I,
    ) -> impl Iterator<Item = char> + 'delegate {
        self.normalizer
            .normalize_iter_private(iter, crate::IgnorableBehavior::ReplacementCharacter)
    }
}
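A minimal usage sketch (not from the vendored sources), assuming the `compiled_data` feature and the module path `icu_normalizer::uts46`; U+00AD SOFT HYPHEN is an _ignored_ character and upper case is _mapped_ to lower case:

use icu_normalizer::uts46::Uts46Mapper;

fn map_normalize_demo() {
    let mapper = Uts46Mapper::new();
    let mapped: String = mapper.map_normalize("Ex\u{00AD}AMPLE".chars()).collect();
    assert_eq!(mapped, "example");
}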
@ -0,0 +1,4 @@
# This is a placeholder in the interest of keeping the repository size smaller.
# Replace this file with the contents of
# https://www.unicode.org/Public/UCD/latest/ucd/NormalizationTest.txt to actually
# run the conformance test.

@ -0,0 +1,2 @@
The test data comes from
https://www.unicode.org/Public/UCD/latest/ucd/NormalizationTest.txt

(Diff between files not shown because of its large size.)

@ -0,0 +1 @@
{"files":{"Cargo.toml":"8a831cf2a49499d9005ed9b4fa48cda2311e43b8a56272f07f48be45f93bfc79","LICENSE":"f367c1b8e1aa262435251e442901da4607b4650e0e63a026f5044473ecfb90f2","README.md":"52aa166967a2e729c2bbe88d827ed5f27e0908c7bf99806f534685c042961577","data/macros.rs":"01406adb7f8a71771640320ee0dffda2e8f721426fd0244b5e428c7e19c2dda2","data/macros/normalizer_comp_v1.rs.data":"4fea06eeaa69c3d3c18b8a854c7af369c0eadfb97cb79e32f8ccd62bbef81234","data/macros/normalizer_decomp_v1.rs.data":"cbe2a0e5ddacb10d1718f7f83ca5cd261b9618cf31b27cd46bfc61363bfc1a90","data/macros/normalizer_nfd_v1.rs.data":"1692d8a94a94afcb25dc4cadd2f413f6b20f8735128d8f2a4c4d7ade6c6e9c86","data/macros/normalizer_nfdex_v1.rs.data":"80eebad6112ac9a3af7120c6a6e7d9c8acf765e4b6ec482a33520ea05e5e27c4","data/macros/normalizer_nfkd_v1.rs.data":"6918be7b4c8f39c24b69f7958175abe8cc846a99cf1067fe09293dc919d5e963","data/macros/normalizer_nfkdex_v1.rs.data":"919d8973135e4a258094b3de711479e6d066de8f4579182b3ecb69a6cdb66e6e","data/macros/normalizer_uts46d_v1.rs.data":"081e089334626c603e2071060326d74328d6a22b0a71e5ead004f50c8956bd94","src/lib.rs":"6dadcea5dc4643966028d0470bd90f7ad5197709599571bd1750df8aa6d37e51"},"package":"f8cafbf7aa791e9b22bec55a167906f9e1215fd475cd22adfcf660e03e989516"}
@ -0,0 +1,42 @@
# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
#
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
# to registry (e.g., crates.io) dependencies.
#
# If you are reading this file be aware that the original Cargo.toml
# will likely look very different (and much more reasonable).
# See Cargo.toml.orig for the original contents.

[package]
edition = "2021"
rust-version = "1.67"
name = "icu_normalizer_data"
version = "1.5.0"
authors = ["The ICU4X Project Developers"]
include = [
    "data/**/*",
    "src/**/*",
    "examples/**/*",
    "benches/**/*",
    "tests/**/*",
    "Cargo.toml",
    "LICENSE",
    "README.md",
]
description = "Data for the icu_normalizer crate"
homepage = "https://icu4x.unicode.org"
readme = "README.md"
categories = ["internationalization"]
license = "Unicode-3.0"
repository = "https://github.com/unicode-org/icu4x"

[package.metadata.sources.cldr]
tagged = "45.0.0"

[package.metadata.sources.icuexport]
tagged = "icu4x/2024-05-16/75.x"

[package.metadata.sources.segmenter_lstm]
tagged = "v0.1.0"
@ -0,0 +1,46 @@
UNICODE LICENSE V3

COPYRIGHT AND PERMISSION NOTICE

Copyright © 2020-2024 Unicode, Inc.

NOTICE TO USER: Carefully read the following legal agreement. BY
DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING DATA FILES, AND/OR
SOFTWARE, YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
TERMS AND CONDITIONS OF THIS AGREEMENT. IF YOU DO NOT AGREE, DO NOT
DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE THE DATA FILES OR SOFTWARE.

Permission is hereby granted, free of charge, to any person obtaining a
copy of data files and any associated documentation (the "Data Files") or
software and any associated documentation (the "Software") to deal in the
Data Files or Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, and/or sell
copies of the Data Files or Software, and to permit persons to whom the
Data Files or Software are furnished to do so, provided that either (a)
this copyright and permission notice appear with all copies of the Data
Files or Software, or (b) this copyright and permission notice appear in
associated Documentation.

THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
THIRD PARTY RIGHTS.

IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE
BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES,
OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE DATA
FILES OR SOFTWARE.

Except as contained in this notice, the name of a copyright holder shall
not be used in advertising or otherwise to promote the sale, use or other
dealings in these Data Files or Software without prior written
authorization of the copyright holder.

SPDX-License-Identifier: Unicode-3.0

—

Portions of ICU4X may have been adapted from ICU4C and/or ICU4J.
ICU 1.8.1 to ICU 57.1 © 1995-2016 International Business Machines Corporation and others.
@ -0,0 +1,14 @@
# icu_normalizer_data [![crates.io](https://img.shields.io/crates/v/icu_normalizer_data)](https://crates.io/crates/icu_normalizer_data)

<!-- cargo-rdme start -->

Data for the `icu_normalizer` crate

This data was generated with CLDR version 45.0.0, ICU version icu4x/2024-05-16/75.x, and
LSTM segmenter version v0.1.0.

<!-- cargo-rdme end -->

## More Information

For more information on development, authorship, contributing etc. please visit [`ICU4X home page`](https://github.com/unicode-org/icu4x).
@ -0,0 +1,76 @@
// @generated
/// Marks a type as a data provider. You can then use macros like
/// `impl_core_helloworld_v1` to add implementations.
///
/// ```ignore
/// struct MyProvider;
/// const _: () = {
///     include!("path/to/generated/macros.rs");
///     make_provider!(MyProvider);
///     impl_core_helloworld_v1!(MyProvider);
/// }
/// ```
#[doc(hidden)]
#[macro_export]
macro_rules! __make_provider {
    ($ name : ty) => {
        #[clippy::msrv = "1.67"]
        impl $name {
            #[doc(hidden)]
            #[allow(dead_code)]
            pub const MUST_USE_MAKE_PROVIDER_MACRO: () = ();
        }
        icu_provider::impl_data_provider_never_marker!($name);
    };
}
#[doc(inline)]
pub use __make_provider as make_provider;
#[macro_use]
#[path = "macros/normalizer_comp_v1.rs.data"]
mod normalizer_comp_v1;
#[doc(inline)]
pub use __impl_normalizer_comp_v1 as impl_normalizer_comp_v1;
#[doc(inline)]
pub use __impliterable_normalizer_comp_v1 as impliterable_normalizer_comp_v1;
#[macro_use]
#[path = "macros/normalizer_decomp_v1.rs.data"]
mod normalizer_decomp_v1;
#[doc(inline)]
pub use __impl_normalizer_decomp_v1 as impl_normalizer_decomp_v1;
#[doc(inline)]
pub use __impliterable_normalizer_decomp_v1 as impliterable_normalizer_decomp_v1;
#[macro_use]
#[path = "macros/normalizer_nfd_v1.rs.data"]
mod normalizer_nfd_v1;
#[doc(inline)]
pub use __impl_normalizer_nfd_v1 as impl_normalizer_nfd_v1;
#[doc(inline)]
pub use __impliterable_normalizer_nfd_v1 as impliterable_normalizer_nfd_v1;
#[macro_use]
#[path = "macros/normalizer_nfdex_v1.rs.data"]
mod normalizer_nfdex_v1;
#[doc(inline)]
pub use __impl_normalizer_nfdex_v1 as impl_normalizer_nfdex_v1;
#[doc(inline)]
pub use __impliterable_normalizer_nfdex_v1 as impliterable_normalizer_nfdex_v1;
#[macro_use]
#[path = "macros/normalizer_nfkd_v1.rs.data"]
mod normalizer_nfkd_v1;
#[doc(inline)]
pub use __impl_normalizer_nfkd_v1 as impl_normalizer_nfkd_v1;
#[doc(inline)]
pub use __impliterable_normalizer_nfkd_v1 as impliterable_normalizer_nfkd_v1;
#[macro_use]
#[path = "macros/normalizer_nfkdex_v1.rs.data"]
mod normalizer_nfkdex_v1;
#[doc(inline)]
pub use __impl_normalizer_nfkdex_v1 as impl_normalizer_nfkdex_v1;
#[doc(inline)]
pub use __impliterable_normalizer_nfkdex_v1 as impliterable_normalizer_nfkdex_v1;
#[macro_use]
#[path = "macros/normalizer_uts46d_v1.rs.data"]
mod normalizer_uts46d_v1;
#[doc(inline)]
pub use __impl_normalizer_uts46d_v1 as impl_normalizer_uts46d_v1;
#[doc(inline)]
pub use __impliterable_normalizer_uts46d_v1 as impliterable_normalizer_uts46d_v1;
42  third_party/rust/icu_normalizer_data/data/macros/normalizer_comp_v1.rs.data (vendored, new file; diff hidden because one or more lines are too long)
42  third_party/rust/icu_normalizer_data/data/macros/normalizer_decomp_v1.rs.data (vendored, new file; diff hidden because one or more lines are too long)
42  third_party/rust/icu_normalizer_data/data/macros/normalizer_nfd_v1.rs.data (vendored, new file; diff hidden because one or more lines are too long)
42  third_party/rust/icu_normalizer_data/data/macros/normalizer_nfdex_v1.rs.data (vendored, new file; diff hidden because one or more lines are too long)
42  third_party/rust/icu_normalizer_data/data/macros/normalizer_nfkd_v1.rs.data (vendored, new file; diff hidden because one or more lines are too long)
42  third_party/rust/icu_normalizer_data/data/macros/normalizer_nfkdex_v1.rs.data (vendored, new file; diff hidden because one or more lines are too long)
42  third_party/rust/icu_normalizer_data/data/macros/normalizer_uts46d_v1.rs.data (vendored, new file; diff hidden because one or more lines are too long)
@ -0,0 +1,17 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

//! Data for the `icu_normalizer` crate
//!
//! This data was generated with CLDR version 45.0.0, ICU version icu4x/2024-05-16/75.x, and
//! LSTM segmenter version v0.1.0.

#![no_std]
// The source is not readable and is massive as HTML.
#![doc(html_no_source)]

#[cfg(icu4x_custom_data)]
include!(concat!(core::env!("ICU4X_DATA_DIR"), "/macros.rs"));
#[cfg(not(icu4x_custom_data))]
include!("../data/macros.rs");

@ -1 +1 @@
{"files":{"Cargo.toml":"8be30a9748419aed461ce333e260ff4a461bf8166dfc7768307f32fcfc4fbea1","LICENSE-APACHE":"a60eea817514531668d7e00765731449fe14d059d3249e0bc93b36de45f759f2","LICENSE-MIT":"76e972ac0f4ddb116e86e10100132a783931a596e7b9872eaa31be15cd4d751d","benches/all.rs":"e734b9c9092ed66986725f86cfe90f3756cfddb058af308b796ba494f9beefc2","src/IdnaMappingTable.txt":"87d6553a4b86bc49dcade38bf26b745cd81800eb8af295dc3fb99b4729eaea38","src/lib.rs":"e7fd80070a7e52dfd1e9fe785bf092eddc9fb421fd0f9a1ba1c2189b8d40d3ed","src/make_uts46_mapping_table.py":"917055fa841f813de2bcf79cc79b595da3d5551559ee768db8660ab77cb26c34","src/punycode.rs":"3697674a70647d200853ac9d1910ffcb4796534332fe328de16c4bb1283e2ec1","src/uts46.rs":"4eee036b6448489002ac5190f3ac28834a4caa063c7cc77474ea6256199619ae","src/uts46_mapping_table.rs":"942fff78147c61da942f5f3a7ff4e90f9d7a00a29285733ac3fc3357eb2ed06f","tests/IdnaTestV2.txt":"c6f3778b0545fd150c8063286c7f5adc901e16557eddccc3751213646d07593d","tests/bad_punycode_tests.json":"ff0a15479ed2cb08f7b4b39465160da66d1ac7575e5d76990c17e7b76cb5e0f5","tests/punycode.rs":"0b0f315a8b124c1275a423a69169b13b19bcd7e9e6a5158bd0d642d01c6db145","tests/punycode_tests.json":"3d4ac0cf25984c37b9ce197f5df680a0136f728fb8ec82bc76624e42139eb3a8","tests/tests.rs":"d205a2bfb29dfee73e014faebd3207a55ef0d40121e6dbd52f5d611b37ac111e","tests/unit.rs":"be025a7d9bab3bd1ce134c87f9d848269e157b31ca5ba0ea03426c1ac736b69e","tests/uts46.rs":"06c97bf7dc20f5372b542fa46922d6dd63fe15e0aa34d799d08df9e3a241aa21"},"package":"634d9b1461af396cad843f47fdba5597a4f9e6ddd4bfb6ff5d85028c25cb12f6"}
{"files":{"Cargo.toml":"d453ab4fa012a1f5d9233aa29fa03a7d5bcff06008f2197ce0ddac7e7aa28b2b","LICENSE-APACHE":"a60eea817514531668d7e00765731449fe14d059d3249e0bc93b36de45f759f2","LICENSE-MIT":"76e972ac0f4ddb116e86e10100132a783931a596e7b9872eaa31be15cd4d751d","README.md":"dd73e159f3b31a7070f4564f9e68dca14495452e3b30d6fe4ca1d84656b69ee6","benches/all.rs":"53002f41ac38bdd5b1bb0a7ec8d5a9b49ce6cd3d073ce16c1014f9d4e90b762b","src/deprecated.rs":"bdba5a73432d9755c831ec01edf4d512f9390b351dba0eb8ce7b0430fa1073ad","src/lib.rs":"4d30605daf5c18d282d460ee561c7e5218aea76cf33fc072fd79f9617256f04e","src/punycode.rs":"2d9dda9bb6504863ea6f374e9ab4192ccc475a789a43a0fb624b15459a611fbc","src/uts46.rs":"2e719c93954930de20789896b153af7dd84c20e14edba6317f9dd80e3baaccc9","tests/IdnaTestV2.txt":"d668c4ea58d60fe04e6c011df98e0b317da6abaa1273d58f42b581eb0dd7adda","tests/bad_punycode_tests.json":"ff0a15479ed2cb08f7b4b39465160da66d1ac7575e5d76990c17e7b76cb5e0f5","tests/deprecated.rs":"cce256f6616a19314330a06003d6308138aae8257136431d143f062f14ab17c7","tests/punycode.rs":"75fa73b6429ccacaeb5d72fab0b927cdf9f2173a9fc5fb366697bf7002b73921","tests/punycode_tests.json":"50859b828d14d5eeba5ab930de25fb72a35310a0b46f421f65d64c7c3e54d08a","tests/tests.rs":"ecee59f0b0be27ba1e7b24bb449c681024253d0275065f0f0e258e7ec2977d12","tests/unit.rs":"7e450599b52900baa51ea26ff0cb55a830456f60642985abbc87ec671a91b8e1","tests/unitbis.rs":"545259b767cd045aed01c1515c3b092d1b3f6b3366ce88d1593a2c8e3ffcd2af","tests/uts46.rs":"0a1c339708f1ab845d726b1f55dc1be8a423a1304b0399234391d0bd419e3fe0"},"package":"bd69211b9b519e98303c015e21a007e293db403b6c85b9b124e133d25e242cdd"}
@ -11,13 +11,18 @@

[package]
edition = "2018"
rust-version = "1.51"
rust-version = "1.67"
name = "idna"
version = "0.5.0"
version = "1.0.2"
authors = ["The rust-url developers"]
autotests = false
description = "IDNA (Internationalizing Domain Names in Applications) and Punycode."
categories = ["no_std"]
readme = "README.md"
keywords = [
    "no_std",
    "web",
    "http",
]
license = "MIT OR Apache-2.0"
repository = "https://github.com/servo/rust-url/"

@ -34,18 +39,25 @@ harness = false
[[test]]
name = "unit"

[[test]]
name = "unitbis"

[[bench]]
name = "all"
harness = false

[dependencies.unicode-bidi]
version = "0.3.10"
features = ["hardcoded-data"]
default-features = false
[dependencies.icu_normalizer]
version = "1.4.3"

[dependencies.unicode-normalization]
version = "0.1.22"
default-features = false
[dependencies.icu_properties]
version = "1.4.2"

[dependencies.smallvec]
version = "1.13.1"
features = ["const_generics"]

[dependencies.utf8_iter]
version = "1.0.4"

[dev-dependencies.assert_matches]
version = "1.3"

@ -61,9 +73,12 @@ version = "0.9"

[features]
alloc = []
default = ["std"]
std = [
    "alloc",
    "unicode-bidi/std",
    "unicode-normalization/std",
compiled_data = [
    "icu_normalizer/compiled_data",
    "icu_properties/compiled_data",
]
default = [
    "std",
    "compiled_data",
]
std = ["alloc"]
@ -0,0 +1,38 @@
# `idna`

IDNA library for Rust implementing [UTS 46: Unicode IDNA Compatibility Processing](https://www.unicode.org/reports/tr46/) as parametrized by the [WHATWG URL Standard](https://url.spec.whatwg.org/#idna).

## What it does

* An implementation of UTS 46 is provided, with configurable ASCII deny list (e.g. STD3 or WHATWG rules).
* A callback mechanism is provided for pluggable logic for deciding if a label is deemed potentially too misleading to render as Unicode in a user interface.
* Errors are marked as U+FFFD REPLACEMENT CHARACTERs in Unicode output so that locations of errors may be illustrated to the user.

## What it does not do

* There is no default/sample policy provided for the callback mechanism mentioned above.
* Only UTS 46 is implemented: There is no API to request strictly IDNA 2008 only or strictly IDNA 2003 only.
* There is no API for categorizing errors beyond there being an error.
* Checks that are configurable in UTS 46 but that the WHATWG URL Standard always sets a particular way (regardless of the _beStrict_ flag in the URL Standard) cannot be configured (with the exception of the old deprecated API supporting transitional processing).

## Usage

Apps that need to prepare a hostname for usage in protocols are likely to only need the top-level function `domain_to_ascii_cow` with `AsciiDenyList::URL` as the second argument. Note that this rejects IPv6 addresses, so before this, you need to check if the first byte of the input is `b'['` and, if it is, treat the input as an IPv6 address instead.

Apps that need to display host names to the user should use `uts46::Uts46::to_user_interface`. The _ToUnicode_ operation is rarely appropriate for direct application usage.
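A minimal sketch of that flow (the helper name is hypothetical; the `domain_to_ascii_cow` signature is as documented in this crate):

```rust
use idna::{domain_to_ascii_cow, AsciiDenyList};

/// Hypothetical helper: prepare a non-IPv6 host for use in a protocol.
fn prepare_host(input: &[u8]) -> Result<String, idna::Errors> {
    // IPv6 literals (first byte b'[') must be detected and handled before this point.
    assert_ne!(input.first(), Some(&b'['), "IPv6 literals need separate handling");
    domain_to_ascii_cow(input, AsciiDenyList::URL).map(|cow| cow.into_owned())
}

fn main() {
    assert_eq!(prepare_host(b"ExAmPlE.com").unwrap(), "example.com");
}
```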
## Cargo features

* `alloc` - For future proofing. Currently always required. Currently, the crate internals may allocate on the heap, but for typical inputs they do not (apart from the output `String` when applicable).
* `compiled_data` - For future proofing. Currently always required. (Passed through to ICU4X.)
* `std` - Adds `impl std::error::Error for Errors {}` (and implies `alloc`).
* By default, all of the above are enabled.

## Breaking changes since 0.5.0

* Stricter IDNA 2008 restrictions are no longer supported. Attempting to enable them panics immediately. UTS 46 allows all the names that IDNA 2008 allows, and when transitional processing is disabled, they resolve the same way. There are additional names that IDNA 2008 disallows but UTS 46 maps to names that IDNA 2008 allows (notably, input is mapped to fold-case output). UTS 46 also allows symbols that were allowed in IDNA 2003 as well as newer symbols that are allowed according to the same principle. (Earlier versions of this crate allowed rejecting such symbols. Rejecting characters that UTS 46 maps to IDNA 2008-permitted characters wasn't supported in earlier versions, either.)
* `domain_to_ascii_strict` now performs the _CheckHyphens_ check (matching previous documentation).
* The ContextJ rules are now implemented and always enabled, even when using the old deprecated API, so input that fails those rules is rejected.
* The `Idna::to_ascii_inner` method has been removed. It didn't make sense as a public method, since callers were unable to figure out if there were errors. (A GitHub search found no callers for this method.)
* Punycode labels whose decoding does not yield any non-ASCII characters are now treated as being in error.
* When turning off default cargo features, the cargo feature `compiled_data` needs to be explicitly enabled.
@ -1,3 +1,5 @@
#![allow(deprecated)]

#[macro_use]
extern crate bencher;
extern crate idna;

@ -11,6 +13,12 @@ fn to_unicode_puny_label(bench: &mut Bencher) {
    bench.iter(|| config.to_unicode(black_box(encoded)));
}

fn to_ascii_already_puny_label(bench: &mut Bencher) {
    let encoded = "abc.xn--mgbcm";
    let config = Config::default();
    bench.iter(|| config.to_ascii(black_box(encoded)));
}

fn to_unicode_ascii(bench: &mut Bencher) {
    let encoded = "example.com";
    let config = Config::default();

@ -47,6 +55,7 @@ benchmark_group!(
    to_unicode_ascii,
    to_unicode_merged_label,
    to_ascii_puny_label,
    to_ascii_already_puny_label,
    to_ascii_simple,
    to_ascii_merged,
);

(Diff between files not shown because of its large size.)
@ -0,0 +1,248 @@
// Copyright 2013-2014 The rust-url developers.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

//! Deprecated API for [*Unicode IDNA Compatibility Processing*
//! (Unicode Technical Standard #46)](http://www.unicode.org/reports/tr46/)

#![allow(deprecated)]

use alloc::borrow::Cow;
use alloc::string::String;

use crate::uts46::*;
use crate::Errors;

/// Performs preprocessing equivalent to UTS 46 transitional processing
/// if `transitional` is `true`. If `transitional` is `false`, merely
/// lets the input pass through as-is (for call site convenience).
///
/// The output of this function is to be passed to [`Uts46::process`].
fn map_transitional(domain: &str, transitional: bool) -> Cow<'_, str> {
    if !transitional {
        return Cow::Borrowed(domain);
    }
    let mut chars = domain.chars();
    loop {
        let prev = chars.clone();
        if let Some(c) = chars.next() {
            match c {
                'ß' | 'ẞ' | 'ς' | '\u{200C}' | '\u{200D}' => {
                    let mut s = String::with_capacity(domain.len());
                    let tail = prev.as_str();
                    let head = &domain[..domain.len() - tail.len()];
                    s.push_str(head);
                    for c in tail.chars() {
                        match c {
                            'ß' | 'ẞ' => {
                                s.push_str("ss");
                            }
                            'ς' => {
                                s.push('σ');
                            }
                            '\u{200C}' | '\u{200D}' => {}
                            _ => {
                                s.push(c);
                            }
                        }
                    }
                    return Cow::Owned(s);
                }
                _ => {}
            }
        } else {
            break;
        }
    }
    Cow::Borrowed(domain)
}
/// Deprecated. Use the crate-top-level functions or [`Uts46`].
#[derive(Default)]
#[deprecated]
pub struct Idna {
    config: Config,
}

impl Idna {
    pub fn new(config: Config) -> Self {
        Self { config }
    }

    /// [UTS 46 ToASCII](http://www.unicode.org/reports/tr46/#ToASCII)
    #[allow(clippy::wrong_self_convention)] // Retain old weirdness in deprecated API
    pub fn to_ascii(&mut self, domain: &str, out: &mut String) -> Result<(), Errors> {
        let mapped = map_transitional(domain, self.config.transitional_processing);
        match Uts46::new().process(
            mapped.as_bytes(),
            self.config.deny_list(),
            self.config.hyphens(),
            ErrorPolicy::FailFast, // Old code did not appear to expect the output to be useful in the error case.
            |_, _, _| false,
            out,
            None,
        ) {
            Ok(ProcessingSuccess::Passthrough) => {
                if self.config.verify_dns_length && !verify_dns_length(&mapped, true) {
                    return Err(crate::Errors::default());
                }
                out.push_str(&mapped);
                Ok(())
            }
            Ok(ProcessingSuccess::WroteToSink) => {
                if self.config.verify_dns_length && !verify_dns_length(out, true) {
                    return Err(crate::Errors::default());
                }
                Ok(())
            }
            Err(ProcessingError::ValidityError) => Err(crate::Errors::default()),
            Err(ProcessingError::SinkError) => unreachable!(),
        }
    }

    /// [UTS 46 ToUnicode](http://www.unicode.org/reports/tr46/#ToUnicode)
    #[allow(clippy::wrong_self_convention)] // Retain old weirdness in deprecated API
    pub fn to_unicode(&mut self, domain: &str, out: &mut String) -> Result<(), Errors> {
        let mapped = map_transitional(domain, self.config.transitional_processing);
        match Uts46::new().process(
            mapped.as_bytes(),
            self.config.deny_list(),
            self.config.hyphens(),
            ErrorPolicy::MarkErrors,
            |_, _, _| true,
            out,
            None,
        ) {
            Ok(ProcessingSuccess::Passthrough) => {
                out.push_str(&mapped);
                Ok(())
            }
            Ok(ProcessingSuccess::WroteToSink) => Ok(()),
            Err(ProcessingError::ValidityError) => Err(crate::Errors::default()),
            Err(ProcessingError::SinkError) => unreachable!(),
        }
    }
}

/// Deprecated configuration API.
#[derive(Clone, Copy)]
#[must_use]
#[deprecated]
pub struct Config {
    use_std3_ascii_rules: bool,
    transitional_processing: bool,
    verify_dns_length: bool,
    check_hyphens: bool,
}

/// The defaults are that of _beStrict=false_ in the [WHATWG URL Standard](https://url.spec.whatwg.org/#idna)
impl Default for Config {
    fn default() -> Self {
        Config {
            use_std3_ascii_rules: false,
            transitional_processing: false,
            check_hyphens: false,
            // Only use for to_ascii, not to_unicode
            verify_dns_length: false,
        }
    }
}

impl Config {
    /// Whether to enforce STD3 or WHATWG URL Standard ASCII deny list.
    ///
    /// `true` for STD3, `false` for no deny list.
    ///
    /// Note that `true` rejects pseudo-hosts used by various TXT record-based protocols.
    #[inline]
    pub fn use_std3_ascii_rules(mut self, value: bool) -> Self {
        self.use_std3_ascii_rules = value;
        self
    }

    /// Whether to enable (deprecated) transitional processing.
    ///
    /// Note that Firefox, Safari, and Chrome do not use transitional
    /// processing.
    #[inline]
    pub fn transitional_processing(mut self, value: bool) -> Self {
        self.transitional_processing = value;
        self
    }

    /// Whether the _VerifyDNSLength_ operation should be performed
    /// by `to_ascii`.
    ///
    /// For compatibility with previous behavior, even when set to `true`,
    /// the trailing root label dot is allowed contrary to the spec.
    #[inline]
    pub fn verify_dns_length(mut self, value: bool) -> Self {
        self.verify_dns_length = value;
        self
    }

    /// Whether to enforce STD3 rules for hyphen placement.
    ///
    /// `true` to deny hyphens in the first and last positions.
    /// `false` to not enforce hyphen placement.
    ///
    /// Note that for backward compatibility this is not the same as
    /// UTS 46 _CheckHyphens_, which also disallows hyphens in the
    /// third and fourth positions.
    ///
    /// Note that `true` rejects real-world names, including some GitHub user pages.
    #[inline]
    pub fn check_hyphens(mut self, value: bool) -> Self {
        self.check_hyphens = value;
        self
    }

    /// Obsolete method retained to ease migration. The argument must be `false`.
    ///
    /// # Panics
    ///
    /// If the argument is `true`.
    #[inline]
    #[allow(unused_mut)]
    pub fn use_idna_2008_rules(mut self, value: bool) -> Self {
        assert!(!value, "IDNA 2008 rules are no longer supported");
        self
    }

    /// Compute the deny list
    fn deny_list(&self) -> AsciiDenyList {
        if self.use_std3_ascii_rules {
            AsciiDenyList::STD3
        } else {
            AsciiDenyList::EMPTY
        }
    }

    /// Compute the hyphen mode
    fn hyphens(&self) -> Hyphens {
        if self.check_hyphens {
            Hyphens::CheckFirstLast
        } else {
            Hyphens::Allow
        }
    }

    /// [UTS 46 ToASCII](http://www.unicode.org/reports/tr46/#ToASCII)
    pub fn to_ascii(self, domain: &str) -> Result<String, Errors> {
        let mut result = String::with_capacity(domain.len());
        let mut codec = Idna::new(self);
        codec.to_ascii(domain, &mut result).map(|()| result)
    }

    /// [UTS 46 ToUnicode](http://www.unicode.org/reports/tr46/#ToUnicode)
    pub fn to_unicode(self, domain: &str) -> (String, Result<(), Errors>) {
        let mut codec = Idna::new(self);
        let mut out = String::with_capacity(domain.len());
        let result = codec.to_unicode(domain, &mut out);
        (out, result)
    }
}
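A sketch of what the deprecated API above does with and without transitional processing ("straße.de" is the classic deviation-character example; the expected outputs below are an illustration, not part of the vendored file):

#![allow(deprecated)]
use idna::Config;

fn transitional_demo() {
    // With transitional processing, map_transitional folds 'ß' to "ss"
    // before UTS 46 processing, so the result is plain ASCII.
    let transitional = Config::default()
        .transitional_processing(true)
        .to_ascii("straße.de")
        .unwrap();
    assert_eq!(transitional, "strasse.de");

    // Without it (the default, matching Firefox, Safari, and Chrome),
    // the label is Punycode-encoded instead.
    let nontransitional = Config::default().to_ascii("straße.de").unwrap();
    assert_eq!(nontransitional, "xn--strae-oqa.de");
}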
@ -42,45 +42,127 @@ extern crate alloc;
#[cfg(not(feature = "alloc"))]
compile_error!("the `alloc` feature must be enabled");

#[cfg(test)]
#[macro_use]
extern crate assert_matches;
// Avoid a breaking change if in the future there's a use case for
// having a Bring-Your-Own-ICU4X-Data constructor for `Uts46` and
// not also having compiled data in the binary.
#[cfg(not(feature = "compiled_data"))]
compile_error!("the `compiled_data` feature must be enabled");

use alloc::borrow::Cow;
use alloc::string::String;
pub use uts46::AsciiDenyList;
use uts46::Uts46;

mod deprecated;
pub mod punycode;
mod uts46;
pub mod uts46;

pub use crate::uts46::{Config, Errors, Idna};
#[allow(deprecated)]
pub use crate::deprecated::{Config, Idna};

/// The [domain to ASCII](https://url.spec.whatwg.org/#concept-domain-to-ascii) algorithm.
/// Type indicating that there were errors during UTS #46 processing.
#[derive(Default, Debug)]
#[non_exhaustive]
pub struct Errors {}

impl From<Errors> for Result<(), Errors> {
    fn from(e: Errors) -> Result<(), Errors> {
        Err(e)
    }
}

#[cfg(feature = "std")]
impl std::error::Error for Errors {}

impl core::fmt::Display for Errors {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        core::fmt::Debug::fmt(self, f)
    }
}

/// The [domain to ASCII](https://url.spec.whatwg.org/#concept-domain-to-ascii) algorithm;
/// version returning a `Cow`.
///
/// Most applications should be using this function rather than the sibling functions,
/// and most applications should pass [`AsciiDenyList::URL`] as the second argument.
/// Passing [`AsciiDenyList::URL`] as the second argument makes this function also
/// perform the [forbidden domain code point](https://url.spec.whatwg.org/#forbidden-domain-code-point)
/// check in addition to the [domain to ASCII](https://url.spec.whatwg.org/#concept-domain-to-ascii)
/// algorithm.
///
/// Returns the ASCII representation of a domain name,
/// normalizing characters (upper-case to lower-case and other kinds of equivalence)
/// and using Punycode as necessary.
///
/// This process may fail.
///
/// If you have a `&str` instead of `&[u8]`, just call `.as_bytes()` on it before
/// passing it to this function. It's still preferable to use this function over
/// the sibling functions that take `&str`.
pub fn domain_to_ascii_cow(
    domain: &[u8],
    ascii_deny_list: AsciiDenyList,
) -> Result<Cow<'_, str>, Errors> {
    Uts46::new().to_ascii(
        domain,
        ascii_deny_list,
        uts46::Hyphens::Allow,
        uts46::DnsLength::Ignore,
    )
}

/// The [domain to ASCII](https://url.spec.whatwg.org/#concept-domain-to-ascii) algorithm;
/// version returning `String` and no ASCII deny list (i.e. _UseSTD3ASCIIRules=false_).
///
/// This function exists for backward-compatibility. Consider using [`domain_to_ascii_cow`]
/// instead.
///
/// Return the ASCII representation of a domain name,
/// normalizing characters (upper-case to lower-case and other kinds of equivalence)
/// and using Punycode as necessary.
///
/// This process may fail.
pub fn domain_to_ascii(domain: &str) -> Result<String, uts46::Errors> {
    Config::default().to_ascii(domain)
pub fn domain_to_ascii(domain: &str) -> Result<String, Errors> {
    domain_to_ascii_cow(domain.as_bytes(), AsciiDenyList::EMPTY).map(|cow| cow.into_owned())
}

/// The [domain to ASCII](https://url.spec.whatwg.org/#concept-domain-to-ascii) algorithm,
/// with the `beStrict` flag set.
pub fn domain_to_ascii_strict(domain: &str) -> Result<String, uts46::Errors> {
    Config::default()
        .use_std3_ascii_rules(true)
        .verify_dns_length(true)
        .to_ascii(domain)
///
/// Note that this rejects various real-world names including:
/// * YouTube CDN nodes
/// * Some GitHub user pages
/// * Pseudo-hosts used by various TXT record-based protocols.
pub fn domain_to_ascii_strict(domain: &str) -> Result<String, Errors> {
    Uts46::new()
        .to_ascii(
            domain.as_bytes(),
            uts46::AsciiDenyList::STD3,
            uts46::Hyphens::Check,
            uts46::DnsLength::Verify,
        )
        .map(|cow| cow.into_owned())
}

/// The [domain to Unicode](https://url.spec.whatwg.org/#concept-domain-to-unicode) algorithm.
/// The [domain to Unicode](https://url.spec.whatwg.org/#concept-domain-to-unicode) algorithm;
/// version returning `String` and no ASCII deny list (i.e. _UseSTD3ASCIIRules=false_).
///
/// This function exists for backward-compatibility. Consider using [`Uts46::to_user_interface`]
/// or [`Uts46::to_unicode`].
///
/// Return the Unicode representation of a domain name,
/// normalizing characters (upper-case to lower-case and other kinds of equivalence)
/// and decoding Punycode as necessary.
///
/// This may indicate [syntax violations](https://url.spec.whatwg.org/#syntax-violation)
/// but always returns a string for the mapped domain.
pub fn domain_to_unicode(domain: &str) -> (String, Result<(), uts46::Errors>) {
    Config::default().to_unicode(domain)
/// If the second item of the tuple indicates an error, the first item of the tuple
/// denotes errors using REPLACEMENT CHARACTERs so that the locations of the errors
/// can be illustrated to the user. When the second item of the return tuple signals
/// an error, the first item of the tuple must not be used in a network protocol.
pub fn domain_to_unicode(domain: &str) -> (String, Result<(), Errors>) {
    let (cow, result) = Uts46::new().to_unicode(
        domain.as_bytes(),
        uts46::AsciiDenyList::EMPTY,
        uts46::Hyphens::Allow,
    );
    (cow.into_owned(), result)
}
@ -1,185 +0,0 @@
# Copyright 2013-2014 The rust-url developers.
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.

# Run as: python make_uts46_mapping_table.py IdnaMappingTable.txt > uts46_mapping_table.rs
# You can get the latest idna table from
# http://www.unicode.org/Public/idna/latest/IdnaMappingTable.txt

import collections
import itertools

print('''\
// Copyright 2013-2020 The rust-url developers.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// Generated by make_idna_table.py
''')

txt = open("IdnaMappingTable.txt")

def escape_char(c):
    return "\\u{%x}" % ord(c[0])

def char(s):
    return chr(int(s, 16))

strtab = collections.OrderedDict()
strtab_offset = 0

def strtab_slice(s):
    global strtab, strtab_offset

    if s in strtab:
        return strtab[s]
    else:
        utf8_len = len(s.encode('utf8'))
        c = (strtab_offset, utf8_len)
        strtab[s] = c
        strtab_offset += utf8_len
        return c

def rust_slice(s):
    start = s[0]
    length = s[1]
    start_lo = start & 0xff
    start_hi = start >> 8
    assert length <= 255
    assert start_hi <= 255
    return "(StringTableSlice { byte_start_lo: %d, byte_start_hi: %d, byte_len: %d })" % (start_lo, start_hi, length)

ranges = []

for line in txt:
    # remove comments
    line, _, _ = line.partition('#')
    # skip empty lines
    if len(line.strip()) == 0:
        continue
    fields = line.split(';')
    if fields[0].strip() == 'D800..DFFF':
        continue  # Surrogates don't occur in Rust strings.
    first, _, last = fields[0].strip().partition('..')
    if not last:
        last = first
    mapping = fields[1].strip().replace('_', ' ').title().replace(' ', '')
    unicode_str = None
    if len(fields) > 2:
        if fields[2].strip():
            unicode_str = u''.join(char(c) for c in fields[2].strip().split(' '))
        elif mapping == "Deviation":
            unicode_str = u''

    if len(fields) > 3:
        assert fields[3].strip() in ('NV8', 'XV8'), fields[3]
        assert mapping == 'Valid', mapping
        mapping = 'DisallowedIdna2008'

    ranges.append((first, last, mapping, unicode_str))

def mergeable_key(r):
    mapping = r[2]

    # These types have associated data, so we should not merge them.
    if mapping in ('Mapped', 'Deviation', 'DisallowedStd3Mapped'):
        return r
    assert mapping in ('Valid', 'Ignored', 'Disallowed', 'DisallowedStd3Valid', 'DisallowedIdna2008')
    return mapping

grouped_ranges = itertools.groupby(ranges, key=mergeable_key)

optimized_ranges = []

for (k, g) in grouped_ranges:
    group = list(g)
    if len(group) == 1:
        optimized_ranges.append(group[0])
        continue
    # Assert that nothing in the group has an associated unicode string.
    for g in group:
        if g[3] is not None and len(g[3]) > 2:
            assert not g[3][2].strip()
    # Assert that consecutive members of the group don't leave gaps in
    # the codepoint space.
    a, b = itertools.tee(group)
    next(b, None)
    for (g1, g2) in zip(a, b):
        last_char = int(g1[1], 16)
        next_char = int(g2[0], 16)
        if last_char + 1 == next_char:
            continue
        # There's a gap where surrogates would appear, but we don't have to
        # worry about that gap, as surrogates never appear in Rust strings.
        # Assert we're seeing the surrogate case here.
        assert last_char == 0xd7ff
        assert next_char == 0xe000
    optimized_ranges.append((group[0][0], group[-1][1]) + group[0][2:])

def is_single_char_range(r):
    (first, last, _, _) = r
    return first == last

# We can reduce the size of the character range table and the index table to about 1/4
# by merging runs of single character ranges and using character offsets from the start
# of that range to retrieve the correct `Mapping` value
def merge_single_char_ranges(ranges):
    current = []
    for r in ranges:
        if not current or is_single_char_range(current[-1]) and is_single_char_range(r):
            current.append(r)
            continue
        if len(current) != 0:
            ret = current
            current = [r]
            yield ret
            continue
        current.append(r)
        ret = current
        current = []
        yield ret
    yield current

optimized_ranges = list(merge_single_char_ranges(optimized_ranges))

SINGLE_MARKER = 1 << 15

print("static TABLE: &[(char, u16)] = &[")

offset = 0
for ranges in optimized_ranges:
    assert offset < SINGLE_MARKER

    block_len = len(ranges)
    single = SINGLE_MARKER if block_len == 1 else 0
    index = offset | single
    offset += block_len

    start = escape_char(char(ranges[0][0]))
    print("    ('%s', %s)," % (start, index))

print("];\n")

print("static MAPPING_TABLE: &[Mapping] = &[")

for ranges in optimized_ranges:
    for (first, last, mapping, unicode_str) in ranges:
        if unicode_str is not None:
            mapping += rust_slice(strtab_slice(unicode_str))
        print("    %s," % mapping)

print("];\n")

def escape_str(s):
    return [escape_char(c) for c in s]

print("static STRING_TABLE: &str = \"%s\";"
      % '\\\n  '.join(itertools.chain(*[escape_str(s) for s in strtab.keys()])))
|
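For orientation, a minimal sketch (added for illustration, not part of this patch) of the consumer side of the packing scheme above: the generated TABLE stores one (start_char, packed_index) pair per block, with the high bit marking a block that carries a single mapping entry. The generic parameter M stands in for the crate's `Mapping` enum, and the assumption that the first block starts at U+0000 matches how IdnaMappingTable.txt begins.

// Hypothetical lookup against the tables this script emits (illustration only).
const SINGLE_MARKER: u16 = 1 << 15;

fn find_mapping<'a, M>(table: &'a [(char, u16)], mapping_table: &'a [M], c: char) -> &'a M {
    // Find the last block whose first character is <= `c`; assumes the first
    // block starts at '\u{0}', as the generated table does.
    let i = match table.binary_search_by_key(&c, |&(start, _)| start) {
        Ok(i) => i,
        Err(i) => i - 1,
    };
    let (start, packed) = table[i];
    let offset = (packed & !SINGLE_MARKER) as usize;
    if packed & SINGLE_MARKER != 0 {
        // One mapping entry covers the whole block.
        &mapping_table[offset]
    } else {
        // A merged run of single-character ranges: index by the distance
        // from the run's first character.
        &mapping_table[offset + (c as usize - start as usize)]
    }
}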
@ -15,17 +15,17 @@
use alloc::{string::String, vec::Vec};
use core::char;
use core::u32;
use core::fmt::Write;
use core::marker::PhantomData;

// Bootstring parameters for Punycode
static BASE: u32 = 36;
static T_MIN: u32 = 1;
static T_MAX: u32 = 26;
static SKEW: u32 = 38;
static DAMP: u32 = 700;
static INITIAL_BIAS: u32 = 72;
static INITIAL_N: u32 = 0x80;
static DELIMITER: char = '-';
const BASE: u32 = 36;
const T_MIN: u32 = 1;
const T_MAX: u32 = 26;
const SKEW: u32 = 38;
const DAMP: u32 = 700;
const INITIAL_BIAS: u32 = 72;
const INITIAL_N: u32 = 0x80;

#[inline]
fn adapt(mut delta: u32, num_points: u32, first_time: bool) -> u32 {

@ -41,10 +41,17 @@ fn adapt(mut delta: u32, num_points: u32, first_time: bool) -> u32 {
/// Convert Punycode to a Unicode `String`.
///
/// This is a convenience wrapper around `decode`.
/// Return None on malformed input or overflow.
/// Overflow can only happen on inputs that take more than
/// 63 encoded bytes, the DNS limit on domain name labels.
#[inline]
pub fn decode_to_string(input: &str) -> Option<String> {
    decode(input).map(|chars| chars.into_iter().collect())
    Some(
        Decoder::default()
            .decode::<u8, ExternalCaller>(input.as_bytes())
            .ok()?
            .collect(),
    )
}
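// Added usage note (not in the vendored source): these public helpers
// round-trip RFC 3492-style labels, e.g.
//
//     assert_eq!(decode_to_string("Mnchen-3ya").as_deref(), Some("München"));
//     assert_eq!(encode_str("münchen").as_deref(), Some("mnchen-3ya"));
//
// Basic (ASCII) code points such as the capital "M" pass through unchanged.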
/// Convert Punycode to Unicode.

@ -53,33 +60,130 @@ pub fn decode_to_string(input: &str) -> Option<String> {
/// Overflow can only happen on inputs that take more than
/// 63 encoded bytes, the DNS limit on domain name labels.
pub fn decode(input: &str) -> Option<Vec<char>> {
    Some(Decoder::default().decode(input).ok()?.collect())
    Some(
        Decoder::default()
            .decode::<u8, ExternalCaller>(input.as_bytes())
            .ok()?
            .collect(),
    )
}

/// Marker for internal vs. external caller to retain old API behavior
/// while tweaking behavior for internal callers.
///
/// External callers need overflow checks when encoding, but internal
/// callers don't, because `PUNYCODE_ENCODE_MAX_INPUT_LENGTH` is set
/// to 1000, and per RFC 3492 section 6.4, the integer variable does
/// not need to be able to represent values larger than
/// (char::MAX - INITIAL_N) * (PUNYCODE_ENCODE_MAX_INPUT_LENGTH + 1),
/// which is less than u32::MAX.
///
/// External callers need to handle upper-case ASCII when decoding,
/// but internal callers don't, because the internal code calls the
/// decoder only with lower-case inputs.
pub(crate) trait PunycodeCaller {
    const EXTERNAL_CALLER: bool;
}

pub(crate) struct InternalCaller;

impl PunycodeCaller for InternalCaller {
    const EXTERNAL_CALLER: bool = false;
}

struct ExternalCaller;

impl PunycodeCaller for ExternalCaller {
    const EXTERNAL_CALLER: bool = true;
}
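// Added sketch (not in the vendored source): a compile-time check of the
// bound cited in the doc comment above, assuming the internal input cap of
// 1000 code points that the doc comment mentions.
const _: () = {
    const PUNYCODE_ENCODE_MAX_INPUT_LENGTH: u64 = 1000;
    // (char::MAX - INITIAL_N) * (max input length + 1) must fit in u32.
    assert!((char::MAX as u64 - 0x80) * (PUNYCODE_ENCODE_MAX_INPUT_LENGTH + 1) <= u32::MAX as u64);
};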
pub(crate) trait PunycodeCodeUnit {
    fn is_delimiter(&self) -> bool;
    fn is_ascii(&self) -> bool;
    fn digit(&self) -> Option<u32>;
    fn char(&self) -> char;
    fn char_ascii_lower_case(&self) -> char;
}

impl PunycodeCodeUnit for u8 {
    fn is_delimiter(&self) -> bool {
        *self == b'-'
    }
    fn is_ascii(&self) -> bool {
        *self < 0x80
    }
    fn digit(&self) -> Option<u32> {
        let byte = *self;
        Some(match byte {
            byte @ b'0'..=b'9' => byte - b'0' + 26,
            byte @ b'A'..=b'Z' => byte - b'A',
            byte @ b'a'..=b'z' => byte - b'a',
            _ => return None,
        } as u32)
    }
    fn char(&self) -> char {
        char::from(*self)
    }
    fn char_ascii_lower_case(&self) -> char {
        char::from(self.to_ascii_lowercase())
    }
}
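// Added note: `digit` implements the RFC 3492 code point to digit mapping,
// so b'a'..=b'z' and b'A'..=b'Z' both map to 0..=25 and b'0'..=b'9' maps
// to 26..=35.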
impl PunycodeCodeUnit for char {
    fn is_delimiter(&self) -> bool {
        *self == '-'
    }
    fn is_ascii(&self) -> bool {
        debug_assert!(false); // Unused
        true
    }
    fn digit(&self) -> Option<u32> {
        let byte = *self;
        Some(match byte {
            byte @ '0'..='9' => u32::from(byte) - u32::from('0') + 26,
            // byte @ 'A'..='Z' => u32::from(byte) - u32::from('A'), // XXX not needed if no public input
            byte @ 'a'..='z' => u32::from(byte) - u32::from('a'),
            _ => return None,
        })
    }
    fn char(&self) -> char {
        debug_assert!(false); // Unused
        *self
    }
    fn char_ascii_lower_case(&self) -> char {
        // No need to actually lower-case!
        *self
    }
}

#[derive(Default)]
pub(crate) struct Decoder {
    insertions: Vec<(usize, char)>,
    insertions: smallvec::SmallVec<[(usize, char); 59]>,
}
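// Added note on the inline capacity above: 59 is the longest label body that
// still fits the DNS 63-octet label limit once the 4-octet "xn--" prefix is
// counted, so conforming labels should not spill this SmallVec to the heap.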
impl Decoder {
    /// Split the input iterator and return a Vec with insertions of encoded characters
    pub(crate) fn decode<'a>(&'a mut self, input: &'a str) -> Result<Decode<'a>, ()> {
    pub(crate) fn decode<'a, T: PunycodeCodeUnit + Copy, C: PunycodeCaller>(
        &'a mut self,
        input: &'a [T],
    ) -> Result<Decode<'a, T, C>, ()> {
        self.insertions.clear();
        // Handle "basic" (ASCII) code points.
        // They are encoded as-is before the last delimiter, if any.
        let (base, input) = match input.rfind(DELIMITER) {
            None => ("", input),
            Some(position) => (
        let (base, input) = if let Some(position) = input.iter().rposition(|c| c.is_delimiter()) {
            (
                &input[..position],
                if position > 0 {
                    &input[position + 1..]
                } else {
                    input
                },
            ),
            )
        } else {
            (&input[..0], input)
        };

        if !base.is_ascii() {
        if C::EXTERNAL_CALLER && !base.iter().all(|c| c.is_ascii()) {
            return Err(());
        }

@ -87,8 +191,8 @@ impl Decoder {
        let mut length = base_len as u32;
        let mut code_point = INITIAL_N;
        let mut bias = INITIAL_BIAS;
        let mut i = 0;
        let mut iter = input.bytes();
        let mut i = 0u32;
        let mut iter = input.iter();
        loop {
            let previous_i = i;
            let mut weight = 1;

@ -101,16 +205,13 @@ impl Decoder {
            // Decode a generalized variable-length integer into delta,
            // which gets added to i.
            loop {
                let digit = match byte {
                    byte @ b'0'..=b'9' => byte - b'0' + 26,
                    byte @ b'A'..=b'Z' => byte - b'A',
                    byte @ b'a'..=b'z' => byte - b'a',
                    _ => return Err(()),
                } as u32;
                if digit > (u32::MAX - i) / weight {
                    return Err(()); // Overflow
                }
                i += digit * weight;
                let digit = if let Some(digit) = byte.digit() {
                    digit
                } else {
                    return Err(());
                };
                let product = digit.checked_mul(weight).ok_or(())?;
                i = i.checked_add(product).ok_or(())?;
                let t = if k <= bias {
                    T_MIN
                } else if k >= bias + T_MAX {

@ -121,10 +222,7 @@ impl Decoder {
                if digit < t {
                    break;
                }
                if weight > u32::MAX / (BASE - t) {
                    return Err(()); // Overflow
                }
                weight *= BASE - t;
                weight = weight.checked_mul(BASE - t).ok_or(())?;
                k += BASE;
                byte = match iter.next() {
                    None => return Err(()), // End of input before the end of this delta

@ -133,13 +231,10 @@ impl Decoder {
            }

            bias = adapt(i - previous_i, length + 1, previous_i == 0);
            if i / (length + 1) > u32::MAX - code_point {
                return Err(()); // Overflow
            }

            // i was supposed to wrap around from length+1 to 0,
            // incrementing code_point each time.
            code_point += i / (length + 1);
            code_point = code_point.checked_add(i / (length + 1)).ok_or(())?;
            i %= length + 1;
            let c = match char::from_u32(code_point) {
                Some(c) => c,

@ -159,24 +254,30 @@ impl Decoder {

        self.insertions.sort_by_key(|(i, _)| *i);
        Ok(Decode {
            base: base.chars(),
            base: base.iter(),
            insertions: &self.insertions,
            inserted: 0,
            position: 0,
            len: base_len + self.insertions.len(),
            phantom: PhantomData::<C>,
        })
    }
}

pub(crate) struct Decode<'a> {
    base: core::str::Chars<'a>,
pub(crate) struct Decode<'a, T, C>
where
    T: PunycodeCodeUnit + Copy,
    C: PunycodeCaller,
{
    base: core::slice::Iter<'a, T>,
    pub(crate) insertions: &'a [(usize, char)],
    inserted: usize,
    position: usize,
    len: usize,
    phantom: PhantomData<C>,
}

impl<'a> Iterator for Decode<'a> {
impl<'a, T: PunycodeCodeUnit + Copy, C: PunycodeCaller> Iterator for Decode<'a, T, C> {
    type Item = char;

    fn next(&mut self) -> Option<Self::Item> {

@ -191,7 +292,11 @@ impl<'a> Iterator for Decode<'a> {
        }
        if let Some(c) = self.base.next() {
            self.position += 1;
            return Some(c);
            return Some(if C::EXTERNAL_CALLER {
                c.char()
            } else {
                c.char_ascii_lower_case()
            });
        } else if self.inserted >= self.insertions.len() {
            return None;
        }

@ -204,7 +309,7 @@ impl<'a> Iterator for Decode<'a> {
    }
}

impl<'a> ExactSizeIterator for Decode<'a> {
impl<'a, T: PunycodeCodeUnit + Copy, C: PunycodeCaller> ExactSizeIterator for Decode<'a, T, C> {
    fn len(&self) -> usize {
        self.len - self.position
    }

@ -219,7 +324,9 @@ pub fn encode_str(input: &str) -> Option<String> {
        return None;
    }
    let mut buf = String::with_capacity(input.len());
    encode_into(input.chars(), &mut buf).ok().map(|()| buf)
    encode_into::<_, _, ExternalCaller>(input.chars(), &mut buf)
        .ok()
        .map(|()| buf)
}

/// Convert Unicode to Punycode.

@ -231,30 +338,58 @@ pub fn encode(input: &[char]) -> Option<String> {
        return None;
    }
    let mut buf = String::with_capacity(input.len());
    encode_into(input.iter().copied(), &mut buf)
    encode_into::<_, _, ExternalCaller>(input.iter().copied(), &mut buf)
        .ok()
        .map(|()| buf)
}

pub(crate) fn encode_into<I>(input: I, output: &mut String) -> Result<(), ()>
pub(crate) enum PunycodeEncodeError {
    Overflow,
    Sink,
}

impl From<core::fmt::Error> for PunycodeEncodeError {
    fn from(_: core::fmt::Error) -> Self {
        PunycodeEncodeError::Sink
    }
}
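// Added note: `Sink` propagates errors from the generic `core::fmt::Write`
// output; for the `String` buffers used by `encode_str` and `encode` above,
// writes cannot fail, so only `Overflow` is observable through the public API.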
pub(crate) fn encode_into<I, W, C>(input: I, output: &mut W) -> Result<(), PunycodeEncodeError>
where
    I: Iterator<Item = char> + Clone,
    W: Write + ?Sized,
    C: PunycodeCaller,
{
    // Handle "basic" (ASCII) code points. They are encoded as-is.
    let (mut input_length, mut basic_length) = (0u32, 0);
    for c in input.clone() {
        input_length = input_length.checked_add(1).ok_or(())?;
        input_length = input_length
            .checked_add(1)
            .ok_or(PunycodeEncodeError::Overflow)?;
        if c.is_ascii() {
            output.push(c);
            output.write_char(c)?;
            basic_length += 1;
        }
    }

    if !C::EXTERNAL_CALLER {
        // We should never get an overflow here with the internal caller being
        // length-limited, but let's check anyway once here trusting the math
        // from RFC 3492 section 6.4 and then omit the overflow checks in the
        // loop below.
        let len_plus_one = input_length
            .checked_add(1)
            .ok_or(PunycodeEncodeError::Overflow)?;
        len_plus_one
            .checked_mul(u32::from(char::MAX) - INITIAL_N)
            .ok_or(PunycodeEncodeError::Overflow)?;
    }

    if basic_length > 0 {
        output.push('-')
        output.write_char('-')?;
    }
    let mut code_point = INITIAL_N;
    let mut delta = 0;
    let mut delta = 0u32;
    let mut bias = INITIAL_BIAS;
    let mut processed = basic_length;
    while processed < input_length {

@ -266,16 +401,26 @@ where
            .filter(|&c| c >= code_point)
            .min()
            .unwrap();
        if min_code_point - code_point > (u32::MAX - delta) / (processed + 1) {
            return Err(()); // Overflow
        }
        // Increase delta to advance the decoder’s <code_point,i> state to <min_code_point,0>
        delta += (min_code_point - code_point) * (processed + 1);
        if C::EXTERNAL_CALLER {
            let product = (min_code_point - code_point)
                .checked_mul(processed + 1)
                .ok_or(PunycodeEncodeError::Overflow)?;
            delta = delta
                .checked_add(product)
                .ok_or(PunycodeEncodeError::Overflow)?;
        } else {
            delta += (min_code_point - code_point) * (processed + 1);
        }
        code_point = min_code_point;
        for c in input.clone() {
            let c = c as u32;
            if c < code_point {
                delta = delta.checked_add(1).ok_or(())?;
                if C::EXTERNAL_CALLER {
                    delta = delta.checked_add(1).ok_or(PunycodeEncodeError::Overflow)?;
                } else {
                    delta += 1;
                }
            }
            if c == code_point {
                // Represent delta as a generalized variable-length integer:

@ -293,11 +438,11 @@ where
                        break;
                    }
                    let value = t + ((q - t) % (BASE - t));
                    output.push(value_to_digit(value));
                    output.write_char(value_to_digit(value))?;
                    q = (q - t) / (BASE - t);
                    k += BASE;
                }
                output.push(value_to_digit(q));
                output.write_char(value_to_digit(q))?;
                bias = adapt(delta, processed + 1, processed == basic_length);
                delta = 0;
                processed += 1;

@ -323,6 +468,10 @@ fn value_to_digit(value: u32) -> char {
#[cfg(target_pointer_width = "64")]
fn huge_encode() {
    let mut buf = String::new();
    assert!(encode_into(std::iter::repeat('ß').take(u32::MAX as usize + 1), &mut buf).is_err());
    assert!(encode_into::<_, _, ExternalCaller>(
        std::iter::repeat('ß').take(u32::MAX as usize + 1),
        &mut buf
    )
    .is_err());
    assert_eq!(buf.len(), 0);
}
File diff not shown because it is too large
File diff not shown because it is too large
File diff not shown because it is too large
@ -0,0 +1,197 @@
// Copyright 2013-2014 The rust-url developers.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

#![allow(clippy::assigning_clones)]
#![allow(deprecated)]

use crate::test::TestFn;
use std::char;
use std::fmt::Write;

use idna::Errors;

pub fn collect_tests<F: FnMut(String, TestFn)>(add_test: &mut F) {
    // https://www.unicode.org/Public/idna/13.0.0/IdnaTestV2.txt
    for (i, line) in include_str!("IdnaTestV2.txt").lines().enumerate() {
        if line.is_empty() || line.starts_with('#') {
            continue;
        }

        // Remove comments
        let line = match line.find('#') {
            Some(index) => &line[0..index],
            None => line,
        };

        let mut pieces = line.split(';').map(|x| x.trim()).collect::<Vec<&str>>();
        let source = unescape(pieces.remove(0));

        // ToUnicode
        let mut to_unicode = unescape(pieces.remove(0));
        if to_unicode.is_empty() {
            to_unicode = source.clone();
        }
        let to_unicode_status = status(pieces.remove(0));

        // ToAsciiN
        let to_ascii_n = pieces.remove(0);
        let to_ascii_n = if to_ascii_n.is_empty() {
            to_unicode.clone()
        } else {
            to_ascii_n.to_owned()
        };
        let to_ascii_n_status = pieces.remove(0);
        let to_ascii_n_status = if to_ascii_n_status.is_empty() {
            to_unicode_status.clone()
        } else {
            status(to_ascii_n_status)
        };

        // ToAsciiT
        let to_ascii_t = pieces.remove(0);
        let to_ascii_t = if to_ascii_t.is_empty() {
            to_ascii_n.clone()
        } else {
            to_ascii_t.to_owned()
        };
        let to_ascii_t_status = pieces.remove(0);
        let to_ascii_t_status = if to_ascii_t_status.is_empty() {
            to_ascii_n_status.clone()
        } else {
            status(to_ascii_t_status)
        };

        let test_name = format!("UTS #46 (deprecated API) line {}", i + 1);
        add_test(
            test_name,
            TestFn::DynTestFn(Box::new(move || {
                let config = idna::Config::default()
                    .use_std3_ascii_rules(true)
                    .verify_dns_length(true)
                    .check_hyphens(true);

                // http://unicode.org/reports/tr46/#Deviations
                // applications that perform IDNA2008 lookup are not required to check
                // for these contexts, so we skip all tests annotated with C*

                // Everybody ignores V2
                // https://github.com/servo/rust-url/pull/240
                // https://github.com/whatwg/url/issues/53#issuecomment-181528158
                // http://www.unicode.org/review/pri317/

                // "The special error codes X3 and X4_2 are now returned where a toASCII error code
                // was formerly being generated in toUnicode due to an empty label."
                // This is not implemented yet, so we skip toUnicode X4_2 tests for now, too.

                let (to_unicode_value, to_unicode_result) =
                    config.transitional_processing(false).to_unicode(&source);
                let to_unicode_result = to_unicode_result.map(|()| to_unicode_value);
                check(
                    &source,
                    (&to_unicode, &to_unicode_status),
                    to_unicode_result,
                    |e| e == "X4_2" || e == "V2",
                );

                let to_ascii_n_result = config.transitional_processing(false).to_ascii(&source);
                check(
                    &source,
                    (&to_ascii_n, &to_ascii_n_status),
                    to_ascii_n_result,
                    |e| e == "V2",
                );

                let to_ascii_t_result = config.transitional_processing(true).to_ascii(&source);
                check(
                    &source,
                    (&to_ascii_t, &to_ascii_t_status),
                    to_ascii_t_result,
                    |e| e == "V2",
                );
            })),
        )
    }
}

#[allow(clippy::redundant_clone)]
fn check<F>(source: &str, expected: (&str, &[&str]), actual: Result<String, Errors>, ignore: F)
where
    F: Fn(&str) -> bool,
{
    if !expected.1.is_empty() {
        if !expected.1.iter().copied().any(ignore) {
            let res = actual.ok();
            assert_eq!(
                res.clone(),
                None,
                "Expected error {:?}. result: {} | source: {}",
                expected.1,
                res.unwrap(),
                source,
            );
        }
    } else {
        assert!(
            actual.is_ok(),
            "Couldn't parse {} | error: {:?}",
            source,
            actual.err().unwrap(),
        );
        assert_eq!(actual.unwrap(), expected.0, "source: {}", source);
    }
}

fn unescape(input: &str) -> String {
    let mut output = String::new();
    let mut chars = input.chars();
    loop {
        match chars.next() {
            None => return output,
            Some(c) => {
                if c == '\\' {
                    match chars.next().unwrap() {
                        '\\' => output.push('\\'),
                        'u' => {
                            let c1 = chars.next().unwrap().to_digit(16).unwrap();
                            let c2 = chars.next().unwrap().to_digit(16).unwrap();
                            let c3 = chars.next().unwrap().to_digit(16).unwrap();
                            let c4 = chars.next().unwrap().to_digit(16).unwrap();
                            match char::from_u32(((c1 * 16 + c2) * 16 + c3) * 16 + c4) {
                                Some(c) => output.push(c),
                                None => {
                                    write!(&mut output, "\\u{:X}{:X}{:X}{:X}", c1, c2, c3, c4)
                                        .expect("Could not write to output");
                                }
                            };
                        }
                        _ => panic!("Invalid test data input"),
                    }
                } else {
                    output.push(c);
                }
            }
        }
    }
}
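// Added example: for the test-data escape syntax handled above,
// unescape(r"fa\u00DF") yields "faß", and a doubled backslash collapses
// to a single literal backslash.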
fn status(status: &str) -> Vec<&str> {
    if status.is_empty() || status == "[]" {
        return Vec::new();
    }

    let mut result = status.split(", ").collect::<Vec<_>>();
    assert!(result[0].starts_with('['));
    result[0] = &result[0][1..];

    let idx = result.len() - 1;
    let last = &mut result[idx];
    assert!(last.ends_with(']'));
    *last = &last[..last.len() - 1];

    result
}
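// Added example: status("[C1, V2]") yields vec!["C1", "V2"], while both
// status("") and status("[]") yield an empty Vec.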
@ -7,7 +7,7 @@
// except according to those terms.

use crate::test::TestFn;
use idna::punycode::{decode, encode_str};
use idna::punycode::{decode, decode_to_string, encode_str};
use serde_json::map::Map;
use serde_json::Value;
use std::panic::catch_unwind;

@ -28,6 +28,17 @@ fn one_test(decoded: &str, encoded: &str) {
        }
    }

    match decode_to_string(encoded) {
        None => panic!("Decoding {} failed.", encoded),
        Some(result) => assert!(
            result == decoded,
            "Incorrect decoding of \"{}\":\n \"{}\"\n!= \"{}\"\n",
            encoded,
            result,
            decoded
        ),
    }

    match encode_str(decoded) {
        None => panic!("Encoding {} failed.", decoded),
        Some(result) => assert!(

@ -1,6 +1,6 @@
[
    {
        "description": "These tests are copied from https://github.com/bestiejs/punycode.js/blob/master/tests/tests.js , used under the MIT license.",
        "description": "These tests are copied from https://github.com/mathiasbynens/punycode.js/blob/main/tests/tests.js , used under the MIT license.",
        "decoded": "",
        "encoded": ""
    },

@ -1,5 +1,6 @@
use tester as test;

mod deprecated;
mod punycode;
mod uts46;

@ -19,6 +20,7 @@ fn main() {
        })
    };
    punycode::collect_tests(&mut add_test);
    deprecated::collect_tests(&mut add_test);
    uts46::collect_tests(&mut add_test);
}
test::test_main(&std::env::args().collect::<Vec<_>>(), tests, None)

@ -1,5 +1,6 @@
#![allow(deprecated)]

use assert_matches::assert_matches;
use unicode_normalization::char::is_combining_mark;

/// https://github.com/servo/rust-url/issues/373
#[test]

@ -28,15 +29,21 @@ fn test_punycode_prefix_without_length_check() {
        .check_hyphens(true)
        .use_std3_ascii_rules(true);

    assert_eq!(config.to_ascii("xn--").unwrap(), "");
    assert!(config.to_ascii("xn--").is_err());
    assert!(config.to_ascii("xn---").is_err());
    assert!(config.to_ascii("xn-----").is_err());
    assert_eq!(config.to_ascii("xn--.").unwrap(), ".");
    assert_eq!(config.to_ascii("xn--...").unwrap(), "...");
    assert_eq!(config.to_ascii(".xn--").unwrap(), ".");
    assert_eq!(config.to_ascii("...xn--").unwrap(), "...");
    assert_eq!(config.to_ascii("xn--.xn--").unwrap(), ".");
    assert_eq!(config.to_ascii("xn--.example.org").unwrap(), ".example.org");
    assert!(config.to_ascii("xn--.").is_err());
    assert!(config.to_ascii("xn--...").is_err());
    assert!(config.to_ascii(".xn--").is_err());
    assert!(config.to_ascii("...xn--").is_err());
    assert!(config.to_ascii("xn--.xn--").is_err());
    assert!(config.to_ascii("xn--.example.org").is_err());
}

#[test]
fn test_punycode_invalid_encoding() {
    let config = idna::Config::default();
    assert!(config.to_ascii("xn--55555577").is_err());
}

// http://www.unicode.org/reports/tr46/#Table_Example_Processing

@ -85,10 +92,10 @@ fn test_examples() {
fn test_v5() {
    let config = idna::Config::default()
        .verify_dns_length(true)
        .use_std3_ascii_rules(true);
        .use_std3_ascii_rules(true)
        .check_hyphens(true);

    // IdnaTest:784 蔏。𑰺
    assert!(is_combining_mark('\u{11C3A}'));
    assert!(config.to_ascii("\u{11C3A}").is_err());
    assert!(config.to_ascii("\u{850f}.\u{11C3A}").is_err());
    assert!(config.to_ascii("\u{850f}\u{ff61}\u{11C3A}").is_err());

@ -98,7 +105,8 @@ fn test_v5() {
fn test_v8_bidi_rules() {
    let config = idna::Config::default()
        .verify_dns_length(true)
        .use_std3_ascii_rules(true);
        .use_std3_ascii_rules(true)
        .check_hyphens(true);

    assert_eq!(config.to_ascii("abc").unwrap(), "abc");
    assert_eq!(config.to_ascii("123").unwrap(), "123");

@ -118,18 +126,11 @@ fn test_v8_bidi_rules() {
#[test]
fn emoji_domains() {
    // HOT BEVERAGE is allowed here...
    let config = idna::Config::default()
        .verify_dns_length(true)
        .use_std3_ascii_rules(true);
    assert_eq!(config.to_ascii("☕.com").unwrap(), "xn--53h.com");

    // ... but not here
    let config = idna::Config::default()
        .verify_dns_length(true)
        .use_std3_ascii_rules(true)
        .use_idna_2008_rules(true);
    let error = format!("{:?}", config.to_ascii("☕.com").unwrap_err());
    assert!(error.contains("disallowed_in_idna_2008"));
        .check_hyphens(true);
    assert_eq!(config.to_ascii("☕.com").unwrap(), "xn--53h.com");
}

#[test]
@ -0,0 +1,374 @@
use idna::uts46::AsciiDenyList;
use idna::uts46::DnsLength;
use idna::uts46::Hyphens;

/// https://github.com/servo/rust-url/issues/373
#[test]
fn test_punycode_prefix_with_length_check() {
    let config = idna::uts46::Uts46::new();

    assert!(config
        .to_ascii(
            b"xn--",
            AsciiDenyList::STD3,
            Hyphens::Check,
            DnsLength::Verify
        )
        .is_err());
    assert!(config
        .to_ascii(
            b"xn---",
            AsciiDenyList::STD3,
            Hyphens::Check,
            DnsLength::Verify
        )
        .is_err());
    assert!(config
        .to_ascii(
            b"xn-----",
            AsciiDenyList::STD3,
            Hyphens::Check,
            DnsLength::Verify,
        )
        .is_err());
    assert!(config
        .to_ascii(
            b"xn--.",
            AsciiDenyList::STD3,
            Hyphens::Check,
            DnsLength::Verify
        )
        .is_err());
    assert!(config
        .to_ascii(
            b"xn--...",
            AsciiDenyList::STD3,
            Hyphens::Check,
            DnsLength::Verify,
        )
        .is_err());
    assert!(config
        .to_ascii(
            b".xn--",
            AsciiDenyList::STD3,
            Hyphens::Check,
            DnsLength::Verify,
        )
        .is_err());
    assert!(config
        .to_ascii(
            b"...xn--",
            AsciiDenyList::STD3,
            Hyphens::Check,
            DnsLength::Verify,
        )
        .is_err());
    assert!(config
        .to_ascii(
            b"xn--.xn--",
            AsciiDenyList::STD3,
            Hyphens::Check,
            DnsLength::Verify,
        )
        .is_err());
    assert!(config
        .to_ascii(
            b"xn--.example.org",
            AsciiDenyList::STD3,
            Hyphens::Check,
            DnsLength::Verify,
        )
        .is_err());
}

/// https://github.com/servo/rust-url/issues/373
#[test]
fn test_punycode_prefix_without_length_check() {
    let config = idna::uts46::Uts46::new();

    assert!(config
        .to_ascii(
            b"xn--",
            AsciiDenyList::URL,
            Hyphens::Allow,
            DnsLength::Ignore
        )
        .is_err());
    assert!(config
        .to_ascii(
            b"xn---",
            AsciiDenyList::URL,
            Hyphens::Allow,
            DnsLength::Ignore
        )
        .is_err());
    assert!(config
        .to_ascii(
            b"xn-----",
            AsciiDenyList::URL,
            Hyphens::Allow,
            DnsLength::Ignore
        )
        .is_err());
    assert!(config
        .to_ascii(
            b"xn--.",
            AsciiDenyList::URL,
            Hyphens::Allow,
            DnsLength::Ignore
        )
        .is_err());
    assert!(config
        .to_ascii(
            b"xn--...",
            AsciiDenyList::URL,
            Hyphens::Allow,
            DnsLength::Ignore
        )
        .is_err());
    assert!(config
        .to_ascii(
            b".xn--",
            AsciiDenyList::URL,
            Hyphens::Allow,
            DnsLength::Ignore
        )
        .is_err());
    assert!(config
        .to_ascii(
            b"...xn--",
            AsciiDenyList::URL,
            Hyphens::Allow,
            DnsLength::Ignore
        )
        .is_err());
    assert!(config
        .to_ascii(
            b"xn--.xn--",
            AsciiDenyList::URL,
            Hyphens::Allow,
            DnsLength::Ignore
        )
        .is_err());
    assert!(config
        .to_ascii(
            b"xn--.example.org",
            AsciiDenyList::URL,
            Hyphens::Allow,
            DnsLength::Ignore
        )
        .is_err());
}
/*
// http://www.unicode.org/reports/tr46/#Table_Example_Processing
#[test]
fn test_examples() {
    let codec = idna::uts46bis::Uts46::new();
    let mut out = String::new();

    assert_matches!(codec.to_unicode("Bloß.de", &mut out), Ok(()));
    assert_eq!(out, "bloß.de");

    out.clear();
    assert_matches!(codec.to_unicode("xn--blo-7ka.de", &mut out), Ok(()));
    assert_eq!(out, "bloß.de");

    out.clear();
    assert_matches!(codec.to_unicode("u\u{308}.com", &mut out), Ok(()));
    assert_eq!(out, "ü.com");

    out.clear();
    assert_matches!(codec.to_unicode("xn--tda.com", &mut out), Ok(()));
    assert_eq!(out, "ü.com");

    out.clear();
    assert_matches!(codec.to_unicode("xn--u-ccb.com", &mut out), Err(_));

    out.clear();
    assert_matches!(codec.to_unicode("a⒈com", &mut out), Err(_));

    out.clear();
    assert_matches!(codec.to_unicode("xn--a-ecp.ru", &mut out), Err(_));

    out.clear();
    assert_matches!(codec.to_unicode("xn--0.pt", &mut out), Err(_));

    out.clear();
    assert_matches!(codec.to_unicode("日本語。JP", &mut out), Ok(()));
    assert_eq!(out, "日本語.jp");

    out.clear();
    assert_matches!(codec.to_unicode("☕.us", &mut out), Ok(()));
    assert_eq!(out, "☕.us");
}
*/

#[test]
fn test_v5() {
    let config = idna::uts46::Uts46::new();

    // IdnaTest:784 蔏。𑰺
    assert!(config
        .to_ascii(
            "\u{11C3A}".as_bytes(),
            AsciiDenyList::STD3,
            Hyphens::Check,
            DnsLength::Verify,
        )
        .is_err());
    assert!(config
        .to_ascii(
            "\u{850f}.\u{11C3A}".as_bytes(),
            AsciiDenyList::STD3,
            Hyphens::Check,
            DnsLength::Verify,
        )
        .is_err());
    assert!(config
        .to_ascii(
            "\u{850f}\u{ff61}\u{11C3A}".as_bytes(),
            AsciiDenyList::STD3,
            Hyphens::Check,
            DnsLength::Verify,
        )
        .is_err());
}

#[test]
fn test_v8_bidi_rules() {
    let config = idna::uts46::Uts46::new();

    assert_eq!(
        config
            .to_ascii(
                b"abc",
                AsciiDenyList::STD3,
                Hyphens::Check,
                DnsLength::Verify,
            )
            .unwrap(),
        "abc"
    );
    assert_eq!(
        config
            .to_ascii(
                b"123",
                AsciiDenyList::STD3,
                Hyphens::Check,
                DnsLength::Verify,
            )
            .unwrap(),
        "123"
    );
    assert_eq!(
        config
            .to_ascii(
                "אבּג".as_bytes(),
                AsciiDenyList::STD3,
                Hyphens::Check,
                DnsLength::Verify,
            )
            .unwrap(),
        "xn--kdb3bdf"
    );
    assert_eq!(
        config
            .to_ascii(
                "ابج".as_bytes(),
                AsciiDenyList::STD3,
                Hyphens::Check,
                DnsLength::Verify,
            )
            .unwrap(),
        "xn--mgbcm"
    );
    assert_eq!(
        config
            .to_ascii(
                "abc.ابج".as_bytes(),
                AsciiDenyList::STD3,
                Hyphens::Check,
                DnsLength::Verify,
            )
            .unwrap(),
        "abc.xn--mgbcm"
    );
    assert_eq!(
        config
            .to_ascii(
                "אבּג.ابج".as_bytes(),
                AsciiDenyList::STD3,
                Hyphens::Check,
                DnsLength::Verify,
            )
            .unwrap(),
        "xn--kdb3bdf.xn--mgbcm"
    );

    // Bidi domain names cannot start with digits
    assert!(config
        .to_ascii(
            "0a.\u{05D0}".as_bytes(),
            AsciiDenyList::STD3,
            Hyphens::Check,
            DnsLength::Verify,
        )
        .is_err());
    assert!(config
        .to_ascii(
            "0à.\u{05D0}".as_bytes(),
            AsciiDenyList::STD3,
            Hyphens::Check,
            DnsLength::Verify,
        )
        .is_err());

    // Bidi chars may be punycode-encoded
    assert!(config
        .to_ascii(
            b"xn--0ca24w",
            AsciiDenyList::STD3,
            Hyphens::Check,
            DnsLength::Verify,
        )
        .is_err());
}

#[test]
fn emoji_domains() {
    // HOT BEVERAGE is allowed here...
    let config = idna::uts46::Uts46::new();
    assert_eq!(
        config
            .to_ascii(
                "☕.com".as_bytes(),
                AsciiDenyList::STD3,
                Hyphens::Check,
                DnsLength::Verify,
            )
            .unwrap(),
        "xn--53h.com"
    );
}

#[test]
fn unicode_before_delimiter() {
    let config = idna::uts46::Uts46::new();
    assert!(config
        .to_ascii(
            "xn--f\u{34a}-PTP".as_bytes(),
            AsciiDenyList::STD3,
            Hyphens::Check,
            DnsLength::Verify,
        )
        .is_err());
}

#[test]
fn upper_case_ascii_in_punycode() {
    let config = idna::uts46::Uts46::new();
    let (unicode, result) =
        config.to_unicode("xn--A-1ga".as_bytes(), AsciiDenyList::STD3, Hyphens::Check);
    assert!(result.is_ok());
    assert_eq!(&unicode, "aö");
}
@ -6,10 +6,16 @@
// option. This file may not be copied, modified, or distributed
// except according to those terms.

#![allow(clippy::assigning_clones)]

use crate::test::TestFn;
use std::char;
use std::fmt::Write;

use idna::uts46::verify_dns_length;
use idna::uts46::ProcessingError;
use idna::uts46::ProcessingSuccess;
use idna::uts46::{AsciiDenyList, DnsLength, ErrorPolicy, Hyphens};
use idna::Errors;

pub fn collect_tests<F: FnMut(String, TestFn)>(add_test: &mut F) {

@ -49,28 +55,11 @@ pub fn collect_tests<F: FnMut(String, TestFn)>(add_test: &mut F) {
            status(to_ascii_n_status)
        };

        // ToAsciiT
        let to_ascii_t = pieces.remove(0);
        let to_ascii_t = if to_ascii_t.is_empty() {
            to_ascii_n.clone()
        } else {
            to_ascii_t.to_owned()
        };
        let to_ascii_t_status = pieces.remove(0);
        let to_ascii_t_status = if to_ascii_t_status.is_empty() {
            to_ascii_n_status.clone()
        } else {
            status(to_ascii_t_status)
        };

        let test_name = format!("UTS #46 line {}", i + 1);
        add_test(
            test_name,
            TestFn::DynTestFn(Box::new(move || {
                let config = idna::Config::default()
                    .use_std3_ascii_rules(true)
                    .verify_dns_length(true)
                    .check_hyphens(true);
                let config = idna::uts46::Uts46::new();

                // http://unicode.org/reports/tr46/#Deviations
                // applications that perform IDNA2008 lookup are not required to check

@ -86,29 +75,85 @@ pub fn collect_tests<F: FnMut(String, TestFn)>(add_test: &mut F) {
                // This is not implemented yet, so we skip toUnicode X4_2 tests for now, too.

                let (to_unicode_value, to_unicode_result) =
                    config.transitional_processing(false).to_unicode(&source);
                let to_unicode_result = to_unicode_result.map(|()| to_unicode_value);
                    config.to_unicode(source.as_bytes(), AsciiDenyList::STD3, Hyphens::Check);
                let to_unicode_result = to_unicode_result.map(|()| to_unicode_value.into_owned());
                check(
                    &source,
                    (&to_unicode, &to_unicode_status),
                    to_unicode_result,
                    |e| e.starts_with('C') || e == "V2" || e == "X4_2",
                    |e| e == "X4_2",
                );

                let to_ascii_n_result = config.transitional_processing(false).to_ascii(&source);
                let to_ascii_n_result = config.to_ascii(
                    source.as_bytes(),
                    AsciiDenyList::STD3,
                    Hyphens::Check,
                    DnsLength::VerifyAllowRootDot,
                );
                check(
                    &source,
                    (&to_ascii_n, &to_ascii_n_status),
                    to_ascii_n_result,
                    |e| e.starts_with('C') || e == "V2",
                    to_ascii_n_result.map(|cow| cow.into_owned()),
                    |_| false,
                );

                let to_ascii_t_result = config.transitional_processing(true).to_ascii(&source);
                let mut to_unicode_simultaneous = String::new();
                let mut to_ascii_simultaneous = String::new();
                let (to_unicode_simultaneous_result, to_ascii_simultaneous_result) = match config
                    .process(
                        source.as_bytes(),
                        AsciiDenyList::STD3,
                        Hyphens::Check,
                        ErrorPolicy::MarkErrors,
                        |_, _, _| true,
                        &mut to_unicode_simultaneous,
                        Some(&mut to_ascii_simultaneous),
                    ) {
                    Ok(ProcessingSuccess::Passthrough) => (
                        Ok(source.to_string()),
                        if verify_dns_length(&source, true) {
                            Ok(source.to_string())
                        } else {
                            Err(Errors::default())
                        },
                    ),
                    Ok(ProcessingSuccess::WroteToSink) => {
                        if to_ascii_simultaneous.is_empty() {
                            (
                                Ok(to_unicode_simultaneous.clone()),
                                if verify_dns_length(&to_unicode_simultaneous, true) {
                                    Ok(to_unicode_simultaneous)
                                } else {
                                    Err(Errors::default())
                                },
                            )
                        } else {
                            (
                                Ok(to_unicode_simultaneous),
                                if verify_dns_length(&to_ascii_simultaneous, true) {
                                    Ok(to_ascii_simultaneous)
                                } else {
                                    Err(Errors::default())
                                },
                            )
                        }
                    }
                    Err(ProcessingError::ValidityError) => {
                        (Err(Errors::default()), Err(Errors::default()))
                    }
                    Err(ProcessingError::SinkError) => unreachable!(),
                };
                check(
                    &source,
                    (&to_ascii_t, &to_ascii_t_status),
                    to_ascii_t_result,
                    |e| e.starts_with('C') || e == "V2",
                    (&to_unicode, &to_unicode_status),
                    to_unicode_simultaneous_result,
                    |e| e == "X4_2",
                );
                check(
                    &source,
                    (&to_ascii_n, &to_ascii_n_status),
                    to_ascii_simultaneous_result,
                    |_| false,
                );
            })),
        )
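// Added note on the control flow above, as the branching suggests:
// `Passthrough` means the input can be used verbatim as both outputs, and
// `WroteToSink` with an empty ASCII sink means the Unicode output was itself
// pure ASCII, so it doubles as the ToASCII result.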
@ -1 +0,0 @@
{"files":{"COPYRIGHT":"23860c2a7b5d96b21569afedf033469bab9fe14a1b24a35068b8641c578ce24d","Cargo.toml":"d43bfc158330a3a780af52ff0e82d88c8b54707ddf0469e6e27749c8ded4d1b7","LICENSE-APACHE":"a60eea817514531668d7e00765731449fe14d059d3249e0bc93b36de45f759f2","LICENSE-MIT":"7b63ecd5f1902af1b63729947373683c32745c16a10e8e6292e2e2dcd7e90ae0","README.md":"60162364e07490163f7a8e199c7a0a8ace165ae5aa7e4b6f16ff1617ddef5867","benches/bench.rs":"827e5343b059a732904be29717c2797203bfd0a633edf08042afea65372a3e2c","scripts/unicode.py":"6b1d9025fa9970c23b9721c6704aa085263408d645cf9c469295978010fd7504","src/__test_api.rs":"78e21bfa0b98894f545c8ed3e31cec20d7a48951a7f3ed69a6130c4b3d463aee","src/decompose.rs":"c0eb774843a545356e63bbcd7fb926f80d3c97ef4601ca3701fc34154f2e9905","src/lib.rs":"1983769ea083caa36b0736c87cf2a98e91c2b900f1d5dec64e327360fa862386","src/lookups.rs":"962f9909b32e02b8a2a05836135d9cd39bb1ce01f7c659de99cbd8a3a3c78574","src/no_std_prelude.rs":"602e81e67b8952b6571826f431e3b6787be3073bc10f38a0d3374278f81a6a1f","src/normalize.rs":"de2670b4437d335d42884af844a750f70e541467ecd34077dfe032103cb9b041","src/perfect_hash.rs":"400c84e2f467f61bd55d55d08672da6a9ad7a57c938ce5d0c701a6994b1b273b","src/quick_check.rs":"9756312d75fc31b67fca954e44a4812945a7e436b03ba18b9a2441f6de570f6f","src/recompose.rs":"a6228ad7561a5c7a1ef1d510159bdde1eea8a161007c80e470432e9b844d5536","src/replace.rs":"b24c904f3e00851a78820e30ddfa4ff10c795f8925fd0ee7f5870f31fdfa770b","src/stream_safe.rs":"383d71f0da401af8e735877e43855c7e16cb06deb2263539cdec2a407dbe257d","src/tables.rs":"3d9983a4e24c5b1e5dc272a025cdc729b7107f9a52a1fc89eca598e69af36c3a","src/test.rs":"3af8ad8c6bd2cc1ca44660bd265ad813c88d3074b448df4d9ff376b25fb77d26"},"package":"5c5713f0fc4b5db668a2ac63cdb7bb4469d8c9fed047b1d0292cc7b0ce2ba921"}

@ -1,7 +0,0 @@
Licensed under the Apache License, Version 2.0
<LICENSE-APACHE or
http://www.apache.org/licenses/LICENSE-2.0> or the MIT
license <LICENSE-MIT or http://opensource.org/licenses/MIT>,
at your option. All files in the project carrying such
notice may not be copied, modified, or distributed except
according to those terms.
@ -1,52 +0,0 @@
# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
#
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
# to registry (e.g., crates.io) dependencies.
#
# If you are reading this file be aware that the original Cargo.toml
# will likely look very different (and much more reasonable).
# See Cargo.toml.orig for the original contents.

[package]
edition = "2018"
name = "unicode-normalization"
version = "0.1.22"
authors = [
    "kwantam <kwantam@gmail.com>",
    "Manish Goregaokar <manishsmail@gmail.com>",
]
exclude = [
    "target/*",
    "Cargo.lock",
    "scripts/tmp",
    "*.txt",
    "tests/*",
]
description = """
This crate provides functions for normalization of
Unicode strings, including Canonical and Compatible
Decomposition and Recomposition, as described in
Unicode Standard Annex #15.
"""
homepage = "https://github.com/unicode-rs/unicode-normalization"
documentation = "https://docs.rs/unicode-normalization/"
readme = "README.md"
keywords = [
    "text",
    "unicode",
    "normalization",
    "decomposition",
    "recomposition",
]
license = "MIT/Apache-2.0"
repository = "https://github.com/unicode-rs/unicode-normalization"

[dependencies.tinyvec]
version = "1"
features = ["alloc"]

[features]
default = ["std"]
std = []
@ -1,201 +0,0 @@
                              Apache License
                        Version 2.0, January 2004
                     http://www.apache.org/licenses/

TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

1. Definitions.

   "License" shall mean the terms and conditions for use, reproduction,
   and distribution as defined by Sections 1 through 9 of this document.

   "Licensor" shall mean the copyright owner or entity authorized by
   the copyright owner that is granting the License.

   "Legal Entity" shall mean the union of the acting entity and all
   other entities that control, are controlled by, or are under common
   control with that entity. For the purposes of this definition,
   "control" means (i) the power, direct or indirect, to cause the
   direction or management of such entity, whether by contract or
   otherwise, or (ii) ownership of fifty percent (50%) or more of the
   outstanding shares, or (iii) beneficial ownership of such entity.

   "You" (or "Your") shall mean an individual or Legal Entity
   exercising permissions granted by this License.

   "Source" form shall mean the preferred form for making modifications,
   including but not limited to software source code, documentation
   source, and configuration files.

   "Object" form shall mean any form resulting from mechanical
   transformation or translation of a Source form, including but
   not limited to compiled object code, generated documentation,
   and conversions to other media types.

   "Work" shall mean the work of authorship, whether in Source or
   Object form, made available under the License, as indicated by a
   copyright notice that is included in or attached to the work
   (an example is provided in the Appendix below).

   "Derivative Works" shall mean any work, whether in Source or Object
   form, that is based on (or derived from) the Work and for which the
   editorial revisions, annotations, elaborations, or other modifications
   represent, as a whole, an original work of authorship. For the purposes
   of this License, Derivative Works shall not include works that remain
   separable from, or merely link (or bind by name) to the interfaces of,
   the Work and Derivative Works thereof.

   "Contribution" shall mean any work of authorship, including
   the original version of the Work and any modifications or additions
   to that Work or Derivative Works thereof, that is intentionally
   submitted to Licensor for inclusion in the Work by the copyright owner
   or by an individual or Legal Entity authorized to submit on behalf of
   the copyright owner. For the purposes of this definition, "submitted"
   means any form of electronic, verbal, or written communication sent
   to the Licensor or its representatives, including but not limited to
   communication on electronic mailing lists, source code control systems,
   and issue tracking systems that are managed by, or on behalf of, the
   Licensor for the purpose of discussing and improving the Work, but
   excluding communication that is conspicuously marked or otherwise
   designated in writing by the copyright owner as "Not a Contribution."

   "Contributor" shall mean Licensor and any individual or Legal Entity
   on behalf of whom a Contribution has been received by Licensor and
   subsequently incorporated within the Work.

2. Grant of Copyright License. Subject to the terms and conditions of
   this License, each Contributor hereby grants to You a perpetual,
   worldwide, non-exclusive, no-charge, royalty-free, irrevocable
   copyright license to reproduce, prepare Derivative Works of,
   publicly display, publicly perform, sublicense, and distribute the
   Work and such Derivative Works in Source or Object form.

3. Grant of Patent License. Subject to the terms and conditions of
   this License, each Contributor hereby grants to You a perpetual,
   worldwide, non-exclusive, no-charge, royalty-free, irrevocable
   (except as stated in this section) patent license to make, have made,
   use, offer to sell, sell, import, and otherwise transfer the Work,
   where such license applies only to those patent claims licensable
   by such Contributor that are necessarily infringed by their
   Contribution(s) alone or by combination of their Contribution(s)
   with the Work to which such Contribution(s) was submitted. If You
   institute patent litigation against any entity (including a
   cross-claim or counterclaim in a lawsuit) alleging that the Work
   or a Contribution incorporated within the Work constitutes direct
   or contributory patent infringement, then any patent licenses
   granted to You under this License for that Work shall terminate
   as of the date such litigation is filed.

4. Redistribution. You may reproduce and distribute copies of the
   Work or Derivative Works thereof in any medium, with or without
   modifications, and in Source or Object form, provided that You
   meet the following conditions:

   (a) You must give any other recipients of the Work or
       Derivative Works a copy of this License; and

   (b) You must cause any modified files to carry prominent notices
       stating that You changed the files; and

   (c) You must retain, in the Source form of any Derivative Works
       that You distribute, all copyright, patent, trademark, and
       attribution notices from the Source form of the Work,
       excluding those notices that do not pertain to any part of
       the Derivative Works; and

   (d) If the Work includes a "NOTICE" text file as part of its
       distribution, then any Derivative Works that You distribute must
       include a readable copy of the attribution notices contained
       within such NOTICE file, excluding those notices that do not
       pertain to any part of the Derivative Works, in at least one
       of the following places: within a NOTICE text file distributed
       as part of the Derivative Works; within the Source form or
       documentation, if provided along with the Derivative Works; or,
       within a display generated by the Derivative Works, if and
       wherever such third-party notices normally appear. The contents
       of the NOTICE file are for informational purposes only and
       do not modify the License. You may add Your own attribution
       notices within Derivative Works that You distribute, alongside
       or as an addendum to the NOTICE text from the Work, provided
       that such additional attribution notices cannot be construed
       as modifying the License.

   You may add Your own copyright statement to Your modifications and
   may provide additional or different license terms and conditions
   for use, reproduction, or distribution of Your modifications, or
   for any such Derivative Works as a whole, provided Your use,
   reproduction, and distribution of the Work otherwise complies with
   the conditions stated in this License.

5. Submission of Contributions. Unless You explicitly state otherwise,
   any Contribution intentionally submitted for inclusion in the Work
   by You to the Licensor shall be under the terms and conditions of
   this License, without any additional terms or conditions.
   Notwithstanding the above, nothing herein shall supersede or modify
   the terms of any separate license agreement you may have executed
   with Licensor regarding such Contributions.

6. Trademarks. This License does not grant permission to use the trade
   names, trademarks, service marks, or product names of the Licensor,
   except as required for reasonable and customary use in describing the
   origin of the Work and reproducing the content of the NOTICE file.

7. Disclaimer of Warranty. Unless required by applicable law or
   agreed to in writing, Licensor provides the Work (and each
   Contributor provides its Contributions) on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
   implied, including, without limitation, any warranties or conditions
   of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
   PARTICULAR PURPOSE. You are solely responsible for determining the
   appropriateness of using or redistributing the Work and assume any
   risks associated with Your exercise of permissions under this License.

8. Limitation of Liability. In no event and under no legal theory,
   whether in tort (including negligence), contract, or otherwise,
   unless required by applicable law (such as deliberate and grossly
   negligent acts) or agreed to in writing, shall any Contributor be
   liable to You for damages, including any direct, indirect, special,
   incidental, or consequential damages of any character arising as a
   result of this License or out of the use or inability to use the
   Work (including but not limited to damages for loss of goodwill,
   work stoppage, computer failure or malfunction, or any and all
   other commercial damages or losses), even if such Contributor
   has been advised of the possibility of such damages.

9. Accepting Warranty or Additional Liability. While redistributing
   the Work or Derivative Works thereof, You may choose to offer,
   and charge a fee for, acceptance of support, warranty, indemnity,
   or other liability obligations and/or rights consistent with this
   License. However, in accepting such obligations, You may act only
   on Your own behalf and on Your sole responsibility, not on behalf
   of any other Contributor, and only if You agree to indemnify,
   defend, and hold each Contributor harmless for any liability
   incurred by, or claims asserted against, such Contributor by reason
   of your accepting any such warranty or additional liability.

END OF TERMS AND CONDITIONS

APPENDIX: How to apply the Apache License to your work.

   To apply the Apache License to your work, attach the following
   boilerplate notice, with the fields enclosed by brackets "[]"
   replaced with your own identifying information. (Don't include
   the brackets!) The text should be enclosed in the appropriate
   comment syntax for the file format. We also recommend that a
   file or class name and description of purpose be included on the
   same "printed page" as the copyright notice for easier
   identification within third-party archives.

Copyright [yyyy] [name of copyright owner]

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
@@ -1,39 +0,0 @@
# unicode-normalization

[![Build Status](https://travis-ci.org/unicode-rs/unicode-normalization.svg)](https://travis-ci.org/unicode-rs/unicode-normalization)
[![Docs](https://docs.rs/unicode-normalization/badge.svg)](https://docs.rs/unicode-normalization/)

Unicode character composition and decomposition utilities
as described in
[Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/).

This crate requires Rust 1.36+.

```rust
extern crate unicode_normalization;

use unicode_normalization::char::compose;
use unicode_normalization::UnicodeNormalization;

fn main() {
    assert_eq!(compose('A', '\u{30a}'), Some('Å'));

    let s = "ÅΩ";
    let c = s.nfc().collect::<String>();
    assert_eq!(c, "ÅΩ");
}
```

## crates.io

You can use this package in your project by adding the following
to your `Cargo.toml`:

```toml
[dependencies]
unicode-normalization = "0.1.22"
```

## `no_std` + `alloc` support

This crate is completely `no_std` + `alloc` compatible. This can be enabled by disabling the `std` feature, i.e. specifying `default-features = false` for this crate in your `Cargo.toml`.
@@ -1,127 +0,0 @@
#![feature(test)]

extern crate test;
extern crate unicode_normalization;

use std::fs;
use test::Bencher;
use unicode_normalization::UnicodeNormalization;

const ASCII: &'static str = "all types of normalized";
const NFC: &'static str = "Introducci\u{00f3}n a Unicode.pdf";
const NFD: &'static str = "Introduccio\u{0301}n a Unicode.pdf";

#[bench]
fn bench_is_nfc_ascii(b: &mut Bencher) {
    b.iter(|| unicode_normalization::is_nfc(ASCII));
}

#[bench]
fn bench_is_nfc_normalized(b: &mut Bencher) {
    b.iter(|| unicode_normalization::is_nfc(NFC));
}

#[bench]
fn bench_is_nfc_not_normalized(b: &mut Bencher) {
    b.iter(|| unicode_normalization::is_nfc(NFD));
}

#[bench]
fn bench_is_nfd_ascii(b: &mut Bencher) {
    b.iter(|| unicode_normalization::is_nfd(ASCII));
}

#[bench]
fn bench_is_nfd_normalized(b: &mut Bencher) {
    b.iter(|| unicode_normalization::is_nfd(NFD));
}

#[bench]
fn bench_is_nfd_not_normalized(b: &mut Bencher) {
    b.iter(|| unicode_normalization::is_nfd(NFC));
}

#[bench]
fn bench_is_nfc_stream_safe_ascii(b: &mut Bencher) {
    b.iter(|| unicode_normalization::is_nfc_stream_safe(ASCII));
}

#[bench]
fn bench_is_nfc_stream_safe_normalized(b: &mut Bencher) {
    b.iter(|| unicode_normalization::is_nfc_stream_safe(NFC));
}

#[bench]
fn bench_is_nfc_stream_safe_not_normalized(b: &mut Bencher) {
    b.iter(|| unicode_normalization::is_nfc_stream_safe(NFD));
}

#[bench]
fn bench_is_nfd_stream_safe_ascii(b: &mut Bencher) {
    b.iter(|| unicode_normalization::is_nfd_stream_safe(ASCII));
}

#[bench]
fn bench_is_nfd_stream_safe_normalized(b: &mut Bencher) {
    b.iter(|| unicode_normalization::is_nfd_stream_safe(NFD));
}

#[bench]
fn bench_is_nfd_stream_safe_not_normalized(b: &mut Bencher) {
    b.iter(|| unicode_normalization::is_nfd_stream_safe(NFC));
}

#[bench]
fn bench_nfc_ascii(b: &mut Bencher) {
    b.iter(|| ASCII.nfc().count());
}

#[bench]
fn bench_nfd_ascii(b: &mut Bencher) {
    b.iter(|| ASCII.nfd().count());
}

#[bench]
fn bench_nfc_long(b: &mut Bencher) {
    let long = fs::read_to_string("benches/long.txt").unwrap();
    b.iter(|| long.nfc().count());
}

#[bench]
fn bench_nfd_long(b: &mut Bencher) {
    let long = fs::read_to_string("benches/long.txt").unwrap();
    b.iter(|| long.nfd().count());
}

#[bench]
fn bench_nfkc_ascii(b: &mut Bencher) {
    b.iter(|| ASCII.nfkc().count());
}

#[bench]
fn bench_nfkd_ascii(b: &mut Bencher) {
    b.iter(|| ASCII.nfkd().count());
}

#[bench]
fn bench_nfkc_long(b: &mut Bencher) {
    let long = fs::read_to_string("benches/long.txt").unwrap();
    b.iter(|| long.nfkc().count());
}

#[bench]
fn bench_nfkd_long(b: &mut Bencher) {
    let long = fs::read_to_string("benches/long.txt").unwrap();
    b.iter(|| long.nfkd().count());
}

#[bench]
fn bench_streamsafe_ascii(b: &mut Bencher) {
    b.iter(|| ASCII.stream_safe().count());
}

#[bench]
fn bench_streamsafe_adversarial(b: &mut Bencher) {
    let s = "bo\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{032e}oom";
    b.iter(|| s.stream_safe().count());
}
@@ -1,621 +0,0 @@
#!/usr/bin/env python
#
# Copyright 2011-2018 The Rust Project Developers. See the COPYRIGHT
# file at the top-level directory of this distribution and at
# http://rust-lang.org/COPYRIGHT.
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.

# This script uses the following Unicode tables:
# - DerivedNormalizationProps.txt
# - NormalizationTest.txt
# - UnicodeData.txt
# - StandardizedVariants.txt
#
# Since this should not require frequent updates, we just store this
# out-of-line and check the tables.rs and normalization_tests.rs files into git.
import collections
import urllib.request

UNICODE_VERSION = "15.0.0"
UCD_URL = "https://www.unicode.org/Public/%s/ucd/" % UNICODE_VERSION

PREAMBLE = """// Copyright 2012-2018 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly

#![allow(missing_docs)]
"""

NormalizationTest = collections.namedtuple(
    "NormalizationTest",
    ["source", "nfc", "nfd", "nfkc", "nfkd"],
)

# Mapping taken from Table 12 from:
# http://www.unicode.org/reports/tr44/#General_Category_Values
expanded_categories = {
    'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'],
    'Lm': ['L'], 'Lo': ['L'],
    'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'],
    'Nd': ['N'], 'Nl': ['N'], 'No': ['No'],
    'Pc': ['P'], 'Pd': ['P'], 'Ps': ['P'], 'Pe': ['P'],
    'Pi': ['P'], 'Pf': ['P'], 'Po': ['P'],
    'Sm': ['S'], 'Sc': ['S'], 'Sk': ['S'], 'So': ['S'],
    'Zs': ['Z'], 'Zl': ['Z'], 'Zp': ['Z'],
    'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'],
}

# Constants from Unicode 9.0.0 Section 3.12 Conjoining Jamo Behavior
# http://www.unicode.org/versions/Unicode9.0.0/ch03.pdf#M9.32468.Heading.310.Combining.Jamo.Behavior
S_BASE, L_COUNT, V_COUNT, T_COUNT = 0xAC00, 19, 21, 28
S_COUNT = L_COUNT * V_COUNT * T_COUNT

class UnicodeData(object):
    def __init__(self):
        self._load_unicode_data()
        self.norm_props = self._load_norm_props()
        self.norm_tests = self._load_norm_tests()

        self.canon_comp = self._compute_canonical_comp()
        self.canon_fully_decomp, self.compat_fully_decomp = self._compute_fully_decomposed()

        self.cjk_compat_variants_fully_decomp = {}
        self._load_cjk_compat_ideograph_variants()

        def stats(name, table):
            count = sum(len(v) for v in table.values())
            print("%s: %d chars => %d decomposed chars" % (name, len(table), count))

        print("Decomposition table stats:")
        stats("Canonical decomp", self.canon_decomp)
        stats("Compatible decomp", self.compat_decomp)
        stats("Canonical fully decomp", self.canon_fully_decomp)
        stats("Compatible fully decomp", self.compat_fully_decomp)
        stats("CJK Compat Variants fully decomp", self.cjk_compat_variants_fully_decomp)

        self.ss_leading, self.ss_trailing = self._compute_stream_safe_tables()

    def _fetch(self, filename):
        resp = urllib.request.urlopen(UCD_URL + filename)
        return resp.read().decode('utf-8')

    def _load_unicode_data(self):
        self.name_to_char_int = {}
        self.combining_classes = {}
        self.compat_decomp = {}
        self.canon_decomp = {}
        self.general_category_mark = []
        self.general_category_public_assigned = []

        assigned_start = 0
        prev_char_int = -1
        prev_name = ""

        for line in self._fetch("UnicodeData.txt").splitlines():
            # See ftp://ftp.unicode.org/Public/3.0-Update/UnicodeData-3.0.0.html
            pieces = line.split(';')
            assert len(pieces) == 15
            char, name, category, cc, decomp = pieces[0], pieces[1], pieces[2], pieces[3], pieces[5]
            char_int = int(char, 16)

            name = pieces[1].strip()
            self.name_to_char_int[name] = char_int

            if cc != '0':
                self.combining_classes[char_int] = cc

            if decomp.startswith('<'):
                self.compat_decomp[char_int] = [int(c, 16) for c in decomp.split()[1:]]
            elif decomp != '':
                self.canon_decomp[char_int] = [int(c, 16) for c in decomp.split()]

            if category == 'M' or 'M' in expanded_categories.get(category, []):
                self.general_category_mark.append(char_int)

            assert category != 'Cn', "Unexpected: Unassigned codepoint in UnicodeData.txt"
            if category not in ['Co', 'Cs']:
                if char_int != prev_char_int + 1 and not is_first_and_last(prev_name, name):
                    self.general_category_public_assigned.append((assigned_start, prev_char_int))
                    assigned_start = char_int
                prev_char_int = char_int
                prev_name = name

        self.general_category_public_assigned.append((assigned_start, prev_char_int))

    def _load_cjk_compat_ideograph_variants(self):
        for line in self._fetch("StandardizedVariants.txt").splitlines():
            strip_comments = line.split('#', 1)[0].strip()
            if not strip_comments:
                continue

            variation_sequence, description, differences = strip_comments.split(';')
            description = description.strip()

            # Don't use variations that only apply in particular shaping environments.
            if differences:
                continue

            # Look for entries where the description field is a codepoint name.
            if description not in self.name_to_char_int:
                continue

            # Only consider the CJK Compatibility Ideographs.
            if not description.startswith('CJK COMPATIBILITY IDEOGRAPH-'):
                continue

            char_int = self.name_to_char_int[description]

            assert char_int not in self.combining_classes, "Unexpected: CJK compat variant with a combining class"
            assert char_int not in self.compat_decomp, "Unexpected: CJK compat variant and compatibility decomposition"
            assert len(self.canon_decomp[char_int]) == 1, "Unexpected: CJK compat variant and non-singleton canonical decomposition"
            # If we ever need to handle Hangul here, we'll need to handle it separately.
            assert not (S_BASE <= char_int < S_BASE + S_COUNT)

            cjk_compat_variant_parts = [int(c, 16) for c in variation_sequence.split()]
            for c in cjk_compat_variant_parts:
                assert c not in self.canon_decomp, "Unexpected: CJK compat variant is unnormalized (canon)"
                assert c not in self.compat_decomp, "Unexpected: CJK compat variant is unnormalized (compat)"
            self.cjk_compat_variants_fully_decomp[char_int] = cjk_compat_variant_parts

    def _load_norm_props(self):
        props = collections.defaultdict(list)

        for line in self._fetch("DerivedNormalizationProps.txt").splitlines():
            (prop_data, _, _) = line.partition("#")
            prop_pieces = prop_data.split(";")

            if len(prop_pieces) < 2:
                continue

            assert len(prop_pieces) <= 3
            (low, _, high) = prop_pieces[0].strip().partition("..")

            prop = prop_pieces[1].strip()

            data = None
            if len(prop_pieces) == 3:
                data = prop_pieces[2].strip()

            props[prop].append((low, high, data))

        return props

    def _load_norm_tests(self):
        tests = []
        for line in self._fetch("NormalizationTest.txt").splitlines():
            (test_data, _, _) = line.partition("#")
            test_pieces = test_data.split(";")

            if len(test_pieces) < 5:
                continue

            source, nfc, nfd, nfkc, nfkd = [[c.strip() for c in p.split()] for p in test_pieces[:5]]
            tests.append(NormalizationTest(source, nfc, nfd, nfkc, nfkd))

        return tests

    def _compute_canonical_comp(self):
        canon_comp = {}
        comp_exclusions = [
            (int(low, 16), int(high or low, 16))
            for low, high, _ in self.norm_props["Full_Composition_Exclusion"]
        ]
        for char_int, decomp in self.canon_decomp.items():
            if any(lo <= char_int <= hi for lo, hi in comp_exclusions):
                continue

            assert len(decomp) == 2
            assert (decomp[0], decomp[1]) not in canon_comp
            canon_comp[(decomp[0], decomp[1])] = char_int

        return canon_comp

    def _compute_fully_decomposed(self):
        """
        Even though the decomposition algorithm is recursive, it is possible
        to precompute the recursion at table generation time with a modest
        increase to the table size. Then, for these precomputed tables, we
        note that 1) compatible decomposition is a superset of canonical
        decomposition and 2) they mostly agree on their intersection.
        Therefore, we don't store entries in the compatible table for
        characters that decompose the same way under canonical decomposition.

        Decomposition table stats:
        Canonical decomp: 2060 chars => 3085 decomposed chars
        Compatible decomp: 3662 chars => 5440 decomposed chars
        Canonical fully decomp: 2060 chars => 3404 decomposed chars
        Compatible fully decomp: 3678 chars => 5599 decomposed chars

        The upshot is that decomposition code is very simple and easy to inline
        at mild code size cost.
        """
        def _decompose(char_int, compatible):
            # 7-bit ASCII never decomposes
            if char_int <= 0x7f:
                yield char_int
                return

            # Assert that we're handling Hangul separately.
            assert not (S_BASE <= char_int < S_BASE + S_COUNT)

            decomp = self.canon_decomp.get(char_int)
            if decomp is not None:
                for decomposed_ch in decomp:
                    for fully_decomposed_ch in _decompose(decomposed_ch, compatible):
                        yield fully_decomposed_ch
                return

            if compatible and char_int in self.compat_decomp:
                for decomposed_ch in self.compat_decomp[char_int]:
                    for fully_decomposed_ch in _decompose(decomposed_ch, compatible):
                        yield fully_decomposed_ch
                return

            yield char_int
            return

        end_codepoint = max(
            max(self.canon_decomp.keys()),
            max(self.compat_decomp.keys()),
        )

        canon_fully_decomp = {}
        compat_fully_decomp = {}

        for char_int in range(0, end_codepoint + 1):
            # Always skip Hangul, since it's more efficient to represent its
            # decomposition programmatically.
            if S_BASE <= char_int < S_BASE + S_COUNT:
                continue

            canon = list(_decompose(char_int, False))
            if not (len(canon) == 1 and canon[0] == char_int):
                canon_fully_decomp[char_int] = canon

            compat = list(_decompose(char_int, True))
            if not (len(compat) == 1 and compat[0] == char_int):
                compat_fully_decomp[char_int] = compat

        # Since canon_fully_decomp is a subset of compat_fully_decomp, we don't
        # need to store their overlap when they agree. When they don't agree,
        # store the decomposition in the compatibility table since we'll check
        # that first when normalizing to NFKD.
        assert set(canon_fully_decomp) <= set(compat_fully_decomp)

        for ch in set(canon_fully_decomp) & set(compat_fully_decomp):
            if canon_fully_decomp[ch] == compat_fully_decomp[ch]:
                del compat_fully_decomp[ch]

        return canon_fully_decomp, compat_fully_decomp

    def _compute_stream_safe_tables(self):
        """
        To make a text stream-safe with the Stream-Safe Text Process (UAX15-D4),
        we need to be able to know the number of contiguous non-starters *after*
        applying compatibility decomposition to each character.

        We can do this incrementally by computing the number of leading and
        trailing non-starters for each character's compatibility decomposition
        with the following rules:

        1) If a character is not affected by compatibility decomposition, look
           up its canonical combining class to find out if it's a non-starter.
        2) All Hangul characters are starters, even under decomposition.
        3) Otherwise, very few decomposing characters have a nonzero count
           of leading or trailing non-starters, so store these characters
           with their associated counts in a separate table.
        """
        leading_nonstarters = {}
        trailing_nonstarters = {}

        for c in set(self.canon_fully_decomp) | set(self.compat_fully_decomp):
            decomposed = self.compat_fully_decomp.get(c) or self.canon_fully_decomp[c]

            num_leading = 0
            for d in decomposed:
                if d not in self.combining_classes:
                    break
                num_leading += 1

            num_trailing = 0
            for d in reversed(decomposed):
                if d not in self.combining_classes:
                    break
                num_trailing += 1

            if num_leading > 0:
                leading_nonstarters[c] = num_leading
            if num_trailing > 0:
                trailing_nonstarters[c] = num_trailing

        return leading_nonstarters, trailing_nonstarters
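
These generated leading/trailing counts are what the crate's Stream-Safe Text Process consumes at run time. As a rough end-to-end illustration (a sketch, not part of this diff; it assumes unicode-normalization 0.1.x as a dependency), the public `stream_safe()` iterator breaks any run of more than 30 nonstarters by inserting U+034F COMBINING GRAPHEME JOINER:

```rust
extern crate unicode_normalization;

use unicode_normalization::UnicodeNormalization;

fn main() {
    // One starter followed by 40 combining grave accents (all nonstarters),
    // which exceeds the UAX15-D4 limit of 30 contiguous nonstarters.
    let adversarial: String = std::iter::once('o')
        .chain(std::iter::repeat('\u{300}').take(40))
        .collect();
    let safe: String = adversarial.stream_safe().collect();
    // A single CGJ should be inserted after the 30th nonstarter,
    // restarting the count for the remaining 10.
    assert!(safe.contains('\u{34f}'));
    assert_eq!(safe.chars().count(), adversarial.chars().count() + 1);
}
```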
hexify = lambda c: '{:04X}'.format(c)

# Test whether `first` and `last` are corresponding "<..., First>" and
# "<..., Last>" markers.
def is_first_and_last(first, last):
    if not first.startswith('<') or not first.endswith(', First>'):
        return False
    if not last.startswith('<') or not last.endswith(', Last>'):
        return False
    return first[1:-8] == last[1:-7]

def gen_mph_data(name, d, kv_type, kv_callback):
    (salt, keys) = minimal_perfect_hash(d)
    out.write("pub(crate) const %s_SALT: &[u16] = &[\n" % name.upper())
    for s in salt:
        out.write("    0x{:x},\n".format(s))
    out.write("];\n")
    out.write("pub(crate) const {}_KV: &[{}] = &[\n".format(name.upper(), kv_type))
    for k in keys:
        out.write("    {},\n".format(kv_callback(k)))
    out.write("];\n\n")

def gen_combining_class(combining_classes, out):
    gen_mph_data('canonical_combining_class', combining_classes, 'u32',
        lambda k: "0x{:X}".format(int(combining_classes[k]) | (k << 8)))

def gen_composition_table(canon_comp, out):
    table = {}
    for (c1, c2), c3 in canon_comp.items():
        if c1 < 0x10000 and c2 < 0x10000:
            table[(c1 << 16) | c2] = c3
    (salt, keys) = minimal_perfect_hash(table)
    gen_mph_data('COMPOSITION_TABLE', table, '(u32, char)',
        lambda k: "(0x%s, '\\u{%s}')" % (hexify(k), hexify(table[k])))

    out.write("pub(crate) fn composition_table_astral(c1: char, c2: char) -> Option<char> {\n")
    out.write("    match (c1, c2) {\n")
    for (c1, c2), c3 in sorted(canon_comp.items()):
        if c1 >= 0x10000 and c2 >= 0x10000:
            out.write("        ('\\u{%s}', '\\u{%s}') => Some('\\u{%s}'),\n" % (hexify(c1), hexify(c2), hexify(c3)))

    out.write("        _ => None,\n")
    out.write("    }\n")
    out.write("}\n")

def gen_decomposition_tables(canon_decomp, compat_decomp, cjk_compat_variants_decomp, out):
    tables = [(canon_decomp, 'canonical'), (compat_decomp, 'compatibility'), (cjk_compat_variants_decomp, 'cjk_compat_variants')]
    for table, name in tables:
        offsets = {}
        offset = 0
        out.write("pub(crate) const %s_DECOMPOSED_CHARS: &[char] = &[\n" % name.upper())
        for k, v in table.items():
            offsets[k] = offset
            offset += len(v)
            for c in v:
                out.write("    '\\u{%s}',\n" % hexify(c))
        # The largest offset must fit in a u16.
        assert offset < 65536
        out.write("];\n")
        gen_mph_data(name + '_decomposed', table, "(u32, (u16, u16))",
            lambda k: "(0x{:x}, ({}, {}))".format(k, offsets[k], len(table[k])))

def gen_qc_match(prop_table, out):
    out.write("    match c {\n")

    for low, high, data in prop_table:
        assert data in ('N', 'M')
        result = "No" if data == 'N' else "Maybe"
        if high:
            out.write(r"        '\u{%s}'...'\u{%s}' => %s," % (low, high, result))
        else:
            out.write(r"        '\u{%s}' => %s," % (low, result))
        out.write("\n")

    out.write("        _ => Yes,\n")
    out.write("    }\n")

def gen_nfc_qc(prop_tables, out):
    out.write("#[inline]\n")
    out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
    out.write("pub fn qc_nfc(c: char) -> IsNormalized {\n")
    gen_qc_match(prop_tables['NFC_QC'], out)
    out.write("}\n")

def gen_nfkc_qc(prop_tables, out):
    out.write("#[inline]\n")
    out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
    out.write("pub fn qc_nfkc(c: char) -> IsNormalized {\n")
    gen_qc_match(prop_tables['NFKC_QC'], out)
    out.write("}\n")

def gen_nfd_qc(prop_tables, out):
    out.write("#[inline]\n")
    out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
    out.write("pub fn qc_nfd(c: char) -> IsNormalized {\n")
    gen_qc_match(prop_tables['NFD_QC'], out)
    out.write("}\n")

def gen_nfkd_qc(prop_tables, out):
    out.write("#[inline]\n")
    out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
    out.write("pub fn qc_nfkd(c: char) -> IsNormalized {\n")
    gen_qc_match(prop_tables['NFKD_QC'], out)
    out.write("}\n")

def gen_combining_mark(general_category_mark, out):
    gen_mph_data('combining_mark', general_category_mark, 'u32',
        lambda k: '0x{:04x}'.format(k))

def gen_public_assigned(general_category_public_assigned, out):
    # This could be done as a hash but the table is somewhat small.
    out.write("#[inline]\n")
    out.write("pub fn is_public_assigned(c: char) -> bool {\n")
    out.write("    match c {\n")

    start = True
    for first, last in general_category_public_assigned:
        if start:
            out.write("        ")
            start = False
        else:
            out.write("        | ")
        if first == last:
            out.write("'\\u{%s}'\n" % hexify(first))
        else:
            out.write("'\\u{%s}'..='\\u{%s}'\n" % (hexify(first), hexify(last)))
    out.write("        => true,\n")

    out.write("        _ => false,\n")
    out.write("    }\n")
    out.write("}\n")
    out.write("\n")

def gen_stream_safe(leading, trailing, out):
    # This could be done as a hash but the table is very small.
    out.write("#[inline]\n")
    out.write("pub fn stream_safe_leading_nonstarters(c: char) -> usize {\n")
    out.write("    match c {\n")

    for char, num_leading in sorted(leading.items()):
        out.write("        '\\u{%s}' => %d,\n" % (hexify(char), num_leading))

    out.write("        _ => 0,\n")
    out.write("    }\n")
    out.write("}\n")
    out.write("\n")

    gen_mph_data('trailing_nonstarters', trailing, 'u32',
        lambda k: "0x{:X}".format(int(trailing[k]) | (k << 8)))

def gen_tests(tests, out):
    out.write("""#[derive(Debug)]
pub struct NormalizationTest {
    pub source: &'static str,
    pub nfc: &'static str,
    pub nfd: &'static str,
    pub nfkc: &'static str,
    pub nfkd: &'static str,
}

""")

    out.write("pub const NORMALIZATION_TESTS: &[NormalizationTest] = &[\n")
    str_literal = lambda s: '"%s"' % "".join("\\u{%s}" % c for c in s)

    for test in tests:
        out.write("    NormalizationTest {\n")
        out.write("        source: %s,\n" % str_literal(test.source))
        out.write("        nfc: %s,\n" % str_literal(test.nfc))
        out.write("        nfd: %s,\n" % str_literal(test.nfd))
        out.write("        nfkc: %s,\n" % str_literal(test.nfkc))
        out.write("        nfkd: %s,\n" % str_literal(test.nfkd))
        out.write("    },\n")

    out.write("];\n")

# Guaranteed to be less than n.
def my_hash(x, salt, n):
    # This is a hash based on the theory that multiplication is efficient
    mask_32 = 0xffffffff
    y = ((x + salt) * 2654435769) & mask_32
    y ^= (x * 0x31415926) & mask_32
    return (y * n) >> 32

# Compute minimal perfect hash function, d can be either a dict or list of keys.
def minimal_perfect_hash(d):
    n = len(d)
    buckets = dict((h, []) for h in range(n))
    for key in d:
        h = my_hash(key, 0, n)
        buckets[h].append(key)
    bsorted = [(len(buckets[h]), h) for h in range(n)]
    bsorted.sort(reverse=True)
    claimed = [False] * n
    salts = [0] * n
    keys = [0] * n
    for (bucket_size, h) in bsorted:
        # Note: the traditional perfect hashing approach would also special-case
        # bucket_size == 1 here and assign any empty slot, rather than iterating
        # until rehash finds an empty slot. But we're not doing that so we can
        # avoid the branch.
        if bucket_size == 0:
            break
        else:
            for salt in range(1, 32768):
                rehashes = [my_hash(key, salt, n) for key in buckets[h]]
                # Make sure there are no rehash collisions within this bucket.
                if all(not claimed[hash] for hash in rehashes):
                    if len(set(rehashes)) < bucket_size:
                        continue
                    salts[h] = salt
                    for key in buckets[h]:
                        rehash = my_hash(key, salt, n)
                        claimed[rehash] = True
                        keys[rehash] = key
                    break
            if salts[h] == 0:
                print("minimal perfect hashing failed")
                # Note: if this happens (because of unfortunate data), then there are
                # a few things that could be done. First, the hash function could be
                # tweaked. Second, the bucket order could be scrambled (especially the
                # singletons). Right now, the buckets are sorted, which has the advantage
                # of being deterministic.
                #
                # As a more extreme approach, the singleton bucket optimization could be
                # applied (give the direct address for singleton buckets, rather than
                # relying on a rehash). That is definitely the more standard approach in
                # the minimal perfect hashing literature, but in testing the branch was a
                # significant slowdown.
                exit(1)
    return (salts, keys)

if __name__ == '__main__':
    data = UnicodeData()
    with open("tables.rs", "w", newline="\n") as out:
        out.write(PREAMBLE)
        out.write("use crate::quick_check::IsNormalized;\n")
        out.write("use crate::quick_check::IsNormalized::*;\n")
        out.write("\n")

        version = "(%s, %s, %s)" % tuple(UNICODE_VERSION.split("."))
        out.write("#[allow(unused)]\n")
        out.write("pub const UNICODE_VERSION: (u8, u8, u8) = %s;\n\n" % version)

        gen_combining_class(data.combining_classes, out)
        out.write("\n")

        gen_composition_table(data.canon_comp, out)
        out.write("\n")

        gen_decomposition_tables(data.canon_fully_decomp, data.compat_fully_decomp, data.cjk_compat_variants_fully_decomp, out)

        gen_combining_mark(data.general_category_mark, out)
        out.write("\n")

        gen_public_assigned(data.general_category_public_assigned, out)
        out.write("\n")

        gen_nfc_qc(data.norm_props, out)
        out.write("\n")

        gen_nfkc_qc(data.norm_props, out)
        out.write("\n")

        gen_nfd_qc(data.norm_props, out)
        out.write("\n")

        gen_nfkd_qc(data.norm_props, out)
        out.write("\n")

        gen_stream_safe(data.ss_leading, data.ss_trailing, out)
        out.write("\n")

    with open("normalization_tests.rs", "w", newline="\n") as out:
        out.write(PREAMBLE)
        gen_tests(data.norm_tests, out)
@@ -1,18 +0,0 @@
// This crate comprises hacks and glue required to test private functions from tests/
//
// Keep this as slim as possible.
//
// If you're caught using this outside this crate's tests/, you get to clean up the mess.

#[cfg(not(feature = "std"))]
use crate::no_std_prelude::*;

use crate::stream_safe::StreamSafe;

pub fn stream_safe(s: &str) -> String {
    StreamSafe::new(s.chars()).collect()
}

pub mod quick_check {
    pub use crate::quick_check::*;
}
@@ -1,161 +0,0 @@
// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use core::fmt::{self, Write};
use core::iter::Fuse;
use core::ops::Range;
use tinyvec::TinyVec;

#[derive(Clone)]
enum DecompositionType {
    Canonical,
    Compatible,
}

/// External iterator for a string decomposition's characters.
#[derive(Clone)]
pub struct Decompositions<I> {
    kind: DecompositionType,
    iter: Fuse<I>,

    // This buffer stores pairs of (canonical combining class, character),
    // pushed onto the end in text order.
    //
    // It's divided into up to three sections:
    // 1) A prefix that is free space;
    // 2) "Ready" characters which are sorted and ready to emit on demand;
    // 3) A "pending" block which still needs more characters for us to be able
    //    to sort in canonical order and is not safe to emit.
    buffer: TinyVec<[(u8, char); 4]>,
    ready: Range<usize>,
}

#[inline]
pub fn new_canonical<I: Iterator<Item = char>>(iter: I) -> Decompositions<I> {
    Decompositions {
        kind: self::DecompositionType::Canonical,
        iter: iter.fuse(),
        buffer: TinyVec::new(),
        ready: 0..0,
    }
}

#[inline]
pub fn new_compatible<I: Iterator<Item = char>>(iter: I) -> Decompositions<I> {
    Decompositions {
        kind: self::DecompositionType::Compatible,
        iter: iter.fuse(),
        buffer: TinyVec::new(),
        ready: 0..0,
    }
}

impl<I> Decompositions<I> {
    #[inline]
    fn push_back(&mut self, ch: char) {
        let class = super::char::canonical_combining_class(ch);

        if class == 0 {
            self.sort_pending();
            self.buffer.push((class, ch));
            self.ready.end = self.buffer.len();
        } else {
            self.buffer.push((class, ch));
        }
    }

    #[inline]
    fn sort_pending(&mut self) {
        // NB: `sort_by_key` is stable, so it will preserve the original text's
        // order within a combining class.
        self.buffer[self.ready.end..].sort_by_key(|k| k.0);
    }

    #[inline]
    fn reset_buffer(&mut self) {
        // Equivalent to `self.buffer.drain(0..self.ready.end)`
        // but faster than drain() if the buffer is a SmallVec or TinyVec
        let pending = self.buffer.len() - self.ready.end;
        for i in 0..pending {
            self.buffer[i] = self.buffer[i + self.ready.end];
        }
        self.buffer.truncate(pending);
        self.ready = 0..0;
    }

    #[inline]
    fn increment_next_ready(&mut self) {
        let next = self.ready.start + 1;
        if next == self.ready.end {
            self.reset_buffer();
        } else {
            self.ready.start = next;
        }
    }
}

impl<I: Iterator<Item = char>> Iterator for Decompositions<I> {
    type Item = char;

    #[inline]
    fn next(&mut self) -> Option<char> {
        while self.ready.end == 0 {
            match (self.iter.next(), &self.kind) {
                (Some(ch), &DecompositionType::Canonical) => {
                    super::char::decompose_canonical(ch, |d| self.push_back(d));
                }
                (Some(ch), &DecompositionType::Compatible) => {
                    super::char::decompose_compatible(ch, |d| self.push_back(d));
                }
                (None, _) => {
                    if self.buffer.is_empty() {
                        return None;
                    } else {
                        self.sort_pending();
                        self.ready.end = self.buffer.len();

                        // This implementation means that we can call `next`
                        // on an exhausted iterator; the last outer `next` call
                        // will result in an inner `next` call. To make this
                        // safe, we use `fuse`.
                        break;
                    }
                }
            }
        }

        // We can assume here that, if `self.ready.end` is greater than zero,
        // it's also greater than `self.ready.start`. That's because we only
        // increment `self.ready.start` inside `increment_next_ready`, and
        // whenever it reaches equality with `self.ready.end`, we reset both
        // to zero, maintaining the invariant that:
        //     self.ready.start < self.ready.end || self.ready.end == self.ready.start == 0
        //
        // This less-than-obviously-safe implementation is chosen for performance,
        // minimizing the number & complexity of branches in `next` in the common
        // case of buffering then unbuffering a single character with each call.
        let (_, ch) = self.buffer[self.ready.start];
        self.increment_next_ready();
        Some(ch)
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        let (lower, _) = self.iter.size_hint();
        (lower, None)
    }
}

impl<I: Iterator<Item = char> + Clone> fmt::Display for Decompositions<I> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        for c in self.clone() {
            f.write_char(c)?;
        }
        Ok(())
    }
}
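
The buffered `sort_pending` logic above is what guarantees canonical ordering of combining marks in the output. A minimal sketch of the observable behavior (illustrative only; assumes unicode-normalization 0.1.x as a dependency):

```rust
extern crate unicode_normalization;

use unicode_normalization::UnicodeNormalization;

fn main() {
    // NFD splits a precomposed character into base + combining mark...
    let nfd: String = "é".nfd().collect();
    assert_eq!(nfd, "e\u{301}");

    // ...and reorders marks by canonical combining class: COMBINING DOT
    // BELOW (ccc 220) is moved before COMBINING ACUTE ACCENT (ccc 230).
    let reordered: String = "e\u{301}\u{323}".nfd().collect();
    assert_eq!(reordered, "e\u{323}\u{301}");
}
```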
@@ -1,235 +0,0 @@
// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

//! Unicode character composition and decomposition utilities
//! as described in
//! [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/).
//!
//! ```rust
//! extern crate unicode_normalization;
//!
//! use unicode_normalization::char::compose;
//! use unicode_normalization::UnicodeNormalization;
//!
//! fn main() {
//!     assert_eq!(compose('A', '\u{30a}'), Some('Å'));
//!
//!     let s = "ÅΩ";
//!     let c = s.nfc().collect::<String>();
//!     assert_eq!(c, "ÅΩ");
//! }
//! ```
//!
//! # crates.io
//!
//! You can use this package in your project by adding the following
//! to your `Cargo.toml`:
//!
//! ```toml
//! [dependencies]
//! unicode-normalization = "0.1.20"
//! ```

#![deny(missing_docs, unsafe_code)]
#![doc(
    html_logo_url = "https://unicode-rs.github.io/unicode-rs_sm.png",
    html_favicon_url = "https://unicode-rs.github.io/unicode-rs_sm.png"
)]
#![cfg_attr(not(feature = "std"), no_std)]

#[cfg(not(feature = "std"))]
extern crate alloc;

#[cfg(feature = "std")]
extern crate core;

extern crate tinyvec;

pub use crate::decompose::Decompositions;
pub use crate::quick_check::{
    is_nfc, is_nfc_quick, is_nfc_stream_safe, is_nfc_stream_safe_quick, is_nfd, is_nfd_quick,
    is_nfd_stream_safe, is_nfd_stream_safe_quick, is_nfkc, is_nfkc_quick, is_nfkd, is_nfkd_quick,
    IsNormalized,
};
pub use crate::recompose::Recompositions;
pub use crate::replace::Replacements;
pub use crate::stream_safe::StreamSafe;
pub use crate::tables::UNICODE_VERSION;
use core::{option, str::Chars};

mod no_std_prelude;

mod decompose;
mod lookups;
mod normalize;
mod perfect_hash;
mod quick_check;
mod recompose;
mod replace;
mod stream_safe;

#[rustfmt::skip]
mod tables;

#[doc(hidden)]
pub mod __test_api;
#[cfg(test)]
mod test;

/// Methods for composing and decomposing characters.
pub mod char {
    pub use crate::normalize::{
        compose, decompose_canonical, decompose_cjk_compat_variants, decompose_compatible,
    };

    pub use crate::lookups::{canonical_combining_class, is_combining_mark};

    /// Return whether the given character is assigned (`General_Category` != `Unassigned`)
    /// and not Private-Use (`General_Category` != `Private_Use`), in the supported version
    /// of Unicode.
    pub use crate::tables::is_public_assigned;
}

/// Methods for iterating over strings while applying Unicode normalizations
/// as described in
/// [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/).
pub trait UnicodeNormalization<I: Iterator<Item = char>> {
    /// Returns an iterator over the string in Unicode Normalization Form D
    /// (canonical decomposition).
    fn nfd(self) -> Decompositions<I>;

    /// Returns an iterator over the string in Unicode Normalization Form KD
    /// (compatibility decomposition).
    fn nfkd(self) -> Decompositions<I>;

    /// Returns an iterator over the string in Unicode Normalization Form C
    /// (canonical decomposition followed by canonical composition).
    fn nfc(self) -> Recompositions<I>;

    /// Returns an iterator over the string in Unicode Normalization Form KC
    /// (compatibility decomposition followed by canonical composition).
    fn nfkc(self) -> Recompositions<I>;

    /// A transformation which replaces CJK Compatibility Ideograph codepoints
    /// with normal forms using Standardized Variation Sequences. This is not
    /// part of the canonical or compatibility decomposition algorithms, but
    /// performing it before those algorithms produces normalized output which
    /// better preserves the intent of the original text.
    ///
    /// Note that many systems today ignore variation selectors, so these
    /// may not immediately help text display as intended, but they at
    /// least preserve the information in a standardized form, giving
    /// implementations the option to recognize them.
    fn cjk_compat_variants(self) -> Replacements<I>;

    /// Returns an iterator over the string with Conjoining Grapheme Joiner
    /// characters inserted according to the Stream-Safe Text Process (UAX15-D4).
    fn stream_safe(self) -> StreamSafe<I>;
}

impl<'a> UnicodeNormalization<Chars<'a>> for &'a str {
    #[inline]
    fn nfd(self) -> Decompositions<Chars<'a>> {
        decompose::new_canonical(self.chars())
    }

    #[inline]
    fn nfkd(self) -> Decompositions<Chars<'a>> {
        decompose::new_compatible(self.chars())
    }

    #[inline]
    fn nfc(self) -> Recompositions<Chars<'a>> {
        recompose::new_canonical(self.chars())
    }

    #[inline]
    fn nfkc(self) -> Recompositions<Chars<'a>> {
        recompose::new_compatible(self.chars())
    }

    #[inline]
    fn cjk_compat_variants(self) -> Replacements<Chars<'a>> {
        replace::new_cjk_compat_variants(self.chars())
    }

    #[inline]
    fn stream_safe(self) -> StreamSafe<Chars<'a>> {
        StreamSafe::new(self.chars())
    }
}

impl UnicodeNormalization<option::IntoIter<char>> for char {
    #[inline]
    fn nfd(self) -> Decompositions<option::IntoIter<char>> {
        decompose::new_canonical(Some(self).into_iter())
    }

    #[inline]
    fn nfkd(self) -> Decompositions<option::IntoIter<char>> {
        decompose::new_compatible(Some(self).into_iter())
    }

    #[inline]
    fn nfc(self) -> Recompositions<option::IntoIter<char>> {
        recompose::new_canonical(Some(self).into_iter())
    }

    #[inline]
    fn nfkc(self) -> Recompositions<option::IntoIter<char>> {
        recompose::new_compatible(Some(self).into_iter())
    }

    #[inline]
    fn cjk_compat_variants(self) -> Replacements<option::IntoIter<char>> {
        replace::new_cjk_compat_variants(Some(self).into_iter())
    }

    #[inline]
    fn stream_safe(self) -> StreamSafe<option::IntoIter<char>> {
        StreamSafe::new(Some(self).into_iter())
    }
}

impl<I: Iterator<Item = char>> UnicodeNormalization<I> for I {
    #[inline]
    fn nfd(self) -> Decompositions<I> {
        decompose::new_canonical(self)
    }

    #[inline]
    fn nfkd(self) -> Decompositions<I> {
        decompose::new_compatible(self)
    }

    #[inline]
    fn nfc(self) -> Recompositions<I> {
        recompose::new_canonical(self)
    }

    #[inline]
    fn nfkc(self) -> Recompositions<I> {
        recompose::new_compatible(self)
    }

    #[inline]
    fn cjk_compat_variants(self) -> Replacements<I> {
        replace::new_cjk_compat_variants(self)
    }

    #[inline]
    fn stream_safe(self) -> StreamSafe<I> {
        StreamSafe::new(self)
    }
}
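
Because the trait is implemented for `&str`, `char`, and any `char` iterator, the same methods compose freely. A small usage sketch (illustrative, assuming unicode-normalization 0.1.x as a dependency):

```rust
extern crate unicode_normalization;

use unicode_normalization::UnicodeNormalization;

fn main() {
    // On &str: NFKC folds the U+FB01 ligature to plain "fi".
    assert_eq!("ﬁ".nfkc().collect::<String>(), "fi");

    // On char: NFD of a single scalar value.
    assert_eq!('é'.nfd().collect::<String>(), "e\u{301}");

    // On an arbitrary char iterator, e.g. two chained strings:
    let joined: String = "A\u{30a}".chars().chain("Ω".chars()).nfc().collect();
    assert_eq!(joined, "ÅΩ");
}
```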
@@ -1,138 +0,0 @@
// Copyright 2019 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

//! Lookups of Unicode properties using minimal perfect hashing.

use crate::perfect_hash::mph_lookup;
use crate::tables::*;

/// Look up the canonical combining class for a codepoint.
///
/// The value returned is as defined in the Unicode Character Database.
pub fn canonical_combining_class(c: char) -> u8 {
    mph_lookup(
        c.into(),
        CANONICAL_COMBINING_CLASS_SALT,
        CANONICAL_COMBINING_CLASS_KV,
        u8_lookup_fk,
        u8_lookup_fv,
        0,
    )
}

pub(crate) fn composition_table(c1: char, c2: char) -> Option<char> {
    if c1 < '\u{10000}' && c2 < '\u{10000}' {
        mph_lookup(
            (c1 as u32) << 16 | (c2 as u32),
            COMPOSITION_TABLE_SALT,
            COMPOSITION_TABLE_KV,
            pair_lookup_fk,
            pair_lookup_fv_opt,
            None,
        )
    } else {
        composition_table_astral(c1, c2)
    }
}

pub(crate) fn canonical_fully_decomposed(c: char) -> Option<&'static [char]> {
    mph_lookup(
        c.into(),
        CANONICAL_DECOMPOSED_SALT,
        CANONICAL_DECOMPOSED_KV,
        pair_lookup_fk,
        pair_lookup_fv_opt,
        None,
    )
    .map(|(start, len)| &CANONICAL_DECOMPOSED_CHARS[start as usize..][..len as usize])
}

pub(crate) fn compatibility_fully_decomposed(c: char) -> Option<&'static [char]> {
    mph_lookup(
        c.into(),
        COMPATIBILITY_DECOMPOSED_SALT,
        COMPATIBILITY_DECOMPOSED_KV,
        pair_lookup_fk,
        pair_lookup_fv_opt,
        None,
    )
    .map(|(start, len)| &COMPATIBILITY_DECOMPOSED_CHARS[start as usize..][..len as usize])
}

pub(crate) fn cjk_compat_variants_fully_decomposed(c: char) -> Option<&'static [char]> {
    mph_lookup(
        c.into(),
        CJK_COMPAT_VARIANTS_DECOMPOSED_SALT,
        CJK_COMPAT_VARIANTS_DECOMPOSED_KV,
        pair_lookup_fk,
        pair_lookup_fv_opt,
        None,
    )
    .map(|(start, len)| &CJK_COMPAT_VARIANTS_DECOMPOSED_CHARS[start as usize..][..len as usize])
}

/// Return whether the given character is a combining mark (`General_Category=Mark`)
pub fn is_combining_mark(c: char) -> bool {
    mph_lookup(
        c.into(),
        COMBINING_MARK_SALT,
        COMBINING_MARK_KV,
        bool_lookup_fk,
        bool_lookup_fv,
        false,
    )
}

pub fn stream_safe_trailing_nonstarters(c: char) -> usize {
    mph_lookup(
        c.into(),
        TRAILING_NONSTARTERS_SALT,
        TRAILING_NONSTARTERS_KV,
        u8_lookup_fk,
        u8_lookup_fv,
        0,
    ) as usize
}

/// Extract the key in a 24 bit key and 8 bit value packed in a u32.
#[inline]
fn u8_lookup_fk(kv: u32) -> u32 {
    kv >> 8
}

/// Extract the value in a 24 bit key and 8 bit value packed in a u32.
#[inline]
fn u8_lookup_fv(kv: u32) -> u8 {
    (kv & 0xff) as u8
}

/// Extract the key for a boolean lookup.
#[inline]
fn bool_lookup_fk(kv: u32) -> u32 {
    kv
}

/// Extract the value for a boolean lookup.
#[inline]
fn bool_lookup_fv(_kv: u32) -> bool {
    true
}

/// Extract the key in a pair.
#[inline]
fn pair_lookup_fk<T>(kv: (u32, T)) -> u32 {
    kv.0
}

/// Extract the value in a pair, returning an option.
#[inline]
fn pair_lookup_fv_opt<T>(kv: (u32, T)) -> Option<T> {
    Some(kv.1)
}
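
The public lookups above are re-exported through the crate's `char` module; a quick sketch of what the combining-class query returns (illustrative, assuming the 0.1.x API):

```rust
extern crate unicode_normalization;

use unicode_normalization::char::{canonical_combining_class, is_combining_mark};

fn main() {
    assert_eq!(canonical_combining_class('a'), 0); // starters have class 0
    assert_eq!(canonical_combining_class('\u{301}'), 230); // COMBINING ACUTE ACCENT
    assert_eq!(canonical_combining_class('\u{323}'), 220); // COMBINING DOT BELOW
    assert!(is_combining_mark('\u{301}'));
    assert!(!is_combining_mark('a'));
}
```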
@@ -1,6 +0,0 @@
#[cfg(not(feature = "std"))]
pub use alloc::{
    str::Chars,
    string::{String, ToString},
    vec::Vec,
};
@@ -1,201 +0,0 @@
// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

//! Functions for computing canonical and compatible decompositions for Unicode characters.
use crate::lookups::{
    canonical_fully_decomposed, cjk_compat_variants_fully_decomposed,
    compatibility_fully_decomposed, composition_table,
};

use core::{char, ops::FnMut};

/// Compute canonical Unicode decomposition for character.
/// See [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/)
/// for more information.
#[inline]
pub fn decompose_canonical<F>(c: char, emit_char: F)
where
    F: FnMut(char),
{
    decompose(c, canonical_fully_decomposed, emit_char)
}

/// Compute canonical or compatible Unicode decomposition for character.
/// See [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/)
/// for more information.
#[inline]
pub fn decompose_compatible<F: FnMut(char)>(c: char, emit_char: F) {
    let decompose_char =
        |c| compatibility_fully_decomposed(c).or_else(|| canonical_fully_decomposed(c));
    decompose(c, decompose_char, emit_char)
}

/// Compute standard-variation decomposition for character.
///
/// [Standardized Variation Sequences] are used instead of the standard canonical
/// decompositions, notably for CJK codepoints with singleton canonical decompositions,
/// to avoid losing information. See the
/// [Unicode Variation Sequence FAQ](http://unicode.org/faq/vs.html) and the
/// "Other Enhancements" section of the
/// [Unicode 6.3 Release Summary](https://www.unicode.org/versions/Unicode6.3.0/#Summary)
/// for more information.
#[inline]
pub fn decompose_cjk_compat_variants<F>(c: char, mut emit_char: F)
where
    F: FnMut(char),
{
    // 7-bit ASCII never decomposes
    if c <= '\x7f' {
        emit_char(c);
        return;
    }

    // Don't perform decomposition for Hangul

    if let Some(decomposed) = cjk_compat_variants_fully_decomposed(c) {
        for &d in decomposed {
            emit_char(d);
        }
        return;
    }

    // Finally bottom out.
    emit_char(c);
}

#[inline]
fn decompose<D, F>(c: char, decompose_char: D, mut emit_char: F)
where
    D: Fn(char) -> Option<&'static [char]>,
    F: FnMut(char),
{
    // 7-bit ASCII never decomposes
    if c <= '\x7f' {
        emit_char(c);
        return;
    }

    // Perform decomposition for Hangul
    if is_hangul_syllable(c) {
        decompose_hangul(c, emit_char);
        return;
    }

    if let Some(decomposed) = decompose_char(c) {
        for &d in decomposed {
            emit_char(d);
        }
        return;
    }

    // Finally bottom out.
    emit_char(c);
}

/// Compose two characters into a single character, if possible.
/// See [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/)
/// for more information.
pub fn compose(a: char, b: char) -> Option<char> {
    compose_hangul(a, b).or_else(|| composition_table(a, b))
}

// Constants from Unicode 9.0.0 Section 3.12 Conjoining Jamo Behavior
// http://www.unicode.org/versions/Unicode9.0.0/ch03.pdf#M9.32468.Heading.310.Combining.Jamo.Behavior
const S_BASE: u32 = 0xAC00;
const L_BASE: u32 = 0x1100;
const V_BASE: u32 = 0x1161;
const T_BASE: u32 = 0x11A7;
const L_COUNT: u32 = 19;
const V_COUNT: u32 = 21;
const T_COUNT: u32 = 28;
const N_COUNT: u32 = V_COUNT * T_COUNT;
const S_COUNT: u32 = L_COUNT * N_COUNT;

const S_LAST: u32 = S_BASE + S_COUNT - 1;
const L_LAST: u32 = L_BASE + L_COUNT - 1;
const V_LAST: u32 = V_BASE + V_COUNT - 1;
const T_LAST: u32 = T_BASE + T_COUNT - 1;

// Composition only occurs for `TPart`s in `U+11A8 ... U+11C2`,
// i.e. `T_BASE + 1 ... T_LAST`.
const T_FIRST: u32 = T_BASE + 1;

pub(crate) fn is_hangul_syllable(c: char) -> bool {
    (c as u32) >= S_BASE && (c as u32) < (S_BASE + S_COUNT)
}

// Decompose a precomposed Hangul syllable
#[allow(unsafe_code)]
#[inline(always)]
fn decompose_hangul<F>(s: char, mut emit_char: F)
where
    F: FnMut(char),
{
    let s_index = s as u32 - S_BASE;
    let l_index = s_index / N_COUNT;
    unsafe {
        emit_char(char::from_u32_unchecked(L_BASE + l_index));

        let v_index = (s_index % N_COUNT) / T_COUNT;
        emit_char(char::from_u32_unchecked(V_BASE + v_index));

        let t_index = s_index % T_COUNT;
        if t_index > 0 {
            emit_char(char::from_u32_unchecked(T_BASE + t_index));
        }
    }
}

#[inline]
pub(crate) fn hangul_decomposition_length(s: char) -> usize {
    let si = s as u32 - S_BASE;
    let ti = si % T_COUNT;
    if ti > 0 {
        3
    } else {
        2
    }
}

// Compose a pair of Hangul Jamo
#[allow(unsafe_code)]
#[inline(always)]
#[allow(ellipsis_inclusive_range_patterns)]
fn compose_hangul(a: char, b: char) -> Option<char> {
    let (a, b) = (a as u32, b as u32);
    match (a, b) {
        // Compose a leading consonant and a vowel together into an LV_Syllable
        (L_BASE...L_LAST, V_BASE...V_LAST) => {
            let l_index = a - L_BASE;
            let v_index = b - V_BASE;
            let lv_index = l_index * N_COUNT + v_index * T_COUNT;
            let s = S_BASE + lv_index;
            Some(unsafe { char::from_u32_unchecked(s) })
        }
        // Compose an LV_Syllable and a trailing consonant into an LVT_Syllable
        (S_BASE...S_LAST, T_FIRST...T_LAST) if (a - S_BASE) % T_COUNT == 0 => {
            Some(unsafe { char::from_u32_unchecked(a + (b - T_BASE)) })
        }
        _ => None,
    }
}

#[cfg(test)]
mod tests {
    use super::compose_hangul;

    // Regression test from a bugfix where we were composing an LV_Syllable with
    // T_BASE directly. (We should only compose an LV_Syllable with a character
    // in the range `T_BASE + 1 ... T_LAST`.)
    #[test]
    fn test_hangul_composition() {
        assert_eq!(compose_hangul('\u{c8e0}', '\u{11a7}'), None);
    }
}
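
As a worked example of the Conjoining Jamo arithmetic above (a sketch mirroring `decompose_hangul`/`compose_hangul`, using the same constants): the syllable 한 (U+D55C) is built from ᄒ (U+1112), ᅡ (U+1161), and ᆫ (U+11AB).

```rust
// Standalone arithmetic check; the constants match those defined above.
const S_BASE: u32 = 0xAC00;
const L_BASE: u32 = 0x1100;
const V_BASE: u32 = 0x1161;
const T_BASE: u32 = 0x11A7;
const V_COUNT: u32 = 21;
const T_COUNT: u32 = 28;

fn main() {
    let l_index = 0x1112 - L_BASE; // ᄒ => 18
    let v_index = 0x1161 - V_BASE; // ᅡ => 0
    let t_index = 0x11AB - T_BASE; // ᆫ => 4
    let s = S_BASE + (l_index * V_COUNT + v_index) * T_COUNT + t_index;
    assert_eq!(std::char::from_u32(s), Some('한')); // U+D55C
}
```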
|
|
@ -1,50 +0,0 @@
// Copyright 2019 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

//! Support for lookups based on minimal perfect hashing.

// This function is based on multiplication being fast and is "good enough". Also
// it can share some work between the unsalted and salted versions.
#[inline]
fn my_hash(key: u32, salt: u32, n: usize) -> usize {
    let y = key.wrapping_add(salt).wrapping_mul(2654435769);
    let y = y ^ key.wrapping_mul(0x31415926);
    (((y as u64) * (n as u64)) >> 32) as usize
}

/// Do a lookup using minimal perfect hashing.
///
/// The table is stored as a sequence of "salt" values, then a sequence of
/// values that contain packed key/value pairs. The strategy is to hash twice.
/// The first hash retrieves a salt value that makes the second hash unique.
/// The hash function doesn't have to be very good, just good enough that the
/// resulting map is unique.
#[inline]
pub(crate) fn mph_lookup<KV, V, FK, FV>(
    x: u32,
    salt: &[u16],
    kv: &[KV],
    fk: FK,
    fv: FV,
    default: V,
) -> V
where
    KV: Copy,
    FK: Fn(KV) -> u32,
    FV: Fn(KV) -> V,
{
    let s = salt[my_hash(x, 0, salt.len())] as u32;
    let key_val = kv[my_hash(x, s, salt.len())];
    if x == fk(key_val) {
        fv(key_val)
    } else {
        default
    }
}
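The salt tables consumed by `mph_lookup` are generated offline. As a rough, hypothetical illustration of how such a table can be built, here is a brute-force "hash and displace" construction that pairs with the lookup above; every name in it is ours, not the crate's, and `my_hash` is copied verbatim so both sides agree:

fn my_hash(key: u32, salt: u32, n: usize) -> usize {
    let y = key.wrapping_add(salt).wrapping_mul(2654435769);
    let y = y ^ key.wrapping_mul(0x31415926);
    (((y as u64) * (n as u64)) >> 32) as usize
}

// Returns (salt table, key table). The salt search is brute force and can in
// principle fail for adversarial key sets; fine for a sketch.
fn build(keys: &[u32]) -> (Vec<u16>, Vec<u32>) {
    let n = keys.len();
    let mut buckets: Vec<Vec<u32>> = vec![Vec::new(); n];
    for &k in keys {
        buckets[my_hash(k, 0, n)].push(k); // first-level hash, salt 0
    }
    // Place the fullest buckets first.
    let mut order: Vec<usize> = (0..n).collect();
    order.sort_by_key(|&b| std::cmp::Reverse(buckets[b].len()));
    let (mut salts, mut slots) = (vec![0u16; n], vec![None; n]);
    for &b in &order {
        if buckets[b].is_empty() {
            continue;
        }
        for salt in 1..=u16::MAX {
            let idx: Vec<usize> = buckets[b]
                .iter()
                .map(|&k| my_hash(k, salt as u32, n))
                .collect();
            let mut sorted = idx.clone();
            sorted.sort_unstable();
            sorted.dedup();
            // The salt works if the target slots are distinct and unoccupied.
            if sorted.len() == idx.len() && idx.iter().all(|&i| slots[i].is_none()) {
                for (&k, &i) in buckets[b].iter().zip(&idx) {
                    slots[i] = Some(k);
                }
                salts[b] = salt;
                break;
            }
        }
    }
    (salts, slots.into_iter().map(|s| s.expect("salt search failed")).collect())
}

fn main() {
    let keys = [0x301, 0x308, 0x34F, 0xAC00, 0x2FA1D];
    let (salts, kv) = build(&keys);
    // Same two-step lookup as `mph_lookup`, with identity key/value functions.
    for &k in &keys {
        let s = salts[my_hash(k, 0, salts.len())] as u32;
        assert_eq!(kv[my_hash(k, s, salts.len())], k);
    }
}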
@ -1,187 +0,0 @@
use crate::lookups::canonical_combining_class;
use crate::stream_safe;
use crate::tables;
use crate::UnicodeNormalization;

/// The QuickCheck algorithm can quickly determine if a text is or isn't
/// normalized without any allocations in many cases, but it has to be able to
/// return `Maybe` when a full decomposition and recomposition is necessary.
#[derive(Debug, Eq, PartialEq)]
pub enum IsNormalized {
    /// The text is definitely normalized.
    Yes,
    /// The text is definitely not normalized.
    No,
    /// The text may be normalized.
    Maybe,
}

// https://unicode.org/reports/tr15/#Detecting_Normalization_Forms
#[inline]
fn quick_check<F, I>(s: I, is_allowed: F, stream_safe: bool) -> IsNormalized
where
    I: Iterator<Item = char>,
    F: Fn(char) -> IsNormalized,
{
    let mut last_cc = 0u8;
    let mut nonstarter_count = 0;
    let mut result = IsNormalized::Yes;
    for ch in s {
        // For ASCII we know it's always allowed and a starter
        if ch <= '\x7f' {
            last_cc = 0;
            nonstarter_count = 0;
            continue;
        }

        // Otherwise, lookup the combining class and QC property
        let cc = canonical_combining_class(ch);
        if last_cc > cc && cc != 0 {
            return IsNormalized::No;
        }
        match is_allowed(ch) {
            IsNormalized::Yes => (),
            IsNormalized::No => return IsNormalized::No,
            IsNormalized::Maybe => {
                result = IsNormalized::Maybe;
            }
        }
        if stream_safe {
            let decomp = stream_safe::classify_nonstarters(ch);

            // If we're above `MAX_NONSTARTERS`, we're definitely *not*
            // stream-safe normalized.
            if nonstarter_count + decomp.leading_nonstarters > stream_safe::MAX_NONSTARTERS {
                return IsNormalized::No;
            }
            if decomp.leading_nonstarters == decomp.decomposition_len {
                nonstarter_count += decomp.decomposition_len;
            } else {
                nonstarter_count = decomp.trailing_nonstarters;
            }
        }
        last_cc = cc;
    }
    result
}

/// Quickly check if a string is in NFC, potentially returning
/// `IsNormalized::Maybe` if further checks are necessary. In this case a check
/// like `s.chars().nfc().eq(s.chars())` should suffice.
#[inline]
pub fn is_nfc_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized {
    quick_check(s, tables::qc_nfc, false)
}

/// Quickly check if a string is in NFKC.
#[inline]
pub fn is_nfkc_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized {
    quick_check(s, tables::qc_nfkc, false)
}

/// Quickly check if a string is in NFD.
#[inline]
pub fn is_nfd_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized {
    quick_check(s, tables::qc_nfd, false)
}

/// Quickly check if a string is in NFKD.
#[inline]
pub fn is_nfkd_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized {
    quick_check(s, tables::qc_nfkd, false)
}

/// Quickly check if a string is Stream-Safe NFC.
#[inline]
pub fn is_nfc_stream_safe_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized {
    quick_check(s, tables::qc_nfc, true)
}

/// Quickly check if a string is Stream-Safe NFD.
#[inline]
pub fn is_nfd_stream_safe_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized {
    quick_check(s, tables::qc_nfd, true)
}

/// Authoritatively check if a string is in NFC.
#[inline]
pub fn is_nfc(s: &str) -> bool {
    match is_nfc_quick(s.chars()) {
        IsNormalized::Yes => true,
        IsNormalized::No => false,
        IsNormalized::Maybe => s.chars().eq(s.chars().nfc()),
    }
}

/// Authoritatively check if a string is in NFKC.
#[inline]
pub fn is_nfkc(s: &str) -> bool {
    match is_nfkc_quick(s.chars()) {
        IsNormalized::Yes => true,
        IsNormalized::No => false,
        IsNormalized::Maybe => s.chars().eq(s.chars().nfkc()),
    }
}

/// Authoritatively check if a string is in NFD.
#[inline]
pub fn is_nfd(s: &str) -> bool {
    match is_nfd_quick(s.chars()) {
        IsNormalized::Yes => true,
        IsNormalized::No => false,
        IsNormalized::Maybe => s.chars().eq(s.chars().nfd()),
    }
}

/// Authoritatively check if a string is in NFKD.
#[inline]
pub fn is_nfkd(s: &str) -> bool {
    match is_nfkd_quick(s.chars()) {
        IsNormalized::Yes => true,
        IsNormalized::No => false,
        IsNormalized::Maybe => s.chars().eq(s.chars().nfkd()),
    }
}

/// Authoritatively check if a string is Stream-Safe NFC.
#[inline]
pub fn is_nfc_stream_safe(s: &str) -> bool {
    match is_nfc_stream_safe_quick(s.chars()) {
        IsNormalized::Yes => true,
        IsNormalized::No => false,
        IsNormalized::Maybe => s.chars().eq(s.chars().stream_safe().nfc()),
    }
}

/// Authoritatively check if a string is Stream-Safe NFD.
#[inline]
pub fn is_nfd_stream_safe(s: &str) -> bool {
    match is_nfd_stream_safe_quick(s.chars()) {
        IsNormalized::Yes => true,
        IsNormalized::No => false,
        IsNormalized::Maybe => s.chars().eq(s.chars().stream_safe().nfd()),
    }
}

#[cfg(test)]
mod tests {
    use super::{is_nfc_stream_safe_quick, is_nfd_stream_safe_quick, IsNormalized};

    #[test]
    fn test_stream_safe_nfd() {
        let okay = "Da\u{031b}\u{0316}\u{0317}\u{0318}\u{0319}\u{031c}\u{031d}\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{031a}ngerzone";
        assert_eq!(is_nfd_stream_safe_quick(okay.chars()), IsNormalized::Yes);

        let too_much = "Da\u{031b}\u{0316}\u{0317}\u{0318}\u{0319}\u{031c}\u{031d}\u{031e}\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{031a}ngerzone";
        assert_eq!(is_nfd_stream_safe_quick(too_much.chars()), IsNormalized::No);
    }

    #[test]
    fn test_stream_safe_nfc() {
        let okay = "ok\u{e0}\u{031b}\u{0316}\u{0317}\u{0318}\u{0319}\u{031c}\u{031d}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{031a}y";
        assert_eq!(is_nfc_stream_safe_quick(okay.chars()), IsNormalized::Maybe);

        let too_much = "not ok\u{e0}\u{031b}\u{0316}\u{0317}\u{0318}\u{0319}\u{031c}\u{031d}\u{031e}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{031a}y";
        assert_eq!(is_nfc_stream_safe_quick(too_much.chars()), IsNormalized::No);
    }
}
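These checks are re-exported at the crate root of unicode-normalization, the crate this removed file belongs to. A small usage sketch of the quick check and its authoritative fallback:

use unicode_normalization::{is_nfc, is_nfc_quick, IsNormalized, UnicodeNormalization};

fn main() {
    assert!(is_nfc("caf\u{e9}")); // precomposed é: quick check answers Yes
    assert!(!is_nfc("cafe\u{301}")); // 'e' + combining acute composes, so not NFC
    // The quick check alone cannot prove the negative here: U+0301 is
    // NFC_QC=Maybe, so the authoritative answer needs the full comparison.
    let s = "e\u{301}";
    assert_eq!(is_nfc_quick(s.chars()), IsNormalized::Maybe);
    assert!(!s.chars().eq(s.chars().nfc()));
}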
@ -1,154 +0,0 @@
// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use crate::decompose::Decompositions;
use core::fmt::{self, Write};
use tinyvec::TinyVec;

#[derive(Clone)]
enum RecompositionState {
    Composing,
    Purging(usize),
    Finished(usize),
}

/// External iterator for a string recomposition's characters.
#[derive(Clone)]
pub struct Recompositions<I> {
    iter: Decompositions<I>,
    state: RecompositionState,
    buffer: TinyVec<[char; 4]>,
    composee: Option<char>,
    last_ccc: Option<u8>,
}

#[inline]
pub fn new_canonical<I: Iterator<Item = char>>(iter: I) -> Recompositions<I> {
    Recompositions {
        iter: super::decompose::new_canonical(iter),
        state: self::RecompositionState::Composing,
        buffer: TinyVec::new(),
        composee: None,
        last_ccc: None,
    }
}

#[inline]
pub fn new_compatible<I: Iterator<Item = char>>(iter: I) -> Recompositions<I> {
    Recompositions {
        iter: super::decompose::new_compatible(iter),
        state: self::RecompositionState::Composing,
        buffer: TinyVec::new(),
        composee: None,
        last_ccc: None,
    }
}

impl<I: Iterator<Item = char>> Iterator for Recompositions<I> {
    type Item = char;

    #[inline]
    fn next(&mut self) -> Option<char> {
        use self::RecompositionState::*;

        loop {
            match self.state {
                Composing => {
                    for ch in self.iter.by_ref() {
                        let ch_class = super::char::canonical_combining_class(ch);
                        let k = match self.composee {
                            None => {
                                if ch_class != 0 {
                                    return Some(ch);
                                }
                                self.composee = Some(ch);
                                continue;
                            }
                            Some(k) => k,
                        };
                        match self.last_ccc {
                            None => match super::char::compose(k, ch) {
                                Some(r) => {
                                    self.composee = Some(r);
                                    continue;
                                }
                                None => {
                                    if ch_class == 0 {
                                        self.composee = Some(ch);
                                        return Some(k);
                                    }
                                    self.buffer.push(ch);
                                    self.last_ccc = Some(ch_class);
                                }
                            },
                            Some(l_class) => {
                                if l_class >= ch_class {
                                    // `ch` is blocked from `composee`
                                    if ch_class == 0 {
                                        self.composee = Some(ch);
                                        self.last_ccc = None;
                                        self.state = Purging(0);
                                        return Some(k);
                                    }
                                    self.buffer.push(ch);
                                    self.last_ccc = Some(ch_class);
                                    continue;
                                }
                                match super::char::compose(k, ch) {
                                    Some(r) => {
                                        self.composee = Some(r);
                                        continue;
                                    }
                                    None => {
                                        self.buffer.push(ch);
                                        self.last_ccc = Some(ch_class);
                                    }
                                }
                            }
                        }
                    }
                    self.state = Finished(0);
                    if self.composee.is_some() {
                        return self.composee.take();
                    }
                }
                Purging(next) => match self.buffer.get(next).cloned() {
                    None => {
                        self.buffer.clear();
                        self.state = Composing;
                    }
                    s => {
                        self.state = Purging(next + 1);
                        return s;
                    }
                },
                Finished(next) => match self.buffer.get(next).cloned() {
                    None => {
                        self.buffer.clear();
                        return self.composee.take();
                    }
                    s => {
                        self.state = Finished(next + 1);
                        return s;
                    }
                },
            }
        }
    }
}

impl<I: Iterator<Item = char> + Clone> fmt::Display for Recompositions<I> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        for c in self.clone() {
            f.write_char(c)?;
        }
        Ok(())
    }
}
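This state machine backs the crate's public `.nfc()`/`.nfkc()` adapters. A short sketch of the observable composing and buffering behaviour, using only the public API:

use unicode_normalization::UnicodeNormalization;

fn main() {
    // A starter and a following U+0301 recompose to the precomposed char.
    assert_eq!("a\u{301}".nfc().collect::<String>(), "\u{e1}");
    // After 'a' + U+0300 compose to U+00E0, there is no precomposed
    // "à + acute", so the remaining mark is buffered and emitted as-is.
    assert_eq!("a\u{300}\u{301}".nfc().collect::<String>(), "\u{e0}\u{301}");
}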
@ -1,61 +0,0 @@
// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use core::fmt::{self, Write};
use tinyvec::ArrayVec;

/// External iterator for replacements for a string's characters.
#[derive(Clone)]
pub struct Replacements<I> {
    iter: I,
    // At this time, the longest replacement sequence has length 2, so we just
    // need buffer space for 1 codepoint.
    buffer: Option<char>,
}

#[inline]
pub fn new_cjk_compat_variants<I: Iterator<Item = char>>(iter: I) -> Replacements<I> {
    Replacements { iter, buffer: None }
}

impl<I: Iterator<Item = char>> Iterator for Replacements<I> {
    type Item = char;

    #[inline]
    fn next(&mut self) -> Option<char> {
        if let Some(c) = self.buffer.take() {
            return Some(c);
        }

        match self.iter.next() {
            Some(ch) => {
                // At this time, the longest replacement sequence has length 2.
                let mut buffer = ArrayVec::<[char; 2]>::new();
                super::char::decompose_cjk_compat_variants(ch, |d| buffer.push(d));
                self.buffer = buffer.get(1).copied();
                Some(buffer[0])
            }
            None => None,
        }
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        let (lower, _) = self.iter.size_hint();
        (lower, None)
    }
}

impl<I: Iterator<Item = char> + Clone> fmt::Display for Replacements<I> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        for c in self.clone() {
            f.write_char(c)?;
        }
        Ok(())
    }
}
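This iterator backs the crate's public `cjk_compat_variants()` adapter. A behavioural sketch; the expected mapping of U+FA10 to U+585A plus a variation selector is our reading of the generated tables, and the exact selector is deliberately left unasserted:

use unicode_normalization::UnicodeNormalization;

fn main() {
    // Characters without a standardized variation sequence pass through.
    let unchanged: String = "abc".chars().cjk_compat_variants().collect();
    assert_eq!(unchanged, "abc");
    // A CJK Compatibility Ideograph becomes its unified ideograph plus a
    // variation selector: one char in, two chars out (hence the 1-slot buffer).
    let replaced: String = "\u{fa10}".chars().cjk_compat_variants().collect();
    assert_eq!(replaced.chars().count(), 2);
    assert_eq!(replaced.chars().next(), Some('\u{585a}'));
}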
@ -1,170 +0,0 @@
use crate::lookups::{
    canonical_combining_class, canonical_fully_decomposed, compatibility_fully_decomposed,
    stream_safe_trailing_nonstarters,
};
use crate::normalize::{hangul_decomposition_length, is_hangul_syllable};
use crate::tables::stream_safe_leading_nonstarters;

pub(crate) const MAX_NONSTARTERS: usize = 30;
const COMBINING_GRAPHEME_JOINER: char = '\u{034F}';

/// UAX15-D4: This iterator keeps track of how many non-starters there have been
/// since the last starter in *NFKD* and will emit a Combining Grapheme Joiner
/// (U+034F) if the count exceeds 30.
pub struct StreamSafe<I> {
    iter: I,
    nonstarter_count: usize,
    buffer: Option<char>,
}

impl<I> StreamSafe<I> {
    pub(crate) fn new(iter: I) -> Self {
        Self {
            iter,
            nonstarter_count: 0,
            buffer: None,
        }
    }
}

impl<I: Iterator<Item = char>> Iterator for StreamSafe<I> {
    type Item = char;

    #[inline]
    fn next(&mut self) -> Option<char> {
        let next_ch = match self.buffer.take().or_else(|| self.iter.next()) {
            None => return None,
            Some(c) => c,
        };
        let d = classify_nonstarters(next_ch);
        if self.nonstarter_count + d.leading_nonstarters > MAX_NONSTARTERS {
            // Since we're emitting a CGJ, the suffix of the emitted string in NFKD has no trailing
            // nonstarters, so we can reset the counter to zero. Put `next_ch` back into the
            // iterator (via `self.buffer`), and we'll reclassify it next iteration.
            self.nonstarter_count = 0;
            self.buffer = Some(next_ch);
            return Some(COMBINING_GRAPHEME_JOINER);
        }

        // Is the character all nonstarters in NFKD? If so, increment our counter of contiguous
        // nonstarters in NKFD.
        if d.leading_nonstarters == d.decomposition_len {
            self.nonstarter_count += d.decomposition_len;
        }
        // Otherwise, reset the counter to the decomposition's number of trailing nonstarters.
        else {
            self.nonstarter_count = d.trailing_nonstarters;
        }
        Some(next_ch)
    }
}

#[derive(Debug)]
pub(crate) struct Decomposition {
    pub(crate) leading_nonstarters: usize,
    pub(crate) trailing_nonstarters: usize,
    pub(crate) decomposition_len: usize,
}

#[inline]
pub(crate) fn classify_nonstarters(c: char) -> Decomposition {
    // As usual, fast path for ASCII (which is always a starter)
    if c <= '\x7f' {
        return Decomposition {
            leading_nonstarters: 0,
            trailing_nonstarters: 0,
            decomposition_len: 1,
        };
    }
    // Next, special case Hangul, since it's not handled by our tables.
    if is_hangul_syllable(c) {
        return Decomposition {
            leading_nonstarters: 0,
            trailing_nonstarters: 0,
            decomposition_len: hangul_decomposition_length(c),
        };
    }
    let decomp = compatibility_fully_decomposed(c).or_else(|| canonical_fully_decomposed(c));
    match decomp {
        Some(decomp) => Decomposition {
            leading_nonstarters: stream_safe_leading_nonstarters(c),
            trailing_nonstarters: stream_safe_trailing_nonstarters(c),
            decomposition_len: decomp.len(),
        },
        None => {
            let is_nonstarter = canonical_combining_class(c) != 0;
            let nonstarter = if is_nonstarter { 1 } else { 0 };
            Decomposition {
                leading_nonstarters: nonstarter,
                trailing_nonstarters: nonstarter,
                decomposition_len: 1,
            }
        }
    }
}

#[cfg(test)]
mod tests {
    use super::{classify_nonstarters, StreamSafe};
    use crate::lookups::canonical_combining_class;
    use crate::normalize::decompose_compatible;

    #[cfg(not(feature = "std"))]
    use crate::no_std_prelude::*;

    use core::char;

    fn stream_safe(s: &str) -> String {
        StreamSafe::new(s.chars()).collect()
    }

    #[test]
    fn test_simple() {
        let technically_okay = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}ngerzone";
        assert_eq!(stream_safe(technically_okay), technically_okay);

        let too_much = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{032e}ngerzone";
        let fixed_it = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{034f}\u{032e}ngerzone";
        assert_eq!(stream_safe(too_much), fixed_it);

        let woah_nelly = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{032e}\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{032e}ngerzone";
        let its_cool = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{034f}\u{032e}\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{034f}\u{031d}\u{032e}ngerzone";
        assert_eq!(stream_safe(woah_nelly), its_cool);
    }

    #[test]
    fn test_all_nonstarters() {
        let s = "\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}";
        let expected = "\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{034F}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}";
        assert_eq!(stream_safe(s), expected);
    }

    #[test]
    fn test_classify_nonstarters() {
        // Highest character in the `compat_fully_decomp` table is 2FA1D
        for ch in 0..0x2FA1E {
            let ch = match char::from_u32(ch) {
                Some(c) => c,
                None => continue,
            };
            let c = classify_nonstarters(ch);
            let mut s = Vec::new();
            decompose_compatible(ch, |c| s.push(c));

            assert_eq!(s.len(), c.decomposition_len);

            let num_leading = s
                .iter()
                .take_while(|&c| canonical_combining_class(*c) != 0)
                .count();
            let num_trailing = s
                .iter()
                .rev()
                .take_while(|&c| canonical_combining_class(*c) != 0)
                .count();

            assert_eq!(num_leading, c.leading_nonstarters);
            assert_eq!(num_trailing, c.trailing_nonstarters);
        }
    }
}
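`StreamSafe` backs the crate's public `stream_safe()` adapter. A usage sketch matching the `test_all_nonstarters` case above:

use unicode_normalization::UnicodeNormalization;

fn main() {
    // 40 combining grave accents: the 31st non-starter would exceed the
    // budget of 30, so a CGJ (U+034F) is inserted in front of it.
    let s: String = std::iter::repeat('\u{300}').take(40).collect();
    let safe: String = s.chars().stream_safe().collect();
    assert_eq!(safe.chars().count(), 41);
    assert_eq!(safe.chars().nth(30), Some('\u{34f}'));
}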
File diff not shown because it is too large
@ -1,130 +0,0 @@
// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use super::char::is_combining_mark;
use super::UnicodeNormalization;
use core::char;

#[cfg(not(feature = "std"))]
use crate::no_std_prelude::*;

#[test]
fn test_nfd() {
    macro_rules! t {
        ($input: expr, $expected: expr) => {
            assert_eq!($input.nfd().to_string(), $expected);
            // A dummy iterator that is not std::str::Chars directly;
            // note that `id_func` is used to ensure `Clone` implementation
            assert_eq!(
                $input.chars().map(|c| c).nfd().collect::<String>(),
                $expected
            );
        };
    }
    t!("abc", "abc");
    t!("\u{1e0b}\u{1c4}", "d\u{307}\u{1c4}");
    t!("\u{2026}", "\u{2026}");
    t!("\u{2126}", "\u{3a9}");
    t!("\u{1e0b}\u{323}", "d\u{323}\u{307}");
    t!("\u{1e0d}\u{307}", "d\u{323}\u{307}");
    t!("a\u{301}", "a\u{301}");
    t!("\u{301}a", "\u{301}a");
    t!("\u{d4db}", "\u{1111}\u{1171}\u{11b6}");
    t!("\u{ac1c}", "\u{1100}\u{1162}");
}

#[test]
fn test_nfkd() {
    macro_rules! t {
        ($input: expr, $expected: expr) => {
            assert_eq!($input.nfkd().to_string(), $expected);
        };
    }
    t!("abc", "abc");
    t!("\u{1e0b}\u{1c4}", "d\u{307}DZ\u{30c}");
    t!("\u{2026}", "...");
    t!("\u{2126}", "\u{3a9}");
    t!("\u{1e0b}\u{323}", "d\u{323}\u{307}");
    t!("\u{1e0d}\u{307}", "d\u{323}\u{307}");
    t!("a\u{301}", "a\u{301}");
    t!("\u{301}a", "\u{301}a");
    t!("\u{d4db}", "\u{1111}\u{1171}\u{11b6}");
    t!("\u{ac1c}", "\u{1100}\u{1162}");
}

#[test]
fn test_nfc() {
    macro_rules! t {
        ($input: expr, $expected: expr) => {
            assert_eq!($input.nfc().to_string(), $expected);
        };
    }
    t!("abc", "abc");
    t!("\u{1e0b}\u{1c4}", "\u{1e0b}\u{1c4}");
    t!("\u{2026}", "\u{2026}");
    t!("\u{2126}", "\u{3a9}");
    t!("\u{1e0b}\u{323}", "\u{1e0d}\u{307}");
    t!("\u{1e0d}\u{307}", "\u{1e0d}\u{307}");
    t!("a\u{301}", "\u{e1}");
    t!("\u{301}a", "\u{301}a");
    t!("\u{d4db}", "\u{d4db}");
    t!("\u{ac1c}", "\u{ac1c}");
    t!(
        "a\u{300}\u{305}\u{315}\u{5ae}b",
        "\u{e0}\u{5ae}\u{305}\u{315}b"
    );
}

#[test]
fn test_nfkc() {
    macro_rules! t {
        ($input: expr, $expected: expr) => {
            assert_eq!($input.nfkc().to_string(), $expected);
        };
    }
    t!("abc", "abc");
    t!("\u{1e0b}\u{1c4}", "\u{1e0b}D\u{17d}");
    t!("\u{2026}", "...");
    t!("\u{2126}", "\u{3a9}");
    t!("\u{1e0b}\u{323}", "\u{1e0d}\u{307}");
    t!("\u{1e0d}\u{307}", "\u{1e0d}\u{307}");
    t!("a\u{301}", "\u{e1}");
    t!("\u{301}a", "\u{301}a");
    t!("\u{d4db}", "\u{d4db}");
    t!("\u{ac1c}", "\u{ac1c}");
    t!(
        "a\u{300}\u{305}\u{315}\u{5ae}b",
        "\u{e0}\u{5ae}\u{305}\u{315}b"
    );
}

#[test]
fn test_normalize_char() {
    assert_eq!('\u{2126}'.nfd().to_string(), "\u{3a9}")
}

#[test]
fn test_is_combining_mark_ascii() {
    for cp in 0..0x7f {
        assert!(!is_combining_mark(char::from_u32(cp).unwrap()));
    }
}

#[test]
fn test_is_combining_mark_misc() {
    // https://github.com/unicode-rs/unicode-normalization/issues/16
    // U+11C3A BHAIKSUKI VOWEL SIGN O
    // Category: Mark, Nonspacing [Mn]
    assert!(is_combining_mark('\u{11C3A}'));

    // U+11C3F BHAIKSUKI SIGN VIRAMA
    // Category: Mark, Nonspacing [Mn]
    assert!(is_combining_mark('\u{11C3F}'));
}
@ -1 +1 @@
{"files":{"Cargo.toml":"33e77fef5d9e5592daeff71b551f983f19ddd9d31a7c002e642a3c40d8b08123","LICENSE-APACHE":"a60eea817514531668d7e00765731449fe14d059d3249e0bc93b36de45f759f2","LICENSE-MIT":"76e972ac0f4ddb116e86e10100132a783931a596e7b9872eaa31be15cd4d751d","README.md":"71b01ec6f2f4ce47235ee430ba0c41afac563403a9dbcda23a584c3e915395ac","src/host.rs":"5e25476aaec0153b64d35b53940a72a1ec58e29a0e1fde36944f52eeb945c5f6","src/lib.rs":"e017abe9c33881a96b5daafeee65b1814b4418f5fb0c96d0aaea65a14c9292c9","src/origin.rs":"19a4b451e8615bfef7239d2fc719c489398fe5044edb0df7c84b54eef4ceba1b","src/parser.rs":"5427cd15caedc8e3c1418cc576a7263e96df26a51ad3ce88f8c32d3eb7d6dd2c","src/path_segments.rs":"29db87b6902da4ab1ae925b3874afdeff42b8ddfb46356af6a83b86f34e03b14","src/quirks.rs":"c9311e3dd6f701fb4b8e438b3e3960ff6f8c78a67ae763f3640b178f15c60e45","src/slicing.rs":"3b1aaad36ba7e89f50c90d1ceddda1f8ba52a364c153541ac5c9ce54dacb6724","tests/expected_failures.txt":"f222a5e2f7bdfbd724cf7fb8e35e71a0fe1f3ac9c2771919d7ff5ba9e51c5769","tests/setters_tests.json":"a3a4cbd7b798bc2c4d9656dc50be7397a5a5ed1f0b52daa1da1ad654d38c1dcd","tests/unit.rs":"1abe0a410c5078e1ad9de8c93f2f2ae660ddb47b7efaac9047e952457b068ded","tests/urltestdata.json":"58d67bea710d5f46324fe6841df5fd82090fe4ec2d882bc0fc7c1784d4771884","tests/wpt.rs":"6302c008cde6e7c0df8626701cc825731363722d02e35804bb370c385b455145"},"package":"31e6302e3bb753d46e83516cae55ae196fc0c309407cf11ab35cc51a4c2a4633"}
{"files":{"Cargo.toml":"4108358208f628a0e61af3ebe88aedbe585983c518a456644df398012781f136","LICENSE-APACHE":"a60eea817514531668d7e00765731449fe14d059d3249e0bc93b36de45f759f2","LICENSE-MIT":"76e972ac0f4ddb116e86e10100132a783931a596e7b9872eaa31be15cd4d751d","README.md":"6111161470aa4d5f2e3806936df0c874b8eca5f8c8cd2d71a60eb6c2cbb776ab","src/host.rs":"9de249e8af8fcd0caf673b37a66ba070dfa1b231ee06a981526a9f863c3acf13","src/lib.rs":"4b7ec6a4f2ee7a63ac332f4609c4f9f648861e7ea0967b80efdf27c52a07f154","src/origin.rs":"19a4b451e8615bfef7239d2fc719c489398fe5044edb0df7c84b54eef4ceba1b","src/parser.rs":"ca317fdf927628351991c73437aa91d36e26637574e6551200125e32f46e60cd","src/path_segments.rs":"29db87b6902da4ab1ae925b3874afdeff42b8ddfb46356af6a83b86f34e03b14","src/quirks.rs":"79818bd168b138e8edd30011033c1f6defb847fe96f8a57381cf9251c27e866b","src/slicing.rs":"3b1aaad36ba7e89f50c90d1ceddda1f8ba52a364c153541ac5c9ce54dacb6724","tests/expected_failures.txt":"fc4f619316f1fb117b01d8089c04b925b8db0652f46b8534a87e115c5544881b","tests/setters_tests.json":"a3a4cbd7b798bc2c4d9656dc50be7397a5a5ed1f0b52daa1da1ad654d38c1dcd","tests/unit.rs":"c895675581e737ad8e1536786f80385df0426495074ee6cc011830f45f16f6f7","tests/urltestdata.json":"58d67bea710d5f46324fe6841df5fd82090fe4ec2d882bc0fc7c1784d4771884","tests/wpt.rs":"8781251116a9de8169327ed40a0237ac6ff2f84e3d579d6fb6d7353362f9a48a"},"package":"f7c25da092f0a868cdf09e8674cd3b7ef3a7d92a24253e663a2fb85e2496de56"}
@ -11,9 +11,9 @@

[package]
edition = "2018"
rust-version = "1.56"
rust-version = "1.67"
name = "url"
version = "2.5.0"
version = "2.5.1"
authors = ["The rust-url developers"]
include = [
    "src/**/*",

@ -57,7 +57,7 @@ harness = false
version = "1.2.1"

[dependencies.idna]
version = "0.5.0"
version = "1.0.0"

[dependencies.percent-encoding]
version = "2.3.1"

@ -81,3 +81,6 @@ version = "1.0"
debugger_visualizer = []
default = []
expose_internals = []

[target."cfg(all(target_arch = \"wasm32\", target_os = \"unknown\"))".dev-dependencies.wasm-bindgen-test]
version = "0.3"
@ -9,6 +9,6 @@ rust-url

URL library for Rust, based on the [URL Standard](https://url.spec.whatwg.org/).

[Documentation](https://docs.rs/url/)
[Documentation](https://docs.rs/url)

Please see [UPGRADING.md](https://github.com/servo/rust-url/blob/master/UPGRADING.md) if you are upgrading from a previous version.
Please see [UPGRADING.md](https://github.com/servo/rust-url/blob/main/UPGRADING.md) if you are upgrading from a previous version.
@ -6,6 +6,7 @@
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use std::borrow::Cow;
use std::cmp;
use std::fmt::{self, Formatter};
use std::net::{Ipv4Addr, Ipv6Addr};

@ -81,7 +82,7 @@ impl Host<String> {
            }
            return parse_ipv6addr(&input[1..input.len() - 1]).map(Host::Ipv6);
        }
        let domain = percent_decode(input.as_bytes()).decode_utf8_lossy();
        let domain: Cow<'_, [u8]> = percent_decode(input.as_bytes()).into();

        let domain = Self::domain_to_ascii(&domain)?;

@ -89,35 +90,11 @@ impl Host<String> {
            return Err(ParseError::EmptyHost);
        }

        let is_invalid_domain_char = |c| {
            matches!(
                c,
                '\0'..='\u{001F}'
                    | ' '
                    | '#'
                    | '%'
                    | '/'
                    | ':'
                    | '<'
                    | '>'
                    | '?'
                    | '@'
                    | '['
                    | '\\'
                    | ']'
                    | '^'
                    | '\u{007F}'
                    | '|'
            )
        };

        if domain.find(is_invalid_domain_char).is_some() {
            Err(ParseError::InvalidDomainCharacter)
        } else if ends_in_a_number(&domain) {
        if ends_in_a_number(&domain) {
            let address = parse_ipv4addr(&domain)?;
            Ok(Host::Ipv4(address))
        } else {
            Ok(Host::Domain(domain))
            Ok(Host::Domain(domain.to_string()))
        }
    }

@ -162,8 +139,8 @@ impl Host<String> {
    }

    /// convert domain with idna
    fn domain_to_ascii(domain: &str) -> Result<String, ParseError> {
        idna::domain_to_ascii(domain).map_err(Into::into)
    fn domain_to_ascii(domain: &[u8]) -> Result<Cow<'_, str>, ParseError> {
        idna::domain_to_ascii_cow(domain, idna::AsciiDenyList::URL).map_err(Into::into)
    }
}
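The new idna 1.0 entry point folds the forbidden-host-code-point check (the removed `is_invalid_domain_char` closure) into `AsciiDenyList::URL`, and can borrow the input instead of allocating. A usage sketch; the exact Punycode label is deliberately left unasserted:

use std::borrow::Cow;

fn main() -> Result<(), idna::Errors> {
    // Non-ASCII labels come back Punycode-encoded.
    let ascii = idna::domain_to_ascii_cow("exämple.com".as_bytes(), idna::AsciiDenyList::URL)?;
    assert!(ascii.starts_with("xn--"));
    // Well-formed ASCII input is borrowed rather than reallocated.
    let borrowed = idna::domain_to_ascii_cow(b"example.com", idna::AsciiDenyList::URL)?;
    assert!(matches!(borrowed, Cow::Borrowed("example.com")));
    Ok(())
}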
@ -134,7 +134,7 @@ url = { version = "2", features = ["debugger_visualizer"] }

*/

#![doc(html_root_url = "https://docs.rs/url/2.5.0")]
#![doc(html_root_url = "https://docs.rs/url/2.5.1")]
#![cfg_attr(
    feature = "debugger_visualizer",
    debugger_visualizer(natvis_file = "../../debug_metadata/url.natvis")

@ -146,15 +146,20 @@ pub use form_urlencoded;
extern crate serde;

use crate::host::HostInternal;
use crate::parser::{to_u32, Context, Parser, SchemeType, PATH_SEGMENT, USERINFO};
use crate::parser::{
    to_u32, Context, Parser, SchemeType, PATH_SEGMENT, SPECIAL_PATH_SEGMENT, USERINFO,
};
use percent_encoding::{percent_decode, percent_encode, utf8_percent_encode};
use std::borrow::Borrow;
use std::cmp;
use std::fmt::{self, Write};
use std::hash;
#[cfg(any(unix, windows, target_os = "redox", target_os = "wasi"))]
use std::io;
use std::mem;
use std::net::{IpAddr, SocketAddr, ToSocketAddrs};
use std::net::IpAddr;
#[cfg(any(unix, windows, target_os = "redox", target_os = "wasi"))]
use std::net::{SocketAddr, ToSocketAddrs};
use std::ops::{Range, RangeFrom, RangeTo};
use std::path::{Path, PathBuf};
use std::str;

@ -214,6 +219,9 @@ pub struct ParseOptions<'a> {

impl<'a> ParseOptions<'a> {
    /// Change the base URL
    ///
    /// See the notes of [`Url::join`] for more details about how this base is considered
    /// when parsing.
    pub fn base_url(mut self, new: Option<&'a Url>) -> Self {
        self.base_url = new;
        self

@ -365,9 +373,14 @@ impl Url {
    ///
    /// The inverse of this is [`make_relative`].
    ///
    /// Note: a trailing slash is significant.
    /// # Notes
    ///
    /// - A trailing slash is significant.
    /// Without it, the last path component is considered to be a “file” name
    /// to be removed to get at the “directory” that is used as the base:
    /// to be removed to get at the “directory” that is used as the base.
    /// - A [scheme relative special URL](https://url.spec.whatwg.org/#scheme-relative-special-url-string)
    /// as input replaces everything in the base URL after the scheme.
    /// - An absolute URL (with a scheme) as input replaces the whole base URL (even the scheme).
    ///
    /// # Examples
    ///

@ -375,14 +388,27 @@ impl Url {
    /// use url::Url;
    /// # use url::ParseError;
    ///
    /// // Base without a trailing slash
    /// # fn run() -> Result<(), ParseError> {
    /// let base = Url::parse("https://example.net/a/b.html")?;
    /// let url = base.join("c.png")?;
    /// assert_eq!(url.as_str(), "https://example.net/a/c.png"); // Not /a/b.html/c.png
    ///
    /// // Base with a trailing slash
    /// let base = Url::parse("https://example.net/a/b/")?;
    /// let url = base.join("c.png")?;
    /// assert_eq!(url.as_str(), "https://example.net/a/b/c.png");
    ///
    /// // Input as scheme relative special URL
    /// let base = Url::parse("https://alice.com/a")?;
    /// let url = base.join("//eve.com/b")?;
    /// assert_eq!(url.as_str(), "https://eve.com/b");
    ///
    /// // Input as absolute URL
    /// let base = Url::parse("https://alice.com/a")?;
    /// let url = base.join("http://eve.com/b")?;
    /// assert_eq!(url.as_str(), "http://eve.com/b"); // http instead of https

    /// # Ok(())
    /// # }
    /// # run().unwrap();

@ -1250,6 +1276,7 @@ impl Url {
    /// })
    /// }
    /// ```
    #[cfg(any(unix, windows, target_os = "redox", target_os = "wasi"))]
    pub fn socket_addrs(
        &self,
        default_port_number: impl Fn() -> Option<u16>,

@ -1524,7 +1551,8 @@ impl Url {
        }
    }

    /// Change this URL’s query string.
    /// Change this URL’s query string. If `query` is `None`, this URL's
    /// query string will be cleared.
    ///
    /// # Examples
    ///

@ -2816,7 +2844,7 @@ fn path_to_file_url_segments(
        serialization.push('/');
        serialization.extend(percent_encode(
            component.as_os_str().as_bytes(),
            PATH_SEGMENT,
            SPECIAL_PATH_SEGMENT,
        ));
    }
    if empty {
@ -94,15 +94,18 @@ impl From<::idna::Errors> for ParseError {
}

macro_rules! syntax_violation_enum {
    ($($name: ident => $description: expr,)+) => {
    ($($name: ident => $description: literal,)+) => {
        /// Non-fatal syntax violations that can occur during parsing.
        ///
        /// This may be extended in the future so exhaustive matching is
        /// discouraged with an unused variant.
        /// forbidden.
        #[derive(PartialEq, Eq, Clone, Copy, Debug)]
        #[non_exhaustive]
        pub enum SyntaxViolation {
            $(
                /// ```text
                #[doc = $description]
                /// ```
                $name,
            )+
        }
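With `#[non_exhaustive]`, downstream crates must keep a wildcard arm when matching `SyntaxViolation`. A sketch following the callback pattern from the `ParseOptions` docs:

use std::cell::RefCell;
use url::{SyntaxViolation, Url};

fn main() {
    let violations = RefCell::new(Vec::new());
    let url = Url::options()
        .syntax_violation_callback(Some(&|v| violations.borrow_mut().push(v)))
        .parse("https:////example.com/path")
        .unwrap();
    assert_eq!(url.as_str(), "https://example.com/path");
    for v in violations.into_inner() {
        match v {
            SyntaxViolation::ExpectedDoubleSlash => println!("extra slashes after scheme"),
            // `#[non_exhaustive]` forces this arm outside the url crate.
            other => println!("other violation: {}", other.description()),
        }
    }
}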
Some files were not shown because too many files changed in this diff