Bug 1847521 - Replace unicode-segmentation with ICU4X in WebDriver. r=webdriver-reviewers,supply-chain-reviewers,jgraham

Differential Revision: https://phabricator.services.mozilla.com/D198132
This commit is contained in:
Makoto Kato 2024-01-18 02:29:19 +00:00
Parent 34342fa8e7
Commit 33707d378d
25 changed files with 17 additions and 11651 deletions
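For orientation, a minimal sketch of the API shift this commit performs, assuming icu_segmenter 1.4 with the compiled_data feature (which bakes the segmentation data into the binary, so GraphemeClusterSegmenter::new() needs no external data provider). Where unicode-segmentation's graphemes(true) yields the cluster substrings directly, icu_segmenter's segment_str yields byte-index boundaries, beginning at 0 and ending at the string's length:

```rust
use icu_segmenter::GraphemeClusterSegmenter;

fn main() {
    let s = "a̐éö̲";
    // segment_str reports grapheme-cluster boundaries as byte indices,
    // always beginning with 0 and ending with s.len().
    let breakpoints: Vec<usize> = GraphemeClusterSegmenter::new()
        .segment_str(s)
        .collect();
    // Adjacent boundary pairs delimit the clusters themselves.
    let clusters: Vec<&str> = breakpoints
        .windows(2)
        .map(|w| &s[w[0]..w[1]])
        .collect();
    assert_eq!(clusters, ["a̐", "é", "ö̲"]);
}
```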

Cargo.lock (generated)
View file

@@ -2054,6 +2054,7 @@ dependencies = [
"chrono",
"clap",
"hyper",
"icu_segmenter",
"lazy_static",
"log",
"marionette",
@@ -2069,7 +2070,6 @@ dependencies = [
"serde_yaml",
"tempfile",
"thiserror",
"unicode-segmentation",
"url",
"uuid",
"webdriver",
@@ -5810,12 +5810,6 @@ dependencies = [
"tinyvec",
]
-[[package]]
-name = "unicode-segmentation"
-version = "1.10.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0fdbf052a0783de01e944a6ce7a8cb939e295b1e7be835a1112c3b9a7f047a5a"
[[package]]
name = "unicode-width"
version = "0.1.10"
@@ -6229,6 +6223,7 @@ dependencies = [
"bytes",
"cookie",
"http",
"icu_segmenter",
"log",
"serde",
"serde_derive",
@@ -6237,7 +6232,6 @@ dependencies = [
"time 0.3.23",
"tokio",
"tokio-stream",
"unicode-segmentation",
"url",
"warp",
]

View file

@@ -710,13 +710,6 @@ user-id = 3618
user-login = "dtolnay"
user-name = "David Tolnay"
-[[publisher.unicode-segmentation]]
-version = "1.10.0"
-when = "2022-09-13"
-user-id = 1139
-user-login = "Manishearth"
-user-name = "Manish Goregaokar"
[[publisher.unicode-width]]
version = "0.1.10"
when = "2022-09-13"

View file

@@ -26,6 +26,7 @@ base64 = "0.21"
chrono = "0.4.6"
clap = { version = "4", default-features = false, features = ["cargo", "std", "suggestions", "wrap_help", "string"] }
hyper = "0.14"
+icu_segmenter = { version = "1.4", default-features = false, features = ["auto", "compiled_data"] }
lazy_static = "1.0"
log = { version = "0.4", features = ["std"] }
marionette = { path = "./marionette", version="0.5.0" }
@@ -40,7 +41,6 @@ serde_json = "1.0"
serde_yaml = "0.8"
tempfile = "3"
thiserror = "1"
unicode-segmentation = "1.9"
url = "2.4"
uuid = { version = "1.0", features = ["v4"] }
webdriver = { path = "../webdriver", version="0.50.0" }

View file

@@ -31,12 +31,12 @@
//! [`init`]: fn.init.html
//! [`init_with_level`]: fn.init_with_level.html
+use icu_segmenter::GraphemeClusterSegmenter;
use std::fmt;
use std::io;
use std::io::Write;
use std::str;
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
-use unicode_segmentation::UnicodeSegmentation;
use mozprofile::preferences::Pref;
@@ -246,7 +246,15 @@ fn truncate_message(args: &fmt::Arguments) -> Option<(String, String)> {
}
let message = format!("{}", args);
-let chars = message.graphemes(true).collect::<Vec<&str>>();
+if message.is_empty() || message.len() < MAX_STRING_LENGTH {
+return None;
+}
+let chars = GraphemeClusterSegmenter::new()
+.segment_str(&message)
+.collect::<Vec<_>>()
+.windows(2)
+.map(|i| &message[i[0]..i[1]])
+.collect::<Vec<&str>>();
if chars.len() > MAX_STRING_LENGTH {
let middle: usize = MAX_STRING_LENGTH / 2;
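A note on this hunk: message.len() is the UTF-8 byte length, which is always at least the number of grapheme clusters, so the new early return skips segmentation for messages that cannot possibly exceed MAX_STRING_LENGTH clusters; the windows(2) idiom then rebuilds cluster slices from adjacent boundary indices. A self-contained sketch of the same pattern, where truncate_by_graphemes is a hypothetical stand-in for truncate_message with the limit passed explicitly:

```rust
use icu_segmenter::GraphemeClusterSegmenter;

/// Hypothetical helper: returns the first and last `max / 2` grapheme
/// clusters of an over-long message, or None if the message fits.
fn truncate_by_graphemes(message: &str, max: usize) -> Option<(String, String)> {
    // Byte length bounds the cluster count from above, so short messages
    // skip segmentation entirely.
    if message.is_empty() || message.len() < max {
        return None;
    }
    let breakpoints: Vec<usize> = GraphemeClusterSegmenter::new()
        .segment_str(message)
        .collect();
    let chars: Vec<&str> = breakpoints
        .windows(2)
        .map(|w| &message[w[0]..w[1]])
        .collect();
    if chars.len() <= max {
        return None;
    }
    let middle = max / 2;
    Some((chars[..middle].concat(), chars[chars.len() - middle..].concat()))
}
```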

View file

@@ -26,6 +26,7 @@ base64 = "0.21"
bytes = "1.0"
cookie = { version = "0.16", default-features = false }
http = "0.2"
+icu_segmenter = { version = "1.4", default-features = false, features = ["auto", "compiled_data"] }
log = "0.4"
serde = "1.0"
serde_json = "1.0"
@@ -33,7 +34,6 @@ serde_derive = "1.0"
time = "0.3"
tokio = { version = "1.0", features = ["rt", "net"], optional = true}
tokio-stream = { version = "0.1", features = ["net"], optional = true}
unicode-segmentation = "1.2"
url = "2.4"
thiserror = "1"
warp = { version = "0.3", default-features = false, optional = true }

View file

@@ -3,12 +3,12 @@
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
use crate::common::{WebElement, ELEMENT_KEY};
+use icu_segmenter::GraphemeClusterSegmenter;
use serde::de::{self, Deserialize, Deserializer};
use serde::ser::{Serialize, Serializer};
use serde_json::Value;
use std::default::Default;
use std::f64;
-use unicode_segmentation::UnicodeSegmentation;
#[derive(Debug, PartialEq, Serialize, Deserialize)]
pub struct ActionSequence {
@@ -91,7 +91,7 @@ where
{
String::deserialize(deserializer).map(|value| {
// Only a single Unicode grapheme cluster is allowed
-if value.graphemes(true).count() != 1 {
+if GraphemeClusterSegmenter::new().segment_str(&value).count() != 2 {
return Err(de::Error::custom(format!(
"'{}' should only contain a single Unicode code point",
value
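Why the check becomes != 2 rather than != 1: segment_str always yields the leading boundary at index 0 in addition to each cluster-final boundary, so a string holding exactly one grapheme cluster yields exactly two boundaries, [0, value.len()]. A minimal sketch of that equivalence (is_single_grapheme is a hypothetical helper, assuming icu_segmenter 1.4):

```rust
use icu_segmenter::GraphemeClusterSegmenter;

// One boundary more than there are clusters (the leading 0), so
// "exactly one grapheme cluster" means exactly two boundaries.
fn is_single_grapheme(value: &str) -> bool {
    GraphemeClusterSegmenter::new().segment_str(value).count() == 2
}

fn main() {
    assert!(is_single_grapheme("é")); // boundaries [0, 2]
    assert!(is_single_grapheme("🇷🇸")); // one flag cluster: [0, 8]
    assert!(!is_single_grapheme("ab")); // boundaries [0, 1, 2]
    assert!(!is_single_grapheme("")); // just [0]
}
```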

View file

@@ -6,6 +6,7 @@
extern crate base64;
extern crate cookie;
+extern crate icu_segmenter;
#[macro_use]
extern crate log;
extern crate http;
@@ -16,7 +17,6 @@ extern crate serde_json;
extern crate time;
#[cfg(feature = "server")]
extern crate tokio;
-extern crate unicode_segmentation;
extern crate url;
#[cfg(feature = "server")]
extern crate warp;

View file

@@ -1 +0,0 @@
{"files":{"COPYRIGHT":"23860c2a7b5d96b21569afedf033469bab9fe14a1b24a35068b8641c578ce24d","Cargo.toml":"55e5a65c91693dd47a27409e54ad6d5ce805ce003b822e4a568bfd070725e956","LICENSE-APACHE":"a60eea817514531668d7e00765731449fe14d059d3249e0bc93b36de45f759f2","LICENSE-MIT":"7b63ecd5f1902af1b63729947373683c32745c16a10e8e6292e2e2dcd7e90ae0","README.md":"efe7aa058e004e12d683039dbc4440e2fec3088364201a620703acedbeef8cb2","benches/graphemes.rs":"88a9f672ea7a03cc15fae36ce544a6e7234e532359402483978858ccda47db3d","benches/unicode_words.rs":"95c3a178ebe07c8cb2c560546ee911bfc4f1e1db81a6cd2c1cef1c99ed2a421a","benches/word_bounds.rs":"66acf40c0a4b06cdb6dd97c1759aba8dea961bb30cd7f223de3ebff8198520b2","scripts/unicode.py":"d4ba970a0419f33d20f3deb888be12427bfbb40aa25a5719968600d45cf4dadb","scripts/unicode_gen_breaktests.py":"ee96982d8959bec75c2382233cfca7e239f12a89a1be5fbf942601a215bb9283","src/grapheme.rs":"b5a32bdbb529e9417e8ada8d92656339b6ffb4e9bed8e6d32a0409c13a03050b","src/lib.rs":"572789173717edd0fe037ae656530663406951636c548e6793711b7d5caad910","src/sentence.rs":"aac52f69207e0b68925ab0c6c18cc36ed3da8e918006d96d724f0f19d4d9d643","src/tables.rs":"ba9fa1774b6294ed14565ec6be0f2ec316759d54e3af7c002b6848973d7b1f3c","src/test.rs":"f039fa285d510244672a067bdbe98ce7ff940e4f2ff82926466e012ac48ad95a","src/testdata.rs":"533c02ecace1bec3d46b65d101c7619bc83a2fb2c187a2c960346533c09a0e3e","src/word.rs":"6eeea9351c12f0a4404606596a487e0e8aa948ba4b134c7cb827ee41557a39fe"},"package":"0fdbf052a0783de01e944a6ce7a8cb939e295b1e7be835a1112c3b9a7f047a5a"}

View file

@@ -1,7 +0,0 @@
Licensed under the Apache License, Version 2.0
<LICENSE-APACHE or
http://www.apache.org/licenses/LICENSE-2.0> or the MIT
license <LICENSE-MIT or http://opensource.org/licenses/MIT>,
at your option. All files in the project carrying such
notice may not be copied, modified, or distributed except
according to those terms.

View file

@@ -1,63 +0,0 @@
# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
#
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
# to registry (e.g., crates.io) dependencies.
#
# If you are reading this file be aware that the original Cargo.toml
# will likely look very different (and much more reasonable).
# See Cargo.toml.orig for the original contents.
[package]
edition = "2018"
name = "unicode-segmentation"
version = "1.10.0"
authors = [
"kwantam <kwantam@gmail.com>",
"Manish Goregaokar <manishsmail@gmail.com>",
]
exclude = [
"target/*",
"Cargo.lock",
"scripts/tmp",
"benches/texts/*",
"*.txt",
]
description = """
This crate provides Grapheme Cluster, Word and Sentence boundaries
according to Unicode Standard Annex #29 rules.
"""
homepage = "https://github.com/unicode-rs/unicode-segmentation"
documentation = "https://unicode-rs.github.io/unicode-segmentation"
readme = "README.md"
keywords = [
"text",
"unicode",
"grapheme",
"word",
"boundary",
]
license = "MIT/Apache-2.0"
repository = "https://github.com/unicode-rs/unicode-segmentation"
[[bench]]
name = "graphemes"
harness = false
[[bench]]
name = "unicode_words"
harness = false
[[bench]]
name = "word_bounds"
harness = false
[dev-dependencies.criterion]
version = "0.3"
[dev-dependencies.quickcheck]
version = "0.7"
[features]
no_std = []

View file

@@ -1,201 +0,0 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

View file

@@ -1,25 +0,0 @@
Copyright (c) 2015 The Rust Project Developers
Permission is hereby granted, free of charge, to any
person obtaining a copy of this software and associated
documentation files (the "Software"), to deal in the
Software without restriction, including without
limitation the rights to use, copy, modify, merge,
publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software
is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice
shall be included in all copies or substantial portions
of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.

View file

@@ -1,99 +0,0 @@
Iterators which split strings on Grapheme Cluster or Word boundaries, according
to the [Unicode Standard Annex #29](http://www.unicode.org/reports/tr29/) rules.
[![Build Status](https://travis-ci.org/unicode-rs/unicode-segmentation.svg)](https://travis-ci.org/unicode-rs/unicode-segmentation)
[Documentation](https://unicode-rs.github.io/unicode-segmentation/unicode_segmentation/index.html)
```rust
use unicode_segmentation::UnicodeSegmentation;
fn main() {
let s = "a̐éö̲\r\n";
let g = s.graphemes(true).collect::<Vec<&str>>();
let b: &[_] = &["a̐", "é", "ö̲", "\r\n"];
assert_eq!(g, b);
let s = "The quick (\"brown\") fox can't jump 32.3 feet, right?";
let w = s.unicode_words().collect::<Vec<&str>>();
let b: &[_] = &["The", "quick", "brown", "fox", "can't", "jump", "32.3", "feet", "right"];
assert_eq!(w, b);
let s = "The quick (\"brown\") fox";
let w = s.split_word_bounds().collect::<Vec<&str>>();
let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", " ", "fox"];
assert_eq!(w, b);
}
```
# no_std
unicode-segmentation does not depend on libstd, so it can be used in crates
with the `#![no_std]` attribute.
# crates.io
You can use this package in your project by adding the following
to your `Cargo.toml`:
```toml
[dependencies]
unicode-segmentation = "1.9.0"
```
# Change Log
## 1.7.1
* Update docs on version number
## 1.7.0
* [#87](https://github.com/unicode-rs/unicode-segmentation/pull/87) Upgrade to Unicode 13
* [#79](https://github.com/unicode-rs/unicode-segmentation/pull/79) Implement a special-case lookup for ascii grapheme categories
* [#77](https://github.com/unicode-rs/unicode-segmentation/pull/77) Optimization for grapheme iteration
## 1.6.0
* [#72](https://github.com/unicode-rs/unicode-segmentation/pull/72) Upgrade to Unicode 12
## 1.5.0
* [#68](https://github.com/unicode-rs/unicode-segmentation/pull/68) Upgrade to Unicode 11
## 1.4.0
* [#56](https://github.com/unicode-rs/unicode-segmentation/pull/56) Upgrade to Unicode 10
## 1.3.0
* [#24](https://github.com/unicode-rs/unicode-segmentation/pull/24) Add support for sentence boundaries
* [#44](https://github.com/unicode-rs/unicode-segmentation/pull/44) Treat `gc=No` as a subset of `gc=N`
## 1.2.1
* [#37](https://github.com/unicode-rs/unicode-segmentation/pull/37):
Fix panic in `provide_context`.
* [#40](https://github.com/unicode-rs/unicode-segmentation/pull/40):
Fix crash in `prev_boundary`.
## 1.2.0
* New `GraphemeCursor` API allows random access and bidirectional iteration.
* Fixed incorrect splitting of certain emoji modifier sequences.
## 1.1.0
* Add `as_str` methods to the iterator types.
## 1.0.3
* Code cleanup and additional tests.
## 1.0.1
* Fix a bug affecting some grapheme clusters containing Prepend characters.
## 1.0.0
* Upgrade to Unicode 9.0.0.

View file

@@ -1,63 +0,0 @@
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use unicode_segmentation;
use std::fs;
use unicode_segmentation::UnicodeSegmentation;
fn graphemes(c: &mut Criterion, lang: &str, path: &str) {
let text = fs::read_to_string(path).unwrap();
c.bench_function(&format!("graphemes_{}", lang), |bench| {
bench.iter(|| {
for g in UnicodeSegmentation::graphemes(black_box(&*text), true) {
black_box(g);
}
})
});
}
fn graphemes_arabic(c: &mut Criterion) {
graphemes(c, "arabic", "benches/texts/arabic.txt");
}
fn graphemes_english(c: &mut Criterion) {
graphemes(c, "english", "benches/texts/english.txt");
}
fn graphemes_hindi(c: &mut Criterion) {
graphemes(c, "hindi", "benches/texts/hindi.txt");
}
fn graphemes_japanese(c: &mut Criterion) {
graphemes(c, "japanese", "benches/texts/japanese.txt");
}
fn graphemes_korean(c: &mut Criterion) {
graphemes(c, "korean", "benches/texts/korean.txt");
}
fn graphemes_mandarin(c: &mut Criterion) {
graphemes(c, "mandarin", "benches/texts/mandarin.txt");
}
fn graphemes_russian(c: &mut Criterion) {
graphemes(c, "russian", "benches/texts/russian.txt");
}
fn graphemes_source_code(c: &mut Criterion) {
graphemes(c, "source_code", "benches/texts/source_code.txt");
}
criterion_group!(
benches,
graphemes_arabic,
graphemes_english,
graphemes_hindi,
graphemes_japanese,
graphemes_korean,
graphemes_mandarin,
graphemes_russian,
graphemes_source_code,
);
criterion_main!(benches);

View file

@@ -1,64 +0,0 @@
#[macro_use]
extern crate bencher;
extern crate unicode_segmentation;
use bencher::Bencher;
use std::fs;
use unicode_segmentation::UnicodeSegmentation;
fn unicode_words(bench: &mut Bencher, path: &str) {
let text = fs::read_to_string(path).unwrap();
bench.iter(|| {
for w in text.unicode_words() {
bencher::black_box(w);
}
});
bench.bytes = text.len() as u64;
}
fn unicode_words_arabic(bench: &mut Bencher) {
unicode_words(bench, "benches/texts/arabic.txt");
}
fn unicode_words_english(bench: &mut Bencher) {
unicode_words(bench, "benches/texts/english.txt");
}
fn unicode_words_hindi(bench: &mut Bencher) {
unicode_words(bench, "benches/texts/hindi.txt");
}
fn unicode_words_japanese(bench: &mut Bencher) {
unicode_words(bench, "benches/texts/japanese.txt");
}
fn unicode_words_korean(bench: &mut Bencher) {
unicode_words(bench, "benches/texts/korean.txt");
}
fn unicode_words_mandarin(bench: &mut Bencher) {
unicode_words(bench, "benches/texts/mandarin.txt");
}
fn unicode_words_russian(bench: &mut Bencher) {
unicode_words(bench, "benches/texts/russian.txt");
}
fn unicode_words_source_code(bench: &mut Bencher) {
unicode_words(bench, "benches/texts/source_code.txt");
}
benchmark_group!(
benches,
unicode_words_arabic,
unicode_words_english,
unicode_words_hindi,
unicode_words_japanese,
unicode_words_korean,
unicode_words_mandarin,
unicode_words_russian,
unicode_words_source_code,
);
benchmark_main!(benches);

View file

@@ -1,64 +0,0 @@
#[macro_use]
extern crate bencher;
extern crate unicode_segmentation;
use bencher::Bencher;
use std::fs;
use unicode_segmentation::UnicodeSegmentation;
fn word_bounds(bench: &mut Bencher, path: &str) {
let text = fs::read_to_string(path).unwrap();
bench.iter(|| {
for w in text.split_word_bounds() {
bencher::black_box(w);
}
});
bench.bytes = text.len() as u64;
}
fn word_bounds_arabic(bench: &mut Bencher) {
word_bounds(bench, "benches/texts/arabic.txt");
}
fn word_bounds_english(bench: &mut Bencher) {
word_bounds(bench, "benches/texts/english.txt");
}
fn word_bounds_hindi(bench: &mut Bencher) {
word_bounds(bench, "benches/texts/hindi.txt");
}
fn word_bounds_japanese(bench: &mut Bencher) {
word_bounds(bench, "benches/texts/japanese.txt");
}
fn word_bounds_korean(bench: &mut Bencher) {
word_bounds(bench, "benches/texts/korean.txt");
}
fn word_bounds_mandarin(bench: &mut Bencher) {
word_bounds(bench, "benches/texts/mandarin.txt");
}
fn word_bounds_russian(bench: &mut Bencher) {
word_bounds(bench, "benches/texts/russian.txt");
}
fn word_bounds_source_code(bench: &mut Bencher) {
word_bounds(bench, "benches/texts/source_code.txt");
}
benchmark_group!(
benches,
word_bounds_arabic,
word_bounds_english,
word_bounds_hindi,
word_bounds_japanese,
word_bounds_korean,
word_bounds_mandarin,
word_bounds_russian,
word_bounds_source_code,
);
benchmark_main!(benches);

View file

@@ -1,381 +0,0 @@
#!/usr/bin/env python
#
# Copyright 2011-2015 The Rust Project Developers. See the COPYRIGHT
# file at the top-level directory of this distribution and at
# http://rust-lang.org/COPYRIGHT.
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.
# This script uses the following Unicode tables:
# - DerivedCoreProperties.txt
# - auxiliary/GraphemeBreakProperty.txt
# - auxiliary/WordBreakProperty.txt
# - ReadMe.txt
# - UnicodeData.txt
#
# Since this should not require frequent updates, we just store this
# out-of-line and check the unicode.rs file into git.
import fileinput, re, os, sys
preamble = '''// Copyright 2012-2018 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly
#![allow(missing_docs, non_upper_case_globals, non_snake_case)]
'''
# Mapping taken from Table 12 from:
# http://www.unicode.org/reports/tr44/#General_Category_Values
expanded_categories = {
'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'],
'Lm': ['L'], 'Lo': ['L'],
'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'],
'Nd': ['N'], 'Nl': ['N'], 'No': ['N'],
'Pc': ['P'], 'Pd': ['P'], 'Ps': ['P'], 'Pe': ['P'],
'Pi': ['P'], 'Pf': ['P'], 'Po': ['P'],
'Sm': ['S'], 'Sc': ['S'], 'Sk': ['S'], 'So': ['S'],
'Zs': ['Z'], 'Zl': ['Z'], 'Zp': ['Z'],
'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'],
}
# these are the surrogate codepoints, which are not valid rust characters
surrogate_codepoints = (0xd800, 0xdfff)
UNICODE_VERSION = (15, 0, 0)
UNICODE_VERSION_NUMBER = "%s.%s.%s" %UNICODE_VERSION
def is_surrogate(n):
return surrogate_codepoints[0] <= n <= surrogate_codepoints[1]
def fetch(f):
if not os.path.exists(os.path.basename(f)):
if "emoji" in f:
os.system("curl -O https://www.unicode.org/Public/%s/ucd/emoji/%s"
% (UNICODE_VERSION_NUMBER, f))
else:
os.system("curl -O https://www.unicode.org/Public/%s/ucd/%s"
% (UNICODE_VERSION_NUMBER, f))
if not os.path.exists(os.path.basename(f)):
sys.stderr.write("cannot load %s" % f)
exit(1)
def load_gencats(f):
fetch(f)
gencats = {}
udict = {};
range_start = -1;
for line in fileinput.input(f):
data = line.split(';');
if len(data) != 15:
continue
cp = int(data[0], 16);
if is_surrogate(cp):
continue
if range_start >= 0:
for i in range(range_start, cp):
udict[i] = data;
range_start = -1;
if data[1].endswith(", First>"):
range_start = cp;
continue;
udict[cp] = data;
for code in udict:
[code_org, name, gencat, combine, bidi,
decomp, deci, digit, num, mirror,
old, iso, upcase, lowcase, titlecase ] = udict[code];
# place letter in categories as appropriate
for cat in [gencat, "Assigned"] + expanded_categories.get(gencat, []):
if cat not in gencats:
gencats[cat] = []
gencats[cat].append(code)
gencats = group_cats(gencats)
return gencats
def group_cats(cats):
cats_out = {}
for cat in cats:
cats_out[cat] = group_cat(cats[cat])
return cats_out
def group_cat(cat):
cat_out = []
letters = sorted(set(cat))
cur_start = letters.pop(0)
cur_end = cur_start
for letter in letters:
assert letter > cur_end, \
"cur_end: %s, letter: %s" % (hex(cur_end), hex(letter))
if letter == cur_end + 1:
cur_end = letter
else:
cat_out.append((cur_start, cur_end))
cur_start = cur_end = letter
cat_out.append((cur_start, cur_end))
return cat_out
def ungroup_cat(cat):
cat_out = []
for (lo, hi) in cat:
while lo <= hi:
cat_out.append(lo)
lo += 1
return cat_out
def format_table_content(f, content, indent):
line = " "*indent
first = True
for chunk in content.split(","):
if len(line) + len(chunk) < 98:
if first:
line += chunk
else:
line += ", " + chunk
first = False
else:
f.write(line + ",\n")
line = " "*indent + chunk
f.write(line)
def load_properties(f, interestingprops):
fetch(f)
props = {}
re1 = re.compile(r"^ *([0-9A-F]+) *; *(\w+)")
re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")
for line in fileinput.input(os.path.basename(f)):
prop = None
d_lo = 0
d_hi = 0
m = re1.match(line)
if m:
d_lo = m.group(1)
d_hi = m.group(1)
prop = m.group(2)
else:
m = re2.match(line)
if m:
d_lo = m.group(1)
d_hi = m.group(2)
prop = m.group(3)
else:
continue
if interestingprops and prop not in interestingprops:
continue
d_lo = int(d_lo, 16)
d_hi = int(d_hi, 16)
if prop not in props:
props[prop] = []
props[prop].append((d_lo, d_hi))
# optimize if possible
for prop in props:
props[prop] = group_cat(ungroup_cat(props[prop]))
return props
def escape_char(c):
return "'\\u{%x}'" % c
def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1])), is_const=True):
pub_string = "const"
if not is_const:
pub_string = "let"
if is_pub:
pub_string = "pub " + pub_string
f.write(" %s %s: %s = &[\n" % (pub_string, name, t_type))
data = ""
first = True
for dat in t_data:
if not first:
data += ","
first = False
data += pfun(dat)
format_table_content(f, data, 8)
f.write("\n ];\n\n")
def emit_util_mod(f):
f.write("""
pub mod util {
#[inline]
pub fn bsearch_range_table(c: char, r: &'static [(char,char)]) -> bool {
use core::cmp::Ordering::{Equal, Less, Greater};
r.binary_search_by(|&(lo,hi)| {
if lo <= c && c <= hi { Equal }
else if hi < c { Less }
else { Greater }
}).is_ok()
}
#[inline]
fn is_alphabetic(c: char) -> bool {
match c {
'a' ..= 'z' | 'A' ..= 'Z' => true,
c if c > '\x7f' => super::derived_property::Alphabetic(c),
_ => false,
}
}
#[inline]
fn is_numeric(c: char) -> bool {
match c {
'0' ..= '9' => true,
c if c > '\x7f' => super::general_category::N(c),
_ => false,
}
}
#[inline]
pub fn is_alphanumeric(c: char) -> bool {
is_alphabetic(c) || is_numeric(c)
}
}
""")
def emit_property_module(f, mod, tbl, emit):
f.write("mod %s {\n" % mod)
for cat in sorted(emit):
emit_table(f, "%s_table" % cat, tbl[cat], is_pub=False)
f.write(" #[inline]\n")
f.write(" pub fn %s(c: char) -> bool {\n" % cat)
f.write(" super::util::bsearch_range_table(c, %s_table)\n" % cat)
f.write(" }\n\n")
f.write("}\n\n")
def emit_break_module(f, break_table, break_cats, name):
Name = name.capitalize()
f.write("""pub mod %s {
use core::result::Result::{Ok, Err};
pub use self::%sCat::*;
#[allow(non_camel_case_types)]
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub enum %sCat {
""" % (name, Name, Name))
break_cats.append("Any")
break_cats.sort()
for cat in break_cats:
f.write((" %sC_" % Name[0]) + cat + ",\n")
f.write(""" }
fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)]) -> (u32, u32, %sCat) {
use core::cmp::Ordering::{Equal, Less, Greater};
match r.binary_search_by(|&(lo, hi, _)| {
if lo <= c && c <= hi { Equal }
else if hi < c { Less }
else { Greater }
}) {
Ok(idx) => {
let (lower, upper, cat) = r[idx];
(lower as u32, upper as u32, cat)
}
Err(idx) => {
(
if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 },
r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX),
%sC_Any,
)
}
}
}
pub fn %s_category(c: char) -> (u32, u32, %sCat) {
bsearch_range_value_table(c, %s_cat_table)
}
""" % (Name, Name, Name[0], name, Name, name))
emit_table(f, "%s_cat_table" % name, break_table, "&'static [(char, char, %sCat)]" % Name,
pfun=lambda x: "(%s,%s,%sC_%s)" % (escape_char(x[0]), escape_char(x[1]), Name[0], x[2]),
is_pub=False, is_const=True)
f.write("}\n")
if __name__ == "__main__":
r = "tables.rs"
if os.path.exists(r):
os.remove(r)
with open(r, "w") as rf:
# write the file's preamble
rf.write(preamble)
rf.write("""
/// The version of [Unicode](http://www.unicode.org/)
/// that this version of unicode-segmentation is based on.
pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s);
""" % UNICODE_VERSION)
# download and parse all the data
gencats = load_gencats("UnicodeData.txt")
derived = load_properties("DerivedCoreProperties.txt", ["Alphabetic"])
emit_util_mod(rf)
for (name, cat, pfuns) in ("general_category", gencats, ["N"]), \
("derived_property", derived, ["Alphabetic"]):
emit_property_module(rf, name, cat, pfuns)
### grapheme cluster module
# from http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Break_Property_Values
grapheme_cats = load_properties("auxiliary/GraphemeBreakProperty.txt", [])
# Control
# Note:
# This category also includes Cs (surrogate codepoints), but Rust's `char`s are
# Unicode Scalar Values only, and surrogates are thus invalid `char`s.
# Thus, we have to remove Cs from the Control category
grapheme_cats["Control"] = group_cat(list(
set(ungroup_cat(grapheme_cats["Control"]))
- set(ungroup_cat([surrogate_codepoints]))))
grapheme_table = []
for cat in grapheme_cats:
grapheme_table.extend([(x, y, cat) for (x, y) in grapheme_cats[cat]])
emoji_props = load_properties("emoji-data.txt", ["Extended_Pictographic"])
grapheme_table.extend([(x, y, "Extended_Pictographic") for (x, y) in emoji_props["Extended_Pictographic"]])
grapheme_table.sort(key=lambda w: w[0])
last = -1
for chars in grapheme_table:
if chars[0] <= last:
raise "Grapheme tables and Extended_Pictographic values overlap; need to store these separately!"
last = chars[1]
emit_break_module(rf, grapheme_table, list(grapheme_cats.keys()) + ["Extended_Pictographic"], "grapheme")
rf.write("\n")
word_cats = load_properties("auxiliary/WordBreakProperty.txt", [])
word_table = []
for cat in word_cats:
word_table.extend([(x, y, cat) for (x, y) in word_cats[cat]])
word_table.sort(key=lambda w: w[0])
emit_break_module(rf, word_table, list(word_cats.keys()), "word")
# There are some emoji which are also ALetter, so this needs to be stored separately
# For efficiency, we could still merge the two tables and produce an ALetterEP state
emoji_table = [(x, y, "Extended_Pictographic") for (x, y) in emoji_props["Extended_Pictographic"]]
emit_break_module(rf, emoji_table, ["Extended_Pictographic"], "emoji")
sentence_cats = load_properties("auxiliary/SentenceBreakProperty.txt", [])
sentence_table = []
for cat in sentence_cats:
sentence_table.extend([(x, y, cat) for (x, y) in sentence_cats[cat]])
sentence_table.sort(key=lambda w: w[0])
emit_break_module(rf, sentence_table, list(sentence_cats.keys()), "sentence")

View file

@@ -1,212 +0,0 @@
#!/usr/bin/env python
# -*- coding: utf-8
#
# Copyright 2015 The Rust Project Developers. See the COPYRIGHT
# file at the top-level directory of this distribution and at
# http://rust-lang.org/COPYRIGHT.
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.
# This script uses the following Unicode tables:
# - auxiliary/GraphemeBreakTest.txt
# - auxiliary/WordBreakTest.txt
#
# Since this should not require frequent updates, we just store this
# out-of-line and check the unicode.rs file into git.
from __future__ import print_function
import unicode, re, os, fileinput
def load_test_data(f, optsplit=[]):
testRe1 = re.compile(r"\s+([^\s].*[^\s])\s+÷\s+#\s+÷\s+\[0.2\].*?([÷×].*)\s+÷\s+\[0.3\]\s*$")
unicode.fetch(f)
data = []
for line in fileinput.input(os.path.basename(f)):
# lines that include a test start with the ÷ character
if len(line) < 2 or not line.startswith('÷'):
continue
m = testRe1.match(line)
if not m:
print("error: no match on line where test was expected: %s" % line)
continue
# process the characters in this test case
chars = process_split_string(m.group(1))
# skip test case if it contains invalid characters (viz., surrogates)
if not chars:
continue
# now process test cases
(chars, info) = process_split_info(m.group(2), chars, optsplit)
# make sure that we have break info for each break!
assert len(chars) - 1 == len(info)
data.append((chars, info))
return data
def process_split_info(s, c, o):
outcs = []
outis = []
workcs = c.pop(0)
# are we on a × or a ÷?
isX = False
if s.startswith('×'):
isX = True
# find each instance of '(÷|×) [x.y] '
while s:
# find the currently considered rule number
sInd = s.index('[') + 1
eInd = s.index(']')
# if it's '× [a.b]' where 'a.b' is in o, then
# we consider it a split even though it's not
# marked as one
# if it's ÷ then it's always a split
if not isX or s[sInd:eInd] in o:
outis.append(s[sInd:eInd])
outcs.append(workcs)
workcs = c.pop(0)
else:
workcs.extend(c.pop(0))
idx = 1
while idx < len(s):
if s[idx:].startswith('×'):
isX = True
break
if s[idx:].startswith('÷'):
isX = False
break
idx += 1
s = s[idx:]
outcs.append(workcs)
return (outcs, outis)
def process_split_string(s):
outls = []
workls = []
inls = s.split()
for i in inls:
if i == '÷' or i == '×':
outls.append(workls)
workls = []
continue
ival = int(i,16)
if unicode.is_surrogate(ival):
return []
workls.append(ival)
if workls:
outls.append(workls)
return outls
def showfun(x):
outstr = '("'
for c in x[0]:
outstr += "\\u{%x}" % c
outstr += '",&['
xfirst = True
for xx in x[1:]:
if not xfirst:
outstr += '],&['
xfirst = False
sfirst = True
for sp in xx:
if not sfirst:
outstr += ','
sfirst = False
outstr += '"'
for c in sp:
outstr += "\\u{%x}" % c
outstr += '"'
outstr += '])'
return outstr
def create_grapheme_data(f):
# rules 9.1 and 9.2 are for extended graphemes only
optsplits = ['9.1','9.2']
d = load_test_data("auxiliary/GraphemeBreakTest.txt", optsplits)
test_same = []
test_diff = []
for (c, i) in d:
allchars = [cn for s in c for cn in s]
extgraphs = []
extwork = []
extwork.extend(c[0])
for n in range(0,len(i)):
if i[n] in optsplits:
extwork.extend(c[n+1])
else:
extgraphs.append(extwork)
extwork = []
extwork.extend(c[n+1])
# these are the extended grapheme clusters
extgraphs.append(extwork)
if extgraphs == c:
test_same.append((allchars, c))
else:
test_diff.append((allchars, extgraphs, c))
stype = "&'static [(&'static str, &'static [&'static str])]"
dtype = "&'static [(&'static str, &'static [&'static str], &'static [&'static str])]"
f.write(" // official Unicode test data\n")
f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/GraphemeBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER)
unicode.emit_table(f, "TEST_SAME", test_same, stype, True, showfun, True)
unicode.emit_table(f, "TEST_DIFF", test_diff, dtype, True, showfun, True)
def create_words_data(f):
d = load_test_data("auxiliary/WordBreakTest.txt")
test = []
for (c, i) in d:
allchars = [cn for s in c for cn in s]
test.append((allchars, c))
wtype = "&'static [(&'static str, &'static [&'static str])]"
f.write(" // official Unicode test data\n")
f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/WordBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER)
unicode.emit_table(f, "TEST_WORD", test, wtype, True, showfun, True)
def create_sentence_data(f):
d = load_test_data("auxiliary/SentenceBreakTest.txt")
test = []
for (c, i) in d:
allchars = [cn for s in c for cn in s]
test.append((allchars, c))
wtype = "&'static [(&'static str, &'static [&'static str])]"
f.write(" // official Unicode test data\n")
f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/SentenceBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER)
unicode.emit_table(f, "TEST_SENTENCE", test, wtype, True, showfun, True)
if __name__ == "__main__":
with open("testdata.rs", "w") as rf:
rf.write(unicode.preamble)
create_grapheme_data(rf)
create_words_data(rf)
create_sentence_data(rf)

View file

@@ -1,801 +0,0 @@
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use core::cmp;
use crate::tables::grapheme::GraphemeCat;
/// External iterator for grapheme clusters and byte offsets.
///
/// This struct is created by the [`grapheme_indices`] method on the [`UnicodeSegmentation`]
/// trait. See its documentation for more.
///
/// [`grapheme_indices`]: trait.UnicodeSegmentation.html#tymethod.grapheme_indices
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Clone)]
pub struct GraphemeIndices<'a> {
start_offset: usize,
iter: Graphemes<'a>,
}
impl<'a> GraphemeIndices<'a> {
#[inline]
/// View the underlying data (the part yet to be iterated) as a slice of the original string.
///
/// ```rust
/// # use unicode_segmentation::UnicodeSegmentation;
/// let mut iter = "abc".grapheme_indices(true);
/// assert_eq!(iter.as_str(), "abc");
/// iter.next();
/// assert_eq!(iter.as_str(), "bc");
/// iter.next();
/// iter.next();
/// assert_eq!(iter.as_str(), "");
/// ```
pub fn as_str(&self) -> &'a str {
self.iter.as_str()
}
}
impl<'a> Iterator for GraphemeIndices<'a> {
type Item = (usize, &'a str);
#[inline]
fn next(&mut self) -> Option<(usize, &'a str)> {
self.iter
.next()
.map(|s| (s.as_ptr() as usize - self.start_offset, s))
}
#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
self.iter.size_hint()
}
}
impl<'a> DoubleEndedIterator for GraphemeIndices<'a> {
#[inline]
fn next_back(&mut self) -> Option<(usize, &'a str)> {
self.iter
.next_back()
.map(|s| (s.as_ptr() as usize - self.start_offset, s))
}
}
/// External iterator for a string's
/// [grapheme clusters](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries).
///
/// This struct is created by the [`graphemes`] method on the [`UnicodeSegmentation`] trait. See its
/// documentation for more.
///
/// [`graphemes`]: trait.UnicodeSegmentation.html#tymethod.graphemes
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Clone, Debug)]
pub struct Graphemes<'a> {
string: &'a str,
cursor: GraphemeCursor,
cursor_back: GraphemeCursor,
}
impl<'a> Graphemes<'a> {
#[inline]
/// View the underlying data (the part yet to be iterated) as a slice of the original string.
///
/// ```rust
/// # use unicode_segmentation::UnicodeSegmentation;
/// let mut iter = "abc".graphemes(true);
/// assert_eq!(iter.as_str(), "abc");
/// iter.next();
/// assert_eq!(iter.as_str(), "bc");
/// iter.next();
/// iter.next();
/// assert_eq!(iter.as_str(), "");
/// ```
pub fn as_str(&self) -> &'a str {
&self.string[self.cursor.cur_cursor()..self.cursor_back.cur_cursor()]
}
}
impl<'a> Iterator for Graphemes<'a> {
type Item = &'a str;
#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
let slen = self.cursor_back.cur_cursor() - self.cursor.cur_cursor();
(cmp::min(slen, 1), Some(slen))
}
#[inline]
fn next(&mut self) -> Option<&'a str> {
let start = self.cursor.cur_cursor();
if start == self.cursor_back.cur_cursor() {
return None;
}
let next = self.cursor.next_boundary(self.string, 0).unwrap().unwrap();
Some(&self.string[start..next])
}
}
impl<'a> DoubleEndedIterator for Graphemes<'a> {
#[inline]
fn next_back(&mut self) -> Option<&'a str> {
let end = self.cursor_back.cur_cursor();
if end == self.cursor.cur_cursor() {
return None;
}
let prev = self
.cursor_back
.prev_boundary(self.string, 0)
.unwrap()
.unwrap();
Some(&self.string[prev..end])
}
}
#[inline]
pub fn new_graphemes<'b>(s: &'b str, is_extended: bool) -> Graphemes<'b> {
let len = s.len();
Graphemes {
string: s,
cursor: GraphemeCursor::new(0, len, is_extended),
cursor_back: GraphemeCursor::new(len, len, is_extended),
}
}
#[inline]
pub fn new_grapheme_indices<'b>(s: &'b str, is_extended: bool) -> GraphemeIndices<'b> {
GraphemeIndices {
start_offset: s.as_ptr() as usize,
iter: new_graphemes(s, is_extended),
}
}
// maybe unify with PairResult?
// An enum describing information about a potential boundary.
#[derive(PartialEq, Eq, Clone, Debug)]
enum GraphemeState {
// No information is known.
Unknown,
// It is known to not be a boundary.
NotBreak,
// It is known to be a boundary.
Break,
// The codepoint after is a Regional Indicator Symbol, so a boundary iff
// it is preceded by an even number of RIS codepoints. (GB12, GB13)
Regional,
// The codepoint after is Extended_Pictographic,
// so whether it's a boundary depends on pre-context according to GB11.
Emoji,
}
/// Cursor-based segmenter for grapheme clusters.
///
/// This allows working with ropes and other datastructures where the string is not contiguous or
/// fully known at initialization time.
#[derive(Clone, Debug)]
pub struct GraphemeCursor {
// Current cursor position.
offset: usize,
// Total length of the string.
len: usize,
// A config flag indicating whether this cursor computes legacy or extended
// grapheme cluster boundaries (enables GB9a and GB9b if set).
is_extended: bool,
// Information about the potential boundary at `offset`
state: GraphemeState,
// Category of codepoint immediately preceding cursor, if known.
cat_before: Option<GraphemeCat>,
// Category of codepoint immediately after cursor, if known.
cat_after: Option<GraphemeCat>,
// If set, at least one more codepoint immediately preceding this offset
// is needed to resolve whether there's a boundary at `offset`.
pre_context_offset: Option<usize>,
// The number of RIS codepoints preceding `offset`. If `pre_context_offset`
// is set, then counts the number of RIS between that and `offset`, otherwise
// is an accurate count relative to the string.
ris_count: Option<usize>,
// Set if a call to `prev_boundary` or `next_boundary` was suspended due
// to needing more input.
resuming: bool,
// Cached grapheme category and associated scalar value range.
grapheme_cat_cache: (u32, u32, GraphemeCat),
}
/// An error return indicating that not enough content was available in the
/// provided chunk to satisfy the query, and that more content must be provided.
#[derive(PartialEq, Eq, Debug)]
pub enum GraphemeIncomplete {
/// More pre-context is needed. The caller should call `provide_context`
/// with a chunk ending at the offset given, then retry the query. This
/// will only be returned if the `chunk_start` parameter is nonzero.
PreContext(usize),
/// When requesting `prev_boundary`, the cursor is moving past the beginning
/// of the current chunk, so the chunk before that is requested. This will
/// only be returned if the `chunk_start` parameter is nonzero.
PrevChunk,
/// When requesting `next_boundary`, the cursor is moving past the end of the
/// current chunk, so the chunk after that is requested. This will only be
/// returned if the chunk ends before the `len` parameter provided on
/// creation of the cursor.
NextChunk, // requesting chunk following the one given
/// An error returned when the chunk given does not contain the cursor position.
InvalidOffset,
}
// An enum describing the result from lookup of a pair of categories.
#[derive(PartialEq, Eq)]
enum PairResult {
NotBreak, // definitely not a break
Break, // definitely a break
Extended, // a break iff not in extended mode
Regional, // a break if preceded by an even number of RIS
Emoji, // a break if preceded by emoji base and (Extend)*
}
#[inline]
fn check_pair(before: GraphemeCat, after: GraphemeCat) -> PairResult {
use self::PairResult::*;
use crate::tables::grapheme::GraphemeCat::*;
match (before, after) {
(GC_CR, GC_LF) => NotBreak, // GB3
(GC_Control, _) => Break, // GB4
(GC_CR, _) => Break, // GB4
(GC_LF, _) => Break, // GB4
(_, GC_Control) => Break, // GB5
(_, GC_CR) => Break, // GB5
(_, GC_LF) => Break, // GB5
(GC_L, GC_L) => NotBreak, // GB6
(GC_L, GC_V) => NotBreak, // GB6
(GC_L, GC_LV) => NotBreak, // GB6
(GC_L, GC_LVT) => NotBreak, // GB6
(GC_LV, GC_V) => NotBreak, // GB7
(GC_LV, GC_T) => NotBreak, // GB7
(GC_V, GC_V) => NotBreak, // GB7
(GC_V, GC_T) => NotBreak, // GB7
(GC_LVT, GC_T) => NotBreak, // GB8
(GC_T, GC_T) => NotBreak, // GB8
(_, GC_Extend) => NotBreak, // GB9
(_, GC_ZWJ) => NotBreak, // GB9
(_, GC_SpacingMark) => Extended, // GB9a
(GC_Prepend, _) => Extended, // GB9b
(GC_ZWJ, GC_Extended_Pictographic) => Emoji, // GB11
(GC_Regional_Indicator, GC_Regional_Indicator) => Regional, // GB12, GB13
(_, _) => Break, // GB999
}
}
impl GraphemeCursor {
/// Create a new cursor. The string and initial offset are given at creation
/// time, but the contents of the string are not. The `is_extended` parameter
/// controls whether extended grapheme clusters are selected.
///
/// The `offset` parameter must be on a codepoint boundary.
///
/// ```rust
/// # use unicode_segmentation::GraphemeCursor;
/// let s = "हिन्दी";
/// let mut legacy = GraphemeCursor::new(0, s.len(), false);
/// assert_eq!(legacy.next_boundary(s, 0), Ok(Some("ह".len())));
/// let mut extended = GraphemeCursor::new(0, s.len(), true);
/// assert_eq!(extended.next_boundary(s, 0), Ok(Some("हि".len())));
/// ```
pub fn new(offset: usize, len: usize, is_extended: bool) -> GraphemeCursor {
let state = if offset == 0 || offset == len {
GraphemeState::Break
} else {
GraphemeState::Unknown
};
GraphemeCursor {
offset: offset,
len: len,
state: state,
is_extended: is_extended,
cat_before: None,
cat_after: None,
pre_context_offset: None,
ris_count: None,
resuming: false,
grapheme_cat_cache: (0, 0, GraphemeCat::GC_Control),
}
}
fn grapheme_category(&mut self, ch: char) -> GraphemeCat {
use crate::tables::grapheme as gr;
use crate::tables::grapheme::GraphemeCat::*;
if ch <= '\u{7e}' {
// Special-case optimization for ascii, except U+007F. This
// improves performance even for many primarily non-ascii texts,
// due to use of punctuation and white space characters from the
// ascii range.
if ch >= '\u{20}' {
GC_Any
} else if ch == '\n' {
GC_LF
} else if ch == '\r' {
GC_CR
} else {
GC_Control
}
} else {
// If this char isn't within the cached range, update the cache to the
// range that includes it.
if (ch as u32) < self.grapheme_cat_cache.0 || (ch as u32) > self.grapheme_cat_cache.1 {
self.grapheme_cat_cache = gr::grapheme_category(ch);
}
self.grapheme_cat_cache.2
}
}
// Not sure I'm gonna keep this, the advantage over new() seems thin.
/// Set the cursor to a new location in the same string.
///
/// ```rust
/// # use unicode_segmentation::GraphemeCursor;
/// let s = "abcd";
/// let mut cursor = GraphemeCursor::new(0, s.len(), false);
/// assert_eq!(cursor.cur_cursor(), 0);
/// cursor.set_cursor(2);
/// assert_eq!(cursor.cur_cursor(), 2);
/// ```
pub fn set_cursor(&mut self, offset: usize) {
if offset != self.offset {
self.offset = offset;
self.state = if offset == 0 || offset == self.len {
GraphemeState::Break
} else {
GraphemeState::Unknown
};
// reset state derived from text around cursor
self.cat_before = None;
self.cat_after = None;
self.ris_count = None;
}
}
#[inline]
/// The current offset of the cursor. Equal to the last value provided to
/// `new()` or `set_cursor()`, or returned from `next_boundary()` or
/// `prev_boundary()`.
///
/// ```rust
/// # use unicode_segmentation::GraphemeCursor;
/// // Two flags (🇷🇸🇮🇴), each flag is two RIS codepoints, each RIS is 4 bytes.
/// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
/// let mut cursor = GraphemeCursor::new(4, flags.len(), false);
/// assert_eq!(cursor.cur_cursor(), 4);
/// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(8)));
/// assert_eq!(cursor.cur_cursor(), 8);
/// ```
pub fn cur_cursor(&self) -> usize {
self.offset
}
/// Provide additional pre-context when it is needed to decide a boundary.
/// The end of the chunk must coincide with the value given in the
/// `GraphemeIncomplete::PreContext` request.
///
/// ```rust
/// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
/// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
/// let mut cursor = GraphemeCursor::new(8, flags.len(), false);
/// // Not enough pre-context to decide if there's a boundary between the two flags.
/// assert_eq!(cursor.is_boundary(&flags[8..], 8), Err(GraphemeIncomplete::PreContext(8)));
/// // Provide one more Regional Indicator Symbol of pre-context
/// cursor.provide_context(&flags[4..8], 4);
/// // Still not enough context to decide.
/// assert_eq!(cursor.is_boundary(&flags[8..], 8), Err(GraphemeIncomplete::PreContext(4)));
/// // Provide additional requested context.
/// cursor.provide_context(&flags[0..4], 0);
/// // That's enough to decide (it always is when context goes to the start of the string)
/// assert_eq!(cursor.is_boundary(&flags[8..], 8), Ok(true));
/// ```
pub fn provide_context(&mut self, chunk: &str, chunk_start: usize) {
use crate::tables::grapheme as gr;
assert!(chunk_start + chunk.len() == self.pre_context_offset.unwrap());
self.pre_context_offset = None;
if self.is_extended && chunk_start + chunk.len() == self.offset {
let ch = chunk.chars().rev().next().unwrap();
if self.grapheme_category(ch) == gr::GC_Prepend {
self.decide(false); // GB9b
return;
}
}
match self.state {
GraphemeState::Regional => self.handle_regional(chunk, chunk_start),
GraphemeState::Emoji => self.handle_emoji(chunk, chunk_start),
_ => {
if self.cat_before.is_none() && self.offset == chunk.len() + chunk_start {
let ch = chunk.chars().rev().next().unwrap();
self.cat_before = Some(self.grapheme_category(ch));
}
}
}
}
#[inline]
fn decide(&mut self, is_break: bool) {
self.state = if is_break {
GraphemeState::Break
} else {
GraphemeState::NotBreak
};
}
#[inline]
fn decision(&mut self, is_break: bool) -> Result<bool, GraphemeIncomplete> {
self.decide(is_break);
Ok(is_break)
}
#[inline]
fn is_boundary_result(&self) -> Result<bool, GraphemeIncomplete> {
if self.state == GraphemeState::Break {
Ok(true)
} else if self.state == GraphemeState::NotBreak {
Ok(false)
} else if let Some(pre_context_offset) = self.pre_context_offset {
Err(GraphemeIncomplete::PreContext(pre_context_offset))
} else {
unreachable!("inconsistent state");
}
}
#[inline]
fn handle_regional(&mut self, chunk: &str, chunk_start: usize) {
use crate::tables::grapheme as gr;
let mut ris_count = self.ris_count.unwrap_or(0);
for ch in chunk.chars().rev() {
if self.grapheme_category(ch) != gr::GC_Regional_Indicator {
self.ris_count = Some(ris_count);
self.decide((ris_count % 2) == 0);
return;
}
ris_count += 1;
}
self.ris_count = Some(ris_count);
if chunk_start == 0 {
self.decide((ris_count % 2) == 0);
return;
}
self.pre_context_offset = Some(chunk_start);
self.state = GraphemeState::Regional;
}
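// Worked example for the parity logic above (GB12/GB13): in the two flags
// "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}" (🇷🇸🇮🇴), offset 8 has two
// Regional_Indicator chars before it (even), so it is a boundary, while
// offsets 4 and 12 have an odd count and are not. This is what makes flag
// sequences segment into pairs of RIS chars.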
#[inline]
fn handle_emoji(&mut self, chunk: &str, chunk_start: usize) {
use crate::tables::grapheme as gr;
let mut iter = chunk.chars().rev();
if let Some(ch) = iter.next() {
if self.grapheme_category(ch) != gr::GC_ZWJ {
self.decide(true);
return;
}
}
for ch in iter {
match self.grapheme_category(ch) {
gr::GC_Extend => (),
gr::GC_Extended_Pictographic => {
self.decide(false);
return;
}
_ => {
self.decide(true);
return;
}
}
}
if chunk_start == 0 {
self.decide(true);
return;
}
self.pre_context_offset = Some(chunk_start);
self.state = GraphemeState::Emoji;
}
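// Worked example for the scan above (GB11): in the family emoji
// "\u{1f468}\u{200d}\u{1f467}\u{200d}\u{1f466}" (man ZWJ girl ZWJ boy),
// every ZWJ is preceded by an Extended_Pictographic (with any Extend chars
// in between), so no position inside the sequence is a boundary and the
// whole sequence forms a single extended grapheme cluster.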
#[inline]
/// Determine whether the current cursor location is a grapheme cluster boundary.
/// Only a part of the string need be supplied. If `chunk_start` is nonzero or
/// the length of `chunk` is not equal to `len` on creation, then this method
/// may return `GraphemeIncomplete::PreContext`. The caller should then
/// call `provide_context` with the requested chunk, then retry calling this
/// method.
///
/// For partial chunks, if the cursor is not at the beginning or end of the
/// string, the chunk should contain at least the codepoint following the cursor.
/// If the string is nonempty, the chunk must be nonempty.
///
/// All calls should have consistent chunk contents (i.e., if a chunk provides
/// content for a given slice, all further chunks covering that slice must have
/// the same content for it).
///
/// ```rust
/// # use unicode_segmentation::GraphemeCursor;
/// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
/// let mut cursor = GraphemeCursor::new(8, flags.len(), false);
/// assert_eq!(cursor.is_boundary(flags, 0), Ok(true));
/// cursor.set_cursor(12);
/// assert_eq!(cursor.is_boundary(flags, 0), Ok(false));
/// ```
pub fn is_boundary(
&mut self,
chunk: &str,
chunk_start: usize,
) -> Result<bool, GraphemeIncomplete> {
use crate::tables::grapheme as gr;
if self.state == GraphemeState::Break {
return Ok(true);
}
if self.state == GraphemeState::NotBreak {
return Ok(false);
}
if self.offset < chunk_start || self.offset >= chunk_start + chunk.len() {
if self.offset > chunk_start + chunk.len() || self.cat_after.is_none() {
return Err(GraphemeIncomplete::InvalidOffset);
}
}
if let Some(pre_context_offset) = self.pre_context_offset {
return Err(GraphemeIncomplete::PreContext(pre_context_offset));
}
let offset_in_chunk = self.offset - chunk_start;
if self.cat_after.is_none() {
let ch = chunk[offset_in_chunk..].chars().next().unwrap();
self.cat_after = Some(self.grapheme_category(ch));
}
if self.offset == chunk_start {
let mut need_pre_context = true;
match self.cat_after.unwrap() {
gr::GC_Regional_Indicator => self.state = GraphemeState::Regional,
gr::GC_Extended_Pictographic => self.state = GraphemeState::Emoji,
_ => need_pre_context = self.cat_before.is_none(),
}
if need_pre_context {
self.pre_context_offset = Some(chunk_start);
return Err(GraphemeIncomplete::PreContext(chunk_start));
}
}
if self.cat_before.is_none() {
let ch = chunk[..offset_in_chunk].chars().rev().next().unwrap();
self.cat_before = Some(self.grapheme_category(ch));
}
match check_pair(self.cat_before.unwrap(), self.cat_after.unwrap()) {
PairResult::NotBreak => return self.decision(false),
PairResult::Break => return self.decision(true),
PairResult::Extended => {
let is_extended = self.is_extended;
return self.decision(!is_extended);
}
PairResult::Regional => {
if let Some(ris_count) = self.ris_count {
return self.decision((ris_count % 2) == 0);
}
self.handle_regional(&chunk[..offset_in_chunk], chunk_start);
self.is_boundary_result()
}
PairResult::Emoji => {
self.handle_emoji(&chunk[..offset_in_chunk], chunk_start);
self.is_boundary_result()
}
}
}
#[inline]
/// Find the next boundary after the current cursor position. Only a part of
/// the string need be supplied. If the chunk is incomplete, then this
/// method might return `GraphemeIncomplete::PreContext` or
/// `GraphemeIncomplete::NextChunk`. In the former case, the caller should
/// call `provide_context` with the requested chunk, then retry. In the
/// latter case, the caller should provide the chunk following the one
/// given, then retry.
///
/// See `is_boundary` for expectations on the provided chunk.
///
/// ```rust
/// # use unicode_segmentation::GraphemeCursor;
/// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
/// let mut cursor = GraphemeCursor::new(4, flags.len(), false);
/// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(8)));
/// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(16)));
/// assert_eq!(cursor.next_boundary(flags, 0), Ok(None));
/// ```
///
/// And an example that uses partial strings:
///
/// ```rust
/// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
/// let s = "abcd";
/// let mut cursor = GraphemeCursor::new(0, s.len(), false);
/// assert_eq!(cursor.next_boundary(&s[..2], 0), Ok(Some(1)));
/// assert_eq!(cursor.next_boundary(&s[..2], 0), Err(GraphemeIncomplete::NextChunk));
/// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(2)));
/// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(3)));
/// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(4)));
/// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(None));
/// ```
pub fn next_boundary(
&mut self,
chunk: &str,
chunk_start: usize,
) -> Result<Option<usize>, GraphemeIncomplete> {
if self.offset == self.len {
return Ok(None);
}
let mut iter = chunk[self.offset - chunk_start..].chars();
let mut ch = iter.next().unwrap();
loop {
if self.resuming {
if self.cat_after.is_none() {
self.cat_after = Some(self.grapheme_category(ch));
}
} else {
self.offset += ch.len_utf8();
self.state = GraphemeState::Unknown;
self.cat_before = self.cat_after.take();
if self.cat_before.is_none() {
self.cat_before = Some(self.grapheme_category(ch));
}
if self.cat_before.unwrap() == GraphemeCat::GC_Regional_Indicator {
self.ris_count = self.ris_count.map(|c| c + 1);
} else {
self.ris_count = Some(0);
}
if let Some(next_ch) = iter.next() {
ch = next_ch;
self.cat_after = Some(self.grapheme_category(ch));
} else if self.offset == self.len {
self.decide(true);
} else {
self.resuming = true;
return Err(GraphemeIncomplete::NextChunk);
}
}
self.resuming = true;
if self.is_boundary(chunk, chunk_start)? {
self.resuming = false;
return Ok(Some(self.offset));
}
self.resuming = false;
}
}
/// Find the previous boundary before the current cursor position. Only a part
/// of the string need be supplied. If the chunk is incomplete, then this
/// method might return `GraphemeIncomplete::PreContext` or
/// `GraphemeIncomplete::PrevChunk`. In the former case, the caller should
/// call `provide_context` with the requested chunk, then retry. In the
/// latter case, the caller should provide the chunk preceding the one
/// given, then retry.
///
/// See `is_boundary` for expectations on the provided chunk.
///
/// ```rust
/// # use unicode_segmentation::GraphemeCursor;
/// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
/// let mut cursor = GraphemeCursor::new(12, flags.len(), false);
/// assert_eq!(cursor.prev_boundary(flags, 0), Ok(Some(8)));
/// assert_eq!(cursor.prev_boundary(flags, 0), Ok(Some(0)));
/// assert_eq!(cursor.prev_boundary(flags, 0), Ok(None));
/// ```
///
/// And an example that uses partial strings (note the exact return is not
/// guaranteed, and may be `PrevChunk` or `PreContext` arbitrarily):
///
/// ```rust
/// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
/// let s = "abcd";
/// let mut cursor = GraphemeCursor::new(4, s.len(), false);
/// assert_eq!(cursor.prev_boundary(&s[2..4], 2), Ok(Some(3)));
/// assert_eq!(cursor.prev_boundary(&s[2..4], 2), Err(GraphemeIncomplete::PrevChunk));
/// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(2)));
/// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(1)));
/// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(0)));
/// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(None));
/// ```
pub fn prev_boundary(
&mut self,
chunk: &str,
chunk_start: usize,
) -> Result<Option<usize>, GraphemeIncomplete> {
if self.offset == 0 {
return Ok(None);
}
if self.offset == chunk_start {
return Err(GraphemeIncomplete::PrevChunk);
}
let mut iter = chunk[..self.offset - chunk_start].chars().rev();
let mut ch = iter.next().unwrap();
loop {
if self.offset == chunk_start {
self.resuming = true;
return Err(GraphemeIncomplete::PrevChunk);
}
if self.resuming {
self.cat_before = Some(self.grapheme_category(ch));
} else {
self.offset -= ch.len_utf8();
self.cat_after = self.cat_before.take();
self.state = GraphemeState::Unknown;
if let Some(ris_count) = self.ris_count {
self.ris_count = if ris_count > 0 {
Some(ris_count - 1)
} else {
None
};
}
if let Some(prev_ch) = iter.next() {
ch = prev_ch;
self.cat_before = Some(self.grapheme_category(ch));
} else if self.offset == 0 {
self.decide(true);
} else {
self.resuming = true;
self.cat_after = Some(self.grapheme_category(ch));
return Err(GraphemeIncomplete::PrevChunk);
}
}
self.resuming = true;
if self.is_boundary(chunk, chunk_start)? {
self.resuming = false;
return Ok(Some(self.offset));
}
self.resuming = false;
}
}
}
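// A sketch of one way to drive the chunk protocol end to end: feed
// fixed-size chunks, move forward on `NextChunk`, and replay earlier text on
// `PreContext`. The chunk size here is arbitrary, but the slicing below
// assumes chunk boundaries fall on char boundaries.
#[test]
fn test_grapheme_cursor_chunked_driver() {
    let s = "a\r\nb";
    let chunk_len = 2;
    let mut cursor = GraphemeCursor::new(0, s.len(), true);
    let mut boundaries = vec![0];
    let mut start = 0; // start offset of the chunk currently supplied
    loop {
        let end = core::cmp::min(start + chunk_len, s.len());
        match cursor.next_boundary(&s[start..end], start) {
            Ok(Some(boundary)) => boundaries.push(boundary),
            Ok(None) => break,
            Err(GraphemeIncomplete::NextChunk) => start = end,
            Err(GraphemeIncomplete::PreContext(offset)) => {
                // re-supply text ending at the requested offset
                cursor.provide_context(&s[..offset], 0);
            }
            Err(_) => unreachable!(),
        }
    }
    // "\r\n" spans the two chunks but still forms one cluster (GB3)
    assert_eq!(boundaries, [0, 1, 3, 4]);
}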
#[test]
fn test_grapheme_cursor_ris_precontext() {
let s = "\u{1f1fa}\u{1f1f8}\u{1f1fa}\u{1f1f8}\u{1f1fa}\u{1f1f8}";
let mut c = GraphemeCursor::new(8, s.len(), true);
assert_eq!(
c.is_boundary(&s[4..], 4),
Err(GraphemeIncomplete::PreContext(4))
);
c.provide_context(&s[..4], 0);
assert_eq!(c.is_boundary(&s[4..], 4), Ok(true));
}
#[test]
fn test_grapheme_cursor_chunk_start_require_precontext() {
let s = "\r\n";
let mut c = GraphemeCursor::new(1, s.len(), true);
assert_eq!(
c.is_boundary(&s[1..], 1),
Err(GraphemeIncomplete::PreContext(1))
);
c.provide_context(&s[..1], 0);
assert_eq!(c.is_boundary(&s[1..], 1), Ok(false));
}
#[test]
fn test_grapheme_cursor_prev_boundary() {
let s = "abcd";
let mut c = GraphemeCursor::new(3, s.len(), true);
assert_eq!(
c.prev_boundary(&s[2..], 2),
Err(GraphemeIncomplete::PrevChunk)
);
assert_eq!(c.prev_boundary(&s[..2], 0), Ok(Some(2)));
}
#[test]
fn test_grapheme_cursor_prev_boundary_chunk_start() {
let s = "abcd";
let mut c = GraphemeCursor::new(2, s.len(), true);
assert_eq!(
c.prev_boundary(&s[2..], 2),
Err(GraphemeIncomplete::PrevChunk)
);
assert_eq!(c.prev_boundary(&s[..2], 0), Ok(Some(1)));
}

View file

@@ -1,307 +0,0 @@
// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
//! Iterators which split strings on Grapheme Cluster, Word or Sentence boundaries, according
//! to the [Unicode Standard Annex #29](http://www.unicode.org/reports/tr29/) rules.
//!
//! ```rust
//! extern crate unicode_segmentation;
//!
//! use unicode_segmentation::UnicodeSegmentation;
//!
//! fn main() {
//! let s = "a̐éö̲\r\n";
//! let g = UnicodeSegmentation::graphemes(s, true).collect::<Vec<&str>>();
//! let b: &[_] = &["a̐", "é", "ö̲", "\r\n"];
//! assert_eq!(g, b);
//!
//! let s = "The quick (\"brown\") fox can't jump 32.3 feet, right?";
//! let w = s.unicode_words().collect::<Vec<&str>>();
//! let b: &[_] = &["The", "quick", "brown", "fox", "can't", "jump", "32.3", "feet", "right"];
//! assert_eq!(w, b);
//!
//! let s = "The quick (\"brown\") fox";
//! let w = s.split_word_bounds().collect::<Vec<&str>>();
//! let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox"];
//! assert_eq!(w, b);
//! }
//! ```
//!
//! # no_std
//!
//! unicode-segmentation does not depend on libstd, so it can be used in crates
//! with the `#![no_std]` attribute.
//!
//! # crates.io
//!
//! You can use this package in your project by adding the following
//! to your `Cargo.toml`:
//!
//! ```toml
//! [dependencies]
//! unicode-segmentation = "1.9.0"
//! ```
#![deny(missing_docs, unsafe_code)]
#![doc(
html_logo_url = "https://unicode-rs.github.io/unicode-rs_sm.png",
html_favicon_url = "https://unicode-rs.github.io/unicode-rs_sm.png"
)]
#![no_std]
#[cfg(test)]
#[macro_use]
extern crate std;
#[cfg(test)]
#[macro_use]
extern crate quickcheck;
pub use grapheme::{GraphemeCursor, GraphemeIncomplete};
pub use grapheme::{GraphemeIndices, Graphemes};
pub use sentence::{USentenceBoundIndices, USentenceBounds, UnicodeSentences};
pub use tables::UNICODE_VERSION;
pub use word::{UWordBoundIndices, UWordBounds, UnicodeWordIndices, UnicodeWords};
mod grapheme;
#[rustfmt::skip]
mod tables;
mod sentence;
mod word;
#[cfg(test)]
mod test;
#[cfg(test)]
mod testdata;
/// Methods for segmenting strings according to
/// [Unicode Standard Annex #29](http://www.unicode.org/reports/tr29/).
pub trait UnicodeSegmentation {
/// Returns an iterator over the [grapheme clusters][graphemes] of `self`.
///
/// [graphemes]: http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
///
/// If `is_extended` is true, the iterator is over the
/// *extended grapheme clusters*;
/// otherwise, the iterator is over the *legacy grapheme clusters*.
/// [UAX#29](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries)
/// recommends extended grapheme cluster boundaries for general processing.
///
/// # Examples
///
/// ```
/// # use self::unicode_segmentation::UnicodeSegmentation;
/// let gr1 = UnicodeSegmentation::graphemes("a\u{310}e\u{301}o\u{308}\u{332}", true)
/// .collect::<Vec<&str>>();
/// let b: &[_] = &["a\u{310}", "e\u{301}", "o\u{308}\u{332}"];
///
/// assert_eq!(&gr1[..], b);
///
/// let gr2 = UnicodeSegmentation::graphemes("a\r\nb🇷🇺🇸🇹", true).collect::<Vec<&str>>();
/// let b: &[_] = &["a", "\r\n", "b", "🇷🇺", "🇸🇹"];
///
/// assert_eq!(&gr2[..], b);
/// ```
fn graphemes<'a>(&'a self, is_extended: bool) -> Graphemes<'a>;
/// Returns an iterator over the grapheme clusters of `self` and their
/// byte offsets. See `graphemes()` for more information.
///
/// # Examples
///
/// ```
/// # use self::unicode_segmentation::UnicodeSegmentation;
/// let gr_inds = UnicodeSegmentation::grapheme_indices("a̐éö̲\r\n", true)
/// .collect::<Vec<(usize, &str)>>();
/// let b: &[_] = &[(0, "a̐"), (3, "é"), (6, "ö̲"), (11, "\r\n")];
///
/// assert_eq!(&gr_inds[..], b);
/// ```
fn grapheme_indices<'a>(&'a self, is_extended: bool) -> GraphemeIndices<'a>;
/// Returns an iterator over the words of `self`, separated on
/// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
///
/// Here, "words" are just those substrings which, after splitting on
/// UAX#29 word boundaries, contain any alphanumeric characters. That is, the
/// substring must contain at least one character with the
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
/// property, or with
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
///
/// # Example
///
/// ```
/// # use self::unicode_segmentation::UnicodeSegmentation;
/// let uws = "The quick (\"brown\") fox can't jump 32.3 feet, right?";
/// let uw1 = uws.unicode_words().collect::<Vec<&str>>();
/// let b: &[_] = &["The", "quick", "brown", "fox", "can't", "jump", "32.3", "feet", "right"];
///
/// assert_eq!(&uw1[..], b);
/// ```
fn unicode_words<'a>(&'a self) -> UnicodeWords<'a>;
/// Returns an iterator over the words of `self`, separated on
/// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), and their
/// offsets.
///
/// Here, "words" are just those substrings which, after splitting on
/// UAX#29 word boundaries, contain any alphanumeric characters. That is, the
/// substring must contain at least one character with the
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
/// property, or with
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
///
/// # Example
///
/// ```
/// # use self::unicode_segmentation::UnicodeSegmentation;
/// let uwis = "The quick (\"brown\") fox can't jump 32.3 feet, right?";
/// let uwi1 = uwis.unicode_word_indices().collect::<Vec<(usize, &str)>>();
/// let b: &[_] = &[(0, "The"), (4, "quick"), (12, "brown"), (20, "fox"), (24, "can't"),
/// (30, "jump"), (35, "32.3"), (40, "feet"), (46, "right")];
///
/// assert_eq!(&uwi1[..], b);
/// ```
fn unicode_word_indices<'a>(&'a self) -> UnicodeWordIndices<'a>;
/// Returns an iterator over substrings of `self` separated on
/// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
///
/// The concatenation of the substrings returned by this function is just the original string.
///
/// # Example
///
/// ```
/// # use self::unicode_segmentation::UnicodeSegmentation;
/// let swu1 = "The quick (\"brown\") fox".split_word_bounds().collect::<Vec<&str>>();
/// let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox"];
///
/// assert_eq!(&swu1[..], b);
/// ```
fn split_word_bounds<'a>(&'a self) -> UWordBounds<'a>;
/// Returns an iterator over substrings of `self`, split on UAX#29 word boundaries,
/// and their offsets. See `split_word_bounds()` for more information.
///
/// # Example
///
/// ```
/// # use self::unicode_segmentation::UnicodeSegmentation;
/// let swi1 = "Brr, it's 29.3°F!".split_word_bound_indices().collect::<Vec<(usize, &str)>>();
/// let b: &[_] = &[(0, "Brr"), (3, ","), (4, " "), (5, "it's"), (9, " "), (10, "29.3"),
/// (14, "°"), (16, "F"), (17, "!")];
///
/// assert_eq!(&swi1[..], b);
/// ```
fn split_word_bound_indices<'a>(&'a self) -> UWordBoundIndices<'a>;
/// Returns an iterator over substrings of `self` separated on
/// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
///
/// Here, "sentences" are just those substrings which, after splitting on
/// UAX#29 sentence boundaries, contain any alphanumeric characters. That is, the
/// substring must contain at least one character with the
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
/// property, or with
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
///
/// # Example
///
/// ```
/// # use self::unicode_segmentation::UnicodeSegmentation;
/// let uss = "Mr. Fox jumped. [...] The dog was too lazy.";
/// let us1 = uss.unicode_sentences().collect::<Vec<&str>>();
/// let b: &[_] = &["Mr. ", "Fox jumped. ", "The dog was too lazy."];
///
/// assert_eq!(&us1[..], b);
/// ```
fn unicode_sentences<'a>(&'a self) -> UnicodeSentences<'a>;
/// Returns an iterator over substrings of `self` separated on
/// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
///
/// The concatenation of the substrings returned by this function is just the original string.
///
/// # Example
///
/// ```
/// # use self::unicode_segmentation::UnicodeSegmentation;
/// let ssbs = "Mr. Fox jumped. [...] The dog was too lazy.";
/// let ssb1 = ssbs.split_sentence_bounds().collect::<Vec<&str>>();
/// let b: &[_] = &["Mr. ", "Fox jumped. ", "[...] ", "The dog was too lazy."];
///
/// assert_eq!(&ssb1[..], b);
/// ```
fn split_sentence_bounds<'a>(&'a self) -> USentenceBounds<'a>;
/// Returns an iterator over substrings of `self`, split on UAX#29 sentence boundaries,
/// and their offsets. See `split_sentence_bounds()` for more information.
///
/// # Example
///
/// ```
/// # use self::unicode_segmentation::UnicodeSegmentation;
/// let ssis = "Mr. Fox jumped. [...] The dog was too lazy.";
/// let ssi1 = ssis.split_sentence_bound_indices().collect::<Vec<(usize, &str)>>();
/// let b: &[_] = &[(0, "Mr. "), (4, "Fox jumped. "), (16, "[...] "),
/// (22, "The dog was too lazy.")];
///
/// assert_eq!(&ssi1[..], b);
/// ```
fn split_sentence_bound_indices<'a>(&'a self) -> USentenceBoundIndices<'a>;
}
impl UnicodeSegmentation for str {
#[inline]
fn graphemes(&self, is_extended: bool) -> Graphemes {
grapheme::new_graphemes(self, is_extended)
}
#[inline]
fn grapheme_indices(&self, is_extended: bool) -> GraphemeIndices {
grapheme::new_grapheme_indices(self, is_extended)
}
#[inline]
fn unicode_words(&self) -> UnicodeWords {
word::new_unicode_words(self)
}
#[inline]
fn unicode_word_indices(&self) -> UnicodeWordIndices {
word::new_unicode_word_indices(self)
}
#[inline]
fn split_word_bounds(&self) -> UWordBounds {
word::new_word_bounds(self)
}
#[inline]
fn split_word_bound_indices(&self) -> UWordBoundIndices {
word::new_word_bound_indices(self)
}
#[inline]
fn unicode_sentences(&self) -> UnicodeSentences {
sentence::new_unicode_sentences(self)
}
#[inline]
fn split_sentence_bounds(&self) -> USentenceBounds {
sentence::new_sentence_bounds(self)
}
#[inline]
fn split_sentence_bound_indices(&self) -> USentenceBoundIndices {
sentence::new_sentence_bound_indices(self)
}
}

View file

@@ -1,415 +0,0 @@
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use core::iter::Filter;
// All of the logic for forward iteration over sentences
mod fwd {
use crate::tables::sentence::SentenceCat;
use core::cmp;
// Describes one parsed part of the source string, per this table:
// https://unicode.org/reports/tr29/#Default_Sentence_Boundaries
#[derive(Clone, Copy, PartialEq, Eq)]
enum StatePart {
Sot,
Eot,
Other,
CR,
LF,
Sep,
ATerm,
UpperLower,
ClosePlus,
SpPlus,
STerm,
}
#[derive(Clone, PartialEq, Eq)]
struct SentenceBreaksState(pub [StatePart; 4]);
const INITIAL_STATE: SentenceBreaksState = SentenceBreaksState([
StatePart::Sot,
StatePart::Sot,
StatePart::Sot,
StatePart::Sot,
]);
#[derive(Clone)]
pub struct SentenceBreaks<'a> {
pub string: &'a str,
pos: usize,
state: SentenceBreaksState,
}
impl SentenceBreaksState {
// Attempt to advance the internal state by one part
// Whitespace and some punctuation will be collapsed
fn next(&self, cat: SentenceCat) -> SentenceBreaksState {
let &SentenceBreaksState(parts) = self;
let parts = match (parts[3], cat) {
(StatePart::ClosePlus, SentenceCat::SC_Close) => parts,
(StatePart::SpPlus, SentenceCat::SC_Sp) => parts,
_ => [
parts[1],
parts[2],
parts[3],
match cat {
SentenceCat::SC_CR => StatePart::CR,
SentenceCat::SC_LF => StatePart::LF,
SentenceCat::SC_Sep => StatePart::Sep,
SentenceCat::SC_ATerm => StatePart::ATerm,
SentenceCat::SC_Upper | SentenceCat::SC_Lower => StatePart::UpperLower,
SentenceCat::SC_Close => StatePart::ClosePlus,
SentenceCat::SC_Sp => StatePart::SpPlus,
SentenceCat::SC_STerm => StatePart::STerm,
_ => StatePart::Other,
},
],
};
SentenceBreaksState(parts)
}
fn end(&self) -> SentenceBreaksState {
let &SentenceBreaksState(parts) = self;
SentenceBreaksState([parts[1], parts[2], parts[3], StatePart::Eot])
}
// Helper function to check if state head matches a single `StatePart`
fn match1(&self, part: StatePart) -> bool {
let &SentenceBreaksState(parts) = self;
part == parts[3]
}
// Helper function to check if first two `StateParts` in state match
// the given two
fn match2(&self, part1: StatePart, part2: StatePart) -> bool {
let &SentenceBreaksState(parts) = self;
part1 == parts[2] && part2 == parts[3]
}
}
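// For example, feeding "c.) " one category at a time:
//   [Sot, Sot, Sot, Sot]
//     'c' -> [Sot, Sot, Sot, UpperLower]
//     '.' -> [Sot, Sot, UpperLower, ATerm]
//     ')' -> [Sot, UpperLower, ATerm, ClosePlus]
//     ' ' -> [UpperLower, ATerm, ClosePlus, SpPlus]
// Further Close or Sp characters collapse into the existing slot, so
// patterns such as SB8a's "SATerm Close* Sp*" always fit in the fixed
// four-slot window.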
// https://unicode.org/reports/tr29/#SB8
// TODO cache this, it is currently quadratic
fn match_sb8(state: &SentenceBreaksState, ahead: &str) -> bool {
let &SentenceBreaksState(parts) = state;
let mut idx = if parts[3] == StatePart::SpPlus { 2 } else { 3 };
if parts[idx] == StatePart::ClosePlus {
idx -= 1
}
if parts[idx] == StatePart::ATerm {
use crate::tables::sentence as se;
for next_char in ahead.chars() {
// ( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )* Lower
match se::sentence_category(next_char).2 {
se::SC_Lower => return true,
se::SC_OLetter
| se::SC_Upper
| se::SC_Sep
| se::SC_CR
| se::SC_LF
| se::SC_STerm
| se::SC_ATerm => return false,
_ => continue,
}
}
}
false
}
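// For example, in "etc. the" the lowercase 't' reached by this lookahead
// suppresses the break after the ATerm, while in "etc. The" the uppercase
// 'T' returns false here and SB11 then ends the sentence.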
// https://unicode.org/reports/tr29/#SB8a
fn match_sb8a(state: &SentenceBreaksState) -> bool {
// SATerm Close* Sp*
let &SentenceBreaksState(parts) = state;
let mut idx = if parts[3] == StatePart::SpPlus { 2 } else { 3 };
if parts[idx] == StatePart::ClosePlus {
idx -= 1
}
parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm
}
// https://unicode.org/reports/tr29/#SB9
fn match_sb9(state: &SentenceBreaksState) -> bool {
// SATerm Close*
let &SentenceBreaksState(parts) = state;
let idx = if parts[3] == StatePart::ClosePlus {
2
} else {
3
};
parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm
}
// https://unicode.org/reports/tr29/#SB11
fn match_sb11(state: &SentenceBreaksState) -> bool {
// SATerm Close* Sp* ParaSep?
let &SentenceBreaksState(parts) = state;
let mut idx = match parts[3] {
StatePart::Sep | StatePart::CR | StatePart::LF => 2,
_ => 3,
};
if parts[idx] == StatePart::SpPlus {
idx -= 1
}
if parts[idx] == StatePart::ClosePlus {
idx -= 1
}
parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm
}
impl<'a> Iterator for SentenceBreaks<'a> {
// Returns the index of the character which follows a break
type Item = usize;
#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
let slen = self.string.len();
// A sentence could be one character
(cmp::min(slen, 2), Some(slen + 1))
}
#[inline]
fn next(&mut self) -> Option<usize> {
use crate::tables::sentence as se;
for next_char in self.string[self.pos..].chars() {
let position_before = self.pos;
let state_before = self.state.clone();
let next_cat = se::sentence_category(next_char).2;
self.pos += next_char.len_utf8();
self.state = self.state.next(next_cat);
match next_cat {
// SB1 https://unicode.org/reports/tr29/#SB1
_ if state_before.match1(StatePart::Sot) => return Some(position_before),
// SB2 is handled when inner iterator (chars) is finished
// SB3 https://unicode.org/reports/tr29/#SB3
SentenceCat::SC_LF if state_before.match1(StatePart::CR) => continue,
// SB4 https://unicode.org/reports/tr29/#SB4
_ if state_before.match1(StatePart::Sep)
|| state_before.match1(StatePart::CR)
|| state_before.match1(StatePart::LF) =>
{
return Some(position_before)
}
// SB5 https://unicode.org/reports/tr29/#SB5
SentenceCat::SC_Extend | SentenceCat::SC_Format => self.state = state_before,
// SB6 https://unicode.org/reports/tr29/#SB6
SentenceCat::SC_Numeric if state_before.match1(StatePart::ATerm) => continue,
// SB7 https://unicode.org/reports/tr29/#SB7
SentenceCat::SC_Upper
if state_before.match2(StatePart::UpperLower, StatePart::ATerm) =>
{
continue
}
// SB8 https://unicode.org/reports/tr29/#SB8
_ if match_sb8(&state_before, &self.string[position_before..]) => continue,
// SB8a https://unicode.org/reports/tr29/#SB8a
SentenceCat::SC_SContinue | SentenceCat::SC_STerm | SentenceCat::SC_ATerm
if match_sb8a(&state_before) =>
{
continue
}
// SB9 https://unicode.org/reports/tr29/#SB9
SentenceCat::SC_Close
| SentenceCat::SC_Sp
| SentenceCat::SC_Sep
| SentenceCat::SC_CR
| SentenceCat::SC_LF
if match_sb9(&state_before) =>
{
continue
}
// SB10 https://unicode.org/reports/tr29/#SB10
SentenceCat::SC_Sp
| SentenceCat::SC_Sep
| SentenceCat::SC_CR
| SentenceCat::SC_LF
if match_sb8a(&state_before) =>
{
continue
}
// SB11 https://unicode.org/reports/tr29/#SB11
_ if match_sb11(&state_before) => return Some(position_before),
// SB998 https://unicode.org/reports/tr29/#SB998
_ => continue,
}
}
// SB2 https://unicode.org/reports/tr29/#SB2
if self.state.match1(StatePart::Sot) {
None
} else if self.state.match1(StatePart::Eot) {
None
} else {
self.state = self.state.end();
Some(self.pos)
}
}
}
pub fn new_sentence_breaks<'a>(source: &'a str) -> SentenceBreaks<'a> {
SentenceBreaks {
string: source,
pos: 0,
state: INITIAL_STATE,
}
}
}
/// An iterator over the substrings of a string which, after splitting the string on
/// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries),
/// contain any characters with the
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
/// property, or with
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
///
/// This struct is created by the [`unicode_sentences`] method on the [`UnicodeSegmentation`]
/// trait. See its documentation for more.
///
/// [`unicode_sentences`]: trait.UnicodeSegmentation.html#tymethod.unicode_sentences
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Clone)]
pub struct UnicodeSentences<'a> {
inner: Filter<USentenceBounds<'a>, fn(&&str) -> bool>,
}
/// External iterator for a string's
/// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
///
/// This struct is created by the [`split_sentence_bounds`] method on the [`UnicodeSegmentation`]
/// trait. See its documentation for more.
///
/// [`split_sentence_bounds`]: trait.UnicodeSegmentation.html#tymethod.split_sentence_bounds
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Clone)]
pub struct USentenceBounds<'a> {
iter: fwd::SentenceBreaks<'a>,
sentence_start: Option<usize>,
}
/// External iterator for sentence boundaries and byte offsets.
///
/// This struct is created by the [`split_sentence_bound_indices`] method on the
/// [`UnicodeSegmentation`] trait. See its documentation for more.
///
/// [`split_sentence_bound_indices`]: trait.UnicodeSegmentation.html#tymethod.split_sentence_bound_indices
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Clone)]
pub struct USentenceBoundIndices<'a> {
start_offset: usize,
iter: USentenceBounds<'a>,
}
#[inline]
pub fn new_sentence_bounds<'a>(source: &'a str) -> USentenceBounds<'a> {
USentenceBounds {
iter: fwd::new_sentence_breaks(source),
sentence_start: None,
}
}
#[inline]
pub fn new_sentence_bound_indices<'a>(source: &'a str) -> USentenceBoundIndices<'a> {
USentenceBoundIndices {
start_offset: source.as_ptr() as usize,
iter: new_sentence_bounds(source),
}
}
#[inline]
pub fn new_unicode_sentences<'b>(s: &'b str) -> UnicodeSentences<'b> {
use super::UnicodeSegmentation;
use crate::tables::util::is_alphanumeric;
fn has_alphanumeric(s: &&str) -> bool {
s.chars().any(|c| is_alphanumeric(c))
}
let has_alphanumeric: fn(&&str) -> bool = has_alphanumeric; // coerce to fn pointer
UnicodeSentences {
inner: s.split_sentence_bounds().filter(has_alphanumeric),
}
}
impl<'a> Iterator for UnicodeSentences<'a> {
type Item = &'a str;
#[inline]
fn next(&mut self) -> Option<&'a str> {
self.inner.next()
}
}
impl<'a> Iterator for USentenceBounds<'a> {
type Item = &'a str;
#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
let (lower, upper) = self.iter.size_hint();
// the inner hints may be 0, so saturate to avoid usize underflow
(lower.saturating_sub(1), upper.map(|u| u.saturating_sub(1)))
}
#[inline]
fn next(&mut self) -> Option<&'a str> {
if self.sentence_start.is_none() {
if let Some(start_pos) = self.iter.next() {
self.sentence_start = Some(start_pos)
} else {
return None;
}
}
if let Some(break_pos) = self.iter.next() {
let start_pos = self.sentence_start.unwrap();
let sentence = &self.iter.string[start_pos..break_pos];
self.sentence_start = Some(break_pos);
Some(sentence)
} else {
None
}
}
}
impl<'a> Iterator for USentenceBoundIndices<'a> {
type Item = (usize, &'a str);
#[inline]
fn next(&mut self) -> Option<(usize, &'a str)> {
self.iter
.next()
.map(|s| (s.as_ptr() as usize - self.start_offset, s))
}
#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
self.iter.size_hint()
}
}

Diff not shown because of its large size.

View file

@@ -1,247 +0,0 @@
// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use super::UnicodeSegmentation;
use std::prelude::v1::*;
#[test]
fn test_graphemes() {
use crate::testdata::{TEST_DIFF, TEST_SAME};
pub const EXTRA_DIFF: &'static [(
&'static str,
&'static [&'static str],
&'static [&'static str],
)] = &[
// Official test suite doesn't include two Prepend chars between two other chars.
(
"\u{20}\u{600}\u{600}\u{20}",
&["\u{20}", "\u{600}\u{600}\u{20}"],
&["\u{20}", "\u{600}", "\u{600}", "\u{20}"],
),
// Test for Prepend followed by two Any chars
(
"\u{600}\u{20}\u{20}",
&["\u{600}\u{20}", "\u{20}"],
&["\u{600}", "\u{20}", "\u{20}"],
),
];
pub const EXTRA_SAME: &'static [(&'static str, &'static [&'static str])] = &[
// family emoji (more than two emoji joined by ZWJ)
(
"\u{1f468}\u{200d}\u{1f467}\u{200d}\u{1f466}",
&["\u{1f468}\u{200d}\u{1f467}\u{200d}\u{1f466}"],
),
// cartwheel emoji followed by two fitzpatrick skin tone modifiers
// (test case from issue #19)
(
"\u{1F938}\u{1F3FE}\u{1F3FE}",
&["\u{1F938}\u{1F3FE}\u{1F3FE}"],
),
];
for &(s, g) in TEST_SAME.iter().chain(EXTRA_SAME) {
// test forward iterator
assert!(UnicodeSegmentation::graphemes(s, true).eq(g.iter().cloned()));
assert!(UnicodeSegmentation::graphemes(s, false).eq(g.iter().cloned()));
// test reverse iterator
assert!(UnicodeSegmentation::graphemes(s, true)
.rev()
.eq(g.iter().rev().cloned()));
assert!(UnicodeSegmentation::graphemes(s, false)
.rev()
.eq(g.iter().rev().cloned()));
}
for &(s, gt, gf) in TEST_DIFF.iter().chain(EXTRA_DIFF) {
// test forward iterator
assert!(UnicodeSegmentation::graphemes(s, true).eq(gt.iter().cloned()));
assert!(UnicodeSegmentation::graphemes(s, false).eq(gf.iter().cloned()));
// test reverse iterator
assert!(UnicodeSegmentation::graphemes(s, true)
.rev()
.eq(gt.iter().rev().cloned()));
assert!(UnicodeSegmentation::graphemes(s, false)
.rev()
.eq(gf.iter().rev().cloned()));
}
// test the indices iterators
let s = "a̐éö̲\r\n";
let gr_inds = UnicodeSegmentation::grapheme_indices(s, true).collect::<Vec<(usize, &str)>>();
let b: &[_] = &[(0, ""), (3, ""), (6, "ö̲"), (11, "\r\n")];
assert_eq!(gr_inds, b);
let gr_inds = UnicodeSegmentation::grapheme_indices(s, true)
.rev()
.collect::<Vec<(usize, &str)>>();
let b: &[_] = &[(11, "\r\n"), (6, "ö̲"), (3, ""), (0, "")];
assert_eq!(gr_inds, b);
let mut gr_inds_iter = UnicodeSegmentation::grapheme_indices(s, true);
{
let gr_inds = gr_inds_iter.by_ref();
let e1 = gr_inds.size_hint();
assert_eq!(e1, (1, Some(13)));
let c = gr_inds.count();
assert_eq!(c, 4);
}
let e2 = gr_inds_iter.size_hint();
assert_eq!(e2, (0, Some(0)));
// make sure the reverse iterator does the right thing with "\n" at beginning of string
let s = "\n\r\n\r";
let gr = UnicodeSegmentation::graphemes(s, true)
.rev()
.collect::<Vec<&str>>();
let b: &[_] = &["\r", "\r\n", "\n"];
assert_eq!(gr, b);
}
#[test]
fn test_words() {
use crate::testdata::TEST_WORD;
// Unicode's official tests don't really test longer chains of flag emoji
// TODO This could be improved with more tests like flag emoji with interspersed Extend chars and ZWJ
const EXTRA_TESTS: &'static [(&'static str, &'static [&'static str])] = &[
(
"🇦🇫🇦🇽🇦🇱🇩🇿🇦🇸🇦🇩🇦🇴",
&["🇦🇫", "🇦🇽", "🇦🇱", "🇩🇿", "🇦🇸", "🇦🇩", "🇦🇴"],
),
("🇦🇫🇦🇽🇦🇱🇩🇿🇦🇸🇦🇩🇦", &["🇦🇫", "🇦🇽", "🇦🇱", "🇩🇿", "🇦🇸", "🇦🇩", "🇦"]),
(
"🇦a🇫🇦🇽a🇦🇱🇩🇿🇦🇸🇦🇩🇦",
&["🇦", "a", "🇫🇦", "🇽", "a", "🇦🇱", "🇩🇿", "🇦🇸", "🇦🇩", "🇦"],
),
(
"\u{1f468}\u{200d}\u{1f468}\u{200d}\u{1f466}",
&["\u{1f468}\u{200d}\u{1f468}\u{200d}\u{1f466}"],
),
("😌👎🏼", &["😌", "👎🏼"]),
// perhaps wrong, spaces should not be included?
("hello world", &["hello", " ", "world"]),
("🇨🇦🇨🇭🇿🇲🇿 hi", &["🇨🇦", "🇨🇭", "🇿🇲", "🇿", " ", "hi"]),
];
for &(s, w) in TEST_WORD.iter().chain(EXTRA_TESTS.iter()) {
macro_rules! assert_ {
($test:expr, $exp:expr, $name:expr) => {
// collect into vector for better diagnostics in failure case
let testing = $test.collect::<Vec<_>>();
let expected = $exp.collect::<Vec<_>>();
assert_eq!(
testing, expected,
"{} test for testcase ({:?}, {:?}) failed.",
$name, s, w
)
};
}
// test forward iterator
assert_!(
s.split_word_bounds(),
w.iter().cloned(),
"Forward word boundaries"
);
// test reverse iterator
assert_!(
s.split_word_bounds().rev(),
w.iter().rev().cloned(),
"Reverse word boundaries"
);
// generate offsets from word string lengths
let mut indices = vec![0];
for i in w.iter().cloned().map(|s| s.len()).scan(0, |t, n| {
*t += n;
Some(*t)
}) {
indices.push(i);
}
indices.pop();
let indices = indices;
// test forward indices iterator
assert_!(
s.split_word_bound_indices().map(|(l, _)| l),
indices.iter().cloned(),
"Forward word indices"
);
// test backward indices iterator
assert_!(
s.split_word_bound_indices().rev().map(|(l, _)| l),
indices.iter().rev().cloned(),
"Reverse word indices"
);
}
}
#[test]
fn test_sentences() {
use crate::testdata::TEST_SENTENCE;
for &(s, w) in TEST_SENTENCE.iter() {
macro_rules! assert_ {
($test:expr, $exp:expr, $name:expr) => {
// collect into vector for better diagnostics in failure case
let testing = $test.collect::<Vec<_>>();
let expected = $exp.collect::<Vec<_>>();
assert_eq!(
testing, expected,
"{} test for testcase ({:?}, {:?}) failed.",
$name, s, w
)
};
}
assert_!(
s.split_sentence_bounds(),
w.iter().cloned(),
"Forward sentence boundaries"
);
}
}
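// An illustrative check of the SB8 lookahead (not from the official test
// data): a lowercase continuation keeps an abbreviation-style period from
// ending the sentence, while an uppercase start allows the break.
#[test]
fn test_sentences_sb8_lowercase_continuation() {
    let s = "He owns dogs, cats, etc. and more. The end.";
    let sentences = s.split_sentence_bounds().collect::<Vec<&str>>();
    let b: &[_] = &["He owns dogs, cats, etc. and more. ", "The end."];
    assert_eq!(sentences, b);
}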
quickcheck! {
fn quickcheck_forward_reverse_graphemes_extended(s: String) -> bool {
let a = s.graphemes(true).collect::<Vec<_>>();
let mut b = s.graphemes(true).rev().collect::<Vec<_>>();
b.reverse();
a == b
}
fn quickcheck_forward_reverse_graphemes_legacy(s: String) -> bool {
let a = s.graphemes(false).collect::<Vec<_>>();
let mut b = s.graphemes(false).rev().collect::<Vec<_>>();
b.reverse();
a == b
}
fn quickcheck_join_graphemes(s: String) -> bool {
let a = s.graphemes(true).collect::<String>();
let b = s.graphemes(false).collect::<String>();
a == s && b == s
}
fn quickcheck_forward_reverse_words(s: String) -> bool {
let a = s.split_word_bounds().collect::<Vec<_>>();
let mut b = s.split_word_bounds().rev().collect::<Vec<_>>();
b.reverse();
a == b
}
fn quickcheck_join_words(s: String) -> bool {
let a = s.split_word_bounds().collect::<String>();
a == s
}
}

Diff not shown because of its large size.

View file

@@ -1,754 +0,0 @@
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use core::cmp;
use core::iter::Filter;
use crate::tables::word::WordCat;
/// An iterator over the substrings of a string which, after splitting the string on
/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries),
/// contain any characters with the
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
/// property, or with
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
///
/// This struct is created by the [`unicode_words`] method on the [`UnicodeSegmentation`] trait. See
/// its documentation for more.
///
/// [`unicode_words`]: trait.UnicodeSegmentation.html#tymethod.unicode_words
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
pub struct UnicodeWords<'a> {
inner: Filter<UWordBounds<'a>, fn(&&str) -> bool>,
}
impl<'a> Iterator for UnicodeWords<'a> {
type Item = &'a str;
#[inline]
fn next(&mut self) -> Option<&'a str> {
self.inner.next()
}
}
impl<'a> DoubleEndedIterator for UnicodeWords<'a> {
#[inline]
fn next_back(&mut self) -> Option<&'a str> {
self.inner.next_back()
}
}
/// An iterator over the substrings of a string which, after splitting the string on
/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries),
/// contain any characters with the
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
/// property, or with
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
/// This iterator also provides the byte offsets for each substring.
///
/// This struct is created by the [`unicode_word_indices`] method on the [`UnicodeSegmentation`] trait. See
/// its documentation for more.
///
/// [`unicode_word_indices`]: trait.UnicodeSegmentation.html#tymethod.unicode_word_indices
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
pub struct UnicodeWordIndices<'a> {
inner: Filter<UWordBoundIndices<'a>, fn(&(usize, &str)) -> bool>,
}
impl<'a> Iterator for UnicodeWordIndices<'a> {
type Item = (usize, &'a str);
#[inline]
fn next(&mut self) -> Option<(usize, &'a str)> {
self.inner.next()
}
}
impl<'a> DoubleEndedIterator for UnicodeWordIndices<'a> {
#[inline]
fn next_back(&mut self) -> Option<(usize, &'a str)> {
self.inner.next_back()
}
}
/// External iterator for a string's
/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
///
/// This struct is created by the [`split_word_bounds`] method on the [`UnicodeSegmentation`]
/// trait. See its documentation for more.
///
/// [`split_word_bounds`]: trait.UnicodeSegmentation.html#tymethod.split_word_bounds
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Clone)]
pub struct UWordBounds<'a> {
string: &'a str,
cat: Option<WordCat>,
catb: Option<WordCat>,
}
/// External iterator for word boundaries and byte offsets.
///
/// This struct is created by the [`split_word_bound_indices`] method on the
/// [`UnicodeSegmentation`] trait. See its documentation for more.
///
/// [`split_word_bound_indices`]: trait.UnicodeSegmentation.html#tymethod.split_word_bound_indices
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Clone)]
pub struct UWordBoundIndices<'a> {
start_offset: usize,
iter: UWordBounds<'a>,
}
impl<'a> UWordBoundIndices<'a> {
#[inline]
/// View the underlying data (the part yet to be iterated) as a slice of the original string.
///
/// ```rust
/// # use unicode_segmentation::UnicodeSegmentation;
/// let mut iter = "Hello world".split_word_bound_indices();
/// assert_eq!(iter.as_str(), "Hello world");
/// iter.next();
/// assert_eq!(iter.as_str(), " world");
/// iter.next();
/// assert_eq!(iter.as_str(), "world");
/// ```
pub fn as_str(&self) -> &'a str {
self.iter.as_str()
}
}
impl<'a> Iterator for UWordBoundIndices<'a> {
type Item = (usize, &'a str);
#[inline]
fn next(&mut self) -> Option<(usize, &'a str)> {
self.iter
.next()
.map(|s| (s.as_ptr() as usize - self.start_offset, s))
}
#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
self.iter.size_hint()
}
}
impl<'a> DoubleEndedIterator for UWordBoundIndices<'a> {
#[inline]
fn next_back(&mut self) -> Option<(usize, &'a str)> {
self.iter
.next_back()
.map(|s| (s.as_ptr() as usize - self.start_offset, s))
}
}
// state machine for word boundary rules
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum UWordBoundsState {
Start,
Letter,
HLetter,
Numeric,
Katakana,
ExtendNumLet,
Regional(RegionalState),
FormatExtend(FormatExtendType),
Zwj,
Emoji,
WSegSpace,
}
// subtypes for FormatExtend state in UWordBoundsState
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum FormatExtendType {
AcceptAny,
AcceptNone,
RequireLetter,
RequireHLetter,
AcceptQLetter,
RequireNumeric,
}
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum RegionalState {
Half,
Full,
Unknown,
}
fn is_emoji(ch: char) -> bool {
use crate::tables::emoji;
emoji::emoji_category(ch).2 == emoji::EmojiCat::EC_Extended_Pictographic
}
impl<'a> Iterator for UWordBounds<'a> {
type Item = &'a str;
#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
let slen = self.string.len();
(cmp::min(slen, 1), Some(slen))
}
#[inline]
fn next(&mut self) -> Option<&'a str> {
use self::FormatExtendType::*;
use self::UWordBoundsState::*;
use crate::tables::word as wd;
if self.string.len() == 0 {
return None;
}
let mut take_curr = true;
let mut take_cat = true;
let mut idx = 0;
let mut saveidx = 0;
let mut state = Start;
let mut cat = wd::WC_Any;
let mut savecat = wd::WC_Any;
// If extend/format/zwj were skipped. Handles precedence of WB3d over WB4
let mut skipped_format_extend = false;
for (curr, ch) in self.string.char_indices() {
idx = curr;
// Whether or not the previous category was ZWJ
// ZWJs get collapsed, so this handles precedence of WB3c over WB4
let prev_zwj = cat == wd::WC_ZWJ;
// if there's a category cached, grab it
cat = match self.cat {
None => wd::word_category(ch).2,
_ => self.cat.take().unwrap(),
};
take_cat = true;
// handle rule WB4
// just skip all format, extend, and zwj chars
// note that Start is a special case: if there's a bunch of Format | Extend
// characters at the beginning of a block of text, dump them out as one unit.
//
// (This is not obvious from the wording of UAX#29, but if you look at the
// test cases http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt
// then the "correct" interpretation of WB4 becomes apparent.)
if state != Start {
match cat {
wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => {
skipped_format_extend = true;
continue;
}
_ => {}
}
}
// rule WB3c
// WB4 makes all ZWJs collapse into the previous state
// but you can still be in a Zwj state if you started with Zwj
//
// This means that an EP + ZWJ would collapse into EP, which is wrong,
// since EP+EP is a boundary but EP+ZWJ+EP is not
//
// Thus, we separately keep track of whether or not the last character
// was a ZWJ. This is an additional bit of state tracked outside of the
// state enum; the state enum represents the last non-zwj state encountered.
// When prev_zwj is true, for the purposes of WB3c, we are in the Zwj state,
// however we are in the previous state for the purposes of all other rules.
if prev_zwj {
if is_emoji(ch) {
state = Emoji;
continue;
}
}
// Don't use `continue` in this match without updating `cat`
state = match state {
Start if cat == wd::WC_CR => {
idx += match self.get_next_cat(idx) {
Some(ncat) if ncat == wd::WC_LF => 1, // rule WB3
_ => 0,
};
break; // rule WB3a
}
Start => match cat {
wd::WC_ALetter => Letter, // rule WB5, WB6, WB9, WB13a
wd::WC_Hebrew_Letter => HLetter, // rule WB5, WB6, WB7a, WB7b, WB9, WB13a
wd::WC_Numeric => Numeric, // rule WB8, WB10, WB12, WB13a
wd::WC_Katakana => Katakana, // rule WB13, WB13a
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a, WB13b
wd::WC_Regional_Indicator => Regional(RegionalState::Half), // rule WB13c
wd::WC_LF | wd::WC_Newline => break, // rule WB3a
wd::WC_ZWJ => Zwj, // rule WB3c
wd::WC_WSegSpace => WSegSpace, // rule WB3d
_ => {
if let Some(ncat) = self.get_next_cat(idx) {
// rule WB4
if ncat == wd::WC_Format || ncat == wd::WC_Extend || ncat == wd::WC_ZWJ
{
state = FormatExtend(AcceptNone);
self.cat = Some(ncat);
continue;
}
}
break; // rule WB999
}
},
WSegSpace => match cat {
wd::WC_WSegSpace if !skipped_format_extend => WSegSpace,
_ => {
take_curr = false;
break;
}
},
Zwj => {
// We already handle WB3c above.
take_curr = false;
break;
}
Letter | HLetter => match cat {
wd::WC_ALetter => Letter, // rule WB5
wd::WC_Hebrew_Letter => HLetter, // rule WB5
wd::WC_Numeric => Numeric, // rule WB9
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
wd::WC_Double_Quote if state == HLetter => {
savecat = cat;
saveidx = idx;
FormatExtend(RequireHLetter) // rule WB7b
}
wd::WC_Single_Quote if state == HLetter => {
FormatExtend(AcceptQLetter) // rule WB7a
}
wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => {
savecat = cat;
saveidx = idx;
FormatExtend(RequireLetter) // rule WB6
}
_ => {
take_curr = false;
break;
}
},
Numeric => match cat {
wd::WC_Numeric => Numeric, // rule WB8
wd::WC_ALetter => Letter, // rule WB10
wd::WC_Hebrew_Letter => HLetter, // rule WB10
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => {
savecat = cat;
saveidx = idx;
FormatExtend(RequireNumeric) // rule WB12
}
_ => {
take_curr = false;
break;
}
},
Katakana => match cat {
wd::WC_Katakana => Katakana, // rule WB13
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
_ => {
take_curr = false;
break;
}
},
ExtendNumLet => match cat {
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
wd::WC_ALetter => Letter, // rule WB13b
wd::WC_Hebrew_Letter => HLetter, // rule WB13b
wd::WC_Numeric => Numeric, // rule WB13b
wd::WC_Katakana => Katakana, // rule WB13b
_ => {
take_curr = false;
break;
}
},
Regional(RegionalState::Full) => {
// if it reaches here we've gone too far:
// a full flag can only compose with ZWJ/Extend/Format
// following it.
take_curr = false;
break;
}
Regional(RegionalState::Half) => match cat {
wd::WC_Regional_Indicator => Regional(RegionalState::Full), // rule WB13c
_ => {
take_curr = false;
break;
}
},
Regional(_) => {
unreachable!("RegionalState::Unknown should not occur on forward iteration")
}
Emoji => {
// We already handle WB3c above. If you've reached this point, the emoji sequence is over.
take_curr = false;
break;
}
FormatExtend(t) => match t {
// handle FormatExtends depending on what type
RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB11
RequireLetter | AcceptQLetter if cat == wd::WC_ALetter => Letter, // rule WB7
RequireLetter | AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // WB7a
RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7b
AcceptNone | AcceptQLetter => {
take_curr = false; // emit all the Format|Extend characters
take_cat = false;
break;
}
_ => break, // rewind (in if statement below)
},
}
}
if let FormatExtend(t) = state {
// we were looking for something and didn't find it; we have to back up
if t == RequireLetter || t == RequireHLetter || t == RequireNumeric {
idx = saveidx;
cat = savecat;
take_curr = false;
}
}
self.cat = if take_curr {
idx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
None
} else if take_cat {
Some(cat)
} else {
None
};
let retstr = &self.string[..idx];
self.string = &self.string[idx..];
Some(retstr)
}
}
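// For example (mirroring the tests): WB13c pairs regional indicators into
// flags and WB3c keeps ZWJ emoji sequences together, so "🇨🇦🇨🇭🇿🇲🇿 hi"
// splits into ["🇨🇦", "🇨🇭", "🇿🇲", "🇿", " ", "hi"] and
// "\u{1f468}\u{200d}\u{1f468}\u{200d}\u{1f466}" stays a single segment.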
impl<'a> DoubleEndedIterator for UWordBounds<'a> {
#[inline]
fn next_back(&mut self) -> Option<&'a str> {
use self::FormatExtendType::*;
use self::UWordBoundsState::*;
use crate::tables::word as wd;
if self.string.len() == 0 {
return None;
}
let mut take_curr = true;
let mut take_cat = true;
let mut idx = self.string.len();
idx -= self.string.chars().next_back().unwrap().len_utf8();
let mut previdx = idx;
let mut saveidx = idx;
let mut state = Start;
let mut savestate = Start;
let mut cat = wd::WC_Any;
let mut skipped_format_extend = false;
for (curr, ch) in self.string.char_indices().rev() {
previdx = idx;
idx = curr;
// if there's a category cached, grab it
cat = match self.catb {
None => wd::word_category(ch).2,
_ => self.catb.take().unwrap(),
};
take_cat = true;
// backward iterator over word boundaries. Mostly the same as the forward
// iterator, with two weirdnesses:
// (1) If we encounter a single quote in the Start state, we have to check for a
// Hebrew Letter immediately before it.
// (2) Format and Extend char handling takes some gymnastics.
if cat == wd::WC_Extend || cat == wd::WC_Format || (cat == wd::WC_ZWJ && state != Zwj) {
// WB3c has more priority so we should not
// fold in that case
if match state {
FormatExtend(_) | Start => false,
_ => true,
} {
saveidx = previdx;
savestate = state;
state = FormatExtend(AcceptNone);
}
if state != Start {
continue;
}
} else if state == FormatExtend(AcceptNone) {
// finished a scan of some Format|Extend chars, restore previous state
state = savestate;
previdx = saveidx;
take_cat = false;
skipped_format_extend = true;
}
// Don't use `continue` in this match without updating `catb`
state = match state {
Start | FormatExtend(AcceptAny) => match cat {
_ if is_emoji(ch) => Zwj,
wd::WC_ALetter => Letter, // rule WB5, WB7, WB10, WB13b
wd::WC_Hebrew_Letter => HLetter, // rule WB5, WB7, WB7c, WB10, WB13b
wd::WC_Numeric => Numeric, // rule WB8, WB9, WB11, WB13b
wd::WC_Katakana => Katakana, // rule WB13, WB13b
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
wd::WC_Regional_Indicator => Regional(RegionalState::Unknown), // rule WB13c
// rule WB4:
wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => FormatExtend(AcceptAny),
wd::WC_Single_Quote => {
saveidx = idx;
FormatExtend(AcceptQLetter) // rule WB7a
}
wd::WC_WSegSpace => WSegSpace,
wd::WC_CR | wd::WC_LF | wd::WC_Newline => {
if state == Start {
if cat == wd::WC_LF {
idx -= match self.get_prev_cat(idx) {
Some(pcat) if pcat == wd::WC_CR => 1, // rule WB3
_ => 0,
};
}
} else {
take_curr = false;
}
break; // rule WB3a
}
_ => break, // rule WB999
},
Zwj => match cat {
// rule WB3c
wd::WC_ZWJ => FormatExtend(AcceptAny),
_ => {
take_curr = false;
break;
}
},
WSegSpace => match cat {
// rule WB3d
wd::WC_WSegSpace if !skipped_format_extend => WSegSpace,
_ => {
take_curr = false;
break;
}
},
Letter | HLetter => match cat {
wd::WC_ALetter => Letter, // rule WB5
wd::WC_Hebrew_Letter => HLetter, // rule WB5
wd::WC_Numeric => Numeric, // rule WB10
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
wd::WC_Double_Quote if state == HLetter => {
saveidx = previdx;
FormatExtend(RequireHLetter) // rule WB7c
}
wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => {
saveidx = previdx;
FormatExtend(RequireLetter) // rule WB7
}
_ => {
take_curr = false;
break;
}
},
Numeric => match cat {
wd::WC_Numeric => Numeric, // rule WB8
wd::WC_ALetter => Letter, // rule WB9
wd::WC_Hebrew_Letter => HLetter, // rule WB9
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => {
saveidx = previdx;
FormatExtend(RequireNumeric) // rule WB11
}
_ => {
take_curr = false;
break;
}
},
Katakana => match cat {
wd::WC_Katakana => Katakana, // rule WB13
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
_ => {
take_curr = false;
break;
}
},
ExtendNumLet => match cat {
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
wd::WC_ALetter => Letter, // rule WB13a
wd::WC_Hebrew_Letter => HLetter, // rule WB13a
wd::WC_Numeric => Numeric, // rule WB13a
wd::WC_Katakana => Katakana, // rule WB13a
_ => {
take_curr = false;
break;
}
},
Regional(mut regional_state) => match cat {
// rule WB13c
wd::WC_Regional_Indicator => {
if regional_state == RegionalState::Unknown {
let count = self.string[..previdx]
.chars()
.rev()
.map(|c| wd::word_category(c).2)
.filter(|&c| {
!(c == wd::WC_ZWJ || c == wd::WC_Extend || c == wd::WC_Format)
})
.take_while(|&c| c == wd::WC_Regional_Indicator)
.count();
regional_state = if count % 2 == 0 {
RegionalState::Full
} else {
RegionalState::Half
};
}
if regional_state == RegionalState::Full {
take_curr = false;
break;
} else {
Regional(RegionalState::Full)
}
}
_ => {
take_curr = false;
break;
}
},
Emoji => {
if is_emoji(ch) {
// rule WB3c
Zwj
} else {
take_curr = false;
break;
}
}
FormatExtend(t) => match t {
RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB12
RequireLetter if cat == wd::WC_ALetter => Letter, // rule WB6
RequireLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB6
AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7a
RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7b
_ => break, // backtrack happens in the if-let below
},
}
}
if let FormatExtend(t) = state {
// if we required something but didn't find it, backtrack
if t == RequireLetter
|| t == RequireHLetter
|| t == RequireNumeric
|| t == AcceptNone
|| t == AcceptQLetter
{
previdx = saveidx;
take_cat = false;
take_curr = false;
}
}
self.catb = if take_curr {
None
} else {
idx = previdx;
if take_cat {
Some(cat)
} else {
None
}
};
let retstr = &self.string[idx..];
self.string = &self.string[..idx];
Some(retstr)
}
}
impl<'a> UWordBounds<'a> {
#[inline]
/// View the underlying data (the part yet to be iterated) as a slice of the original string.
///
/// ```rust
/// # use unicode_segmentation::UnicodeSegmentation;
/// let mut iter = "Hello world".split_word_bounds();
/// assert_eq!(iter.as_str(), "Hello world");
/// iter.next();
/// assert_eq!(iter.as_str(), " world");
/// iter.next();
/// assert_eq!(iter.as_str(), "world");
/// ```
pub fn as_str(&self) -> &'a str {
self.string
}
#[inline]
fn get_next_cat(&self, idx: usize) -> Option<WordCat> {
use crate::tables::word as wd;
let nidx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
if nidx < self.string.len() {
let nch = self.string[nidx..].chars().next().unwrap();
Some(wd::word_category(nch).2)
} else {
None
}
}
#[inline]
fn get_prev_cat(&self, idx: usize) -> Option<WordCat> {
use crate::tables::word as wd;
if idx > 0 {
let nch = self.string[..idx].chars().next_back().unwrap();
Some(wd::word_category(nch).2)
} else {
None
}
}
}
#[inline]
pub fn new_word_bounds<'b>(s: &'b str) -> UWordBounds<'b> {
UWordBounds {
string: s,
cat: None,
catb: None,
}
}
#[inline]
pub fn new_word_bound_indices<'b>(s: &'b str) -> UWordBoundIndices<'b> {
UWordBoundIndices {
start_offset: s.as_ptr() as usize,
iter: new_word_bounds(s),
}
}
#[inline]
fn has_alphanumeric(s: &&str) -> bool {
use crate::tables::util::is_alphanumeric;
s.chars().any(|c| is_alphanumeric(c))
}
#[inline]
pub fn new_unicode_words<'b>(s: &'b str) -> UnicodeWords<'b> {
use super::UnicodeSegmentation;
UnicodeWords {
inner: s.split_word_bounds().filter(has_alphanumeric),
}
}
#[inline]
pub fn new_unicode_word_indices<'b>(s: &'b str) -> UnicodeWordIndices<'b> {
use super::UnicodeSegmentation;
UnicodeWordIndices {
inner: s
.split_word_bound_indices()
.filter(|(_, c)| has_alphanumeric(c)),
}
}