зеркало из https://github.com/mozilla/gecko-dev.git
Bug 1847521 - Replace unicode-segmentation with ICU4X in WebDriver. r=webdriver-reviewers,supply-chain-reviewers,jgraham
Differential Revision: https://phabricator.services.mozilla.com/D198132
This commit is contained in:
Родитель
34342fa8e7
Коммит
33707d378d
|
@ -2054,6 +2054,7 @@ dependencies = [
|
|||
"chrono",
|
||||
"clap",
|
||||
"hyper",
|
||||
"icu_segmenter",
|
||||
"lazy_static",
|
||||
"log",
|
||||
"marionette",
|
||||
|
@ -2069,7 +2070,6 @@ dependencies = [
|
|||
"serde_yaml",
|
||||
"tempfile",
|
||||
"thiserror",
|
||||
"unicode-segmentation",
|
||||
"url",
|
||||
"uuid",
|
||||
"webdriver",
|
||||
|
@ -5810,12 +5810,6 @@ dependencies = [
|
|||
"tinyvec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "unicode-segmentation"
|
||||
version = "1.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0fdbf052a0783de01e944a6ce7a8cb939e295b1e7be835a1112c3b9a7f047a5a"
|
||||
|
||||
[[package]]
|
||||
name = "unicode-width"
|
||||
version = "0.1.10"
|
||||
|
@ -6229,6 +6223,7 @@ dependencies = [
|
|||
"bytes",
|
||||
"cookie",
|
||||
"http",
|
||||
"icu_segmenter",
|
||||
"log",
|
||||
"serde",
|
||||
"serde_derive",
|
||||
|
@ -6237,7 +6232,6 @@ dependencies = [
|
|||
"time 0.3.23",
|
||||
"tokio",
|
||||
"tokio-stream",
|
||||
"unicode-segmentation",
|
||||
"url",
|
||||
"warp",
|
||||
]
|
||||
|
|
|
@ -710,13 +710,6 @@ user-id = 3618
|
|||
user-login = "dtolnay"
|
||||
user-name = "David Tolnay"
|
||||
|
||||
[[publisher.unicode-segmentation]]
|
||||
version = "1.10.0"
|
||||
when = "2022-09-13"
|
||||
user-id = 1139
|
||||
user-login = "Manishearth"
|
||||
user-name = "Manish Goregaokar"
|
||||
|
||||
[[publisher.unicode-width]]
|
||||
version = "0.1.10"
|
||||
when = "2022-09-13"
|
||||
|
|
|
@ -26,6 +26,7 @@ base64 = "0.21"
|
|||
chrono = "0.4.6"
|
||||
clap = { version = "4", default-features = false, features = ["cargo", "std", "suggestions", "wrap_help", "string"] }
|
||||
hyper = "0.14"
|
||||
icu_segmenter = { version = "1.4", default-features = false, features = ["auto", "compiled_data"] }
|
||||
lazy_static = "1.0"
|
||||
log = { version = "0.4", features = ["std"] }
|
||||
marionette = { path = "./marionette", version="0.5.0" }
|
||||
|
@ -40,7 +41,6 @@ serde_json = "1.0"
|
|||
serde_yaml = "0.8"
|
||||
tempfile = "3"
|
||||
thiserror = "1"
|
||||
unicode-segmentation = "1.9"
|
||||
url = "2.4"
|
||||
uuid = { version = "1.0", features = ["v4"] }
|
||||
webdriver = { path = "../webdriver", version="0.50.0" }
|
||||
|
|
|
@ -31,12 +31,12 @@
|
|||
//! [`init`]: fn.init.html
|
||||
//! [`init_with_level`]: fn.init_with_level.html
|
||||
|
||||
use icu_segmenter::GraphemeClusterSegmenter;
|
||||
use std::fmt;
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
use std::str;
|
||||
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
|
||||
use unicode_segmentation::UnicodeSegmentation;
|
||||
|
||||
use mozprofile::preferences::Pref;
|
||||
|
||||
|
@ -246,7 +246,15 @@ fn truncate_message(args: &fmt::Arguments) -> Option<(String, String)> {
|
|||
}
|
||||
|
||||
let message = format!("{}", args);
|
||||
let chars = message.graphemes(true).collect::<Vec<&str>>();
|
||||
if message.is_empty() || message.len() < MAX_STRING_LENGTH {
|
||||
return None;
|
||||
}
|
||||
let chars = GraphemeClusterSegmenter::new()
|
||||
.segment_str(&message)
|
||||
.collect::<Vec<_>>()
|
||||
.windows(2)
|
||||
.map(|i| &message[i[0]..i[1]])
|
||||
.collect::<Vec<&str>>();
|
||||
|
||||
if chars.len() > MAX_STRING_LENGTH {
|
||||
let middle: usize = MAX_STRING_LENGTH / 2;
|
||||
|
|
|
@ -26,6 +26,7 @@ base64 = "0.21"
|
|||
bytes = "1.0"
|
||||
cookie = { version = "0.16", default-features = false }
|
||||
http = "0.2"
|
||||
icu_segmenter = { version = "1.4", default-features = false, features = ["auto", "compiled_data"] }
|
||||
log = "0.4"
|
||||
serde = "1.0"
|
||||
serde_json = "1.0"
|
||||
|
@ -33,7 +34,6 @@ serde_derive = "1.0"
|
|||
time = "0.3"
|
||||
tokio = { version = "1.0", features = ["rt", "net"], optional = true}
|
||||
tokio-stream = { version = "0.1", features = ["net"], optional = true}
|
||||
unicode-segmentation = "1.2"
|
||||
url = "2.4"
|
||||
thiserror = "1"
|
||||
warp = { version = "0.3", default-features = false, optional = true }
|
||||
|
|
|
@ -3,12 +3,12 @@
|
|||
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
||||
|
||||
use crate::common::{WebElement, ELEMENT_KEY};
|
||||
use icu_segmenter::GraphemeClusterSegmenter;
|
||||
use serde::de::{self, Deserialize, Deserializer};
|
||||
use serde::ser::{Serialize, Serializer};
|
||||
use serde_json::Value;
|
||||
use std::default::Default;
|
||||
use std::f64;
|
||||
use unicode_segmentation::UnicodeSegmentation;
|
||||
|
||||
#[derive(Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct ActionSequence {
|
||||
|
@ -91,7 +91,7 @@ where
|
|||
{
|
||||
String::deserialize(deserializer).map(|value| {
|
||||
// Only a single Unicode grapheme cluster is allowed
|
||||
if value.graphemes(true).count() != 1 {
|
||||
if GraphemeClusterSegmenter::new().segment_str(&value).count() != 2 {
|
||||
return Err(de::Error::custom(format!(
|
||||
"'{}' should only contain a single Unicode code point",
|
||||
value
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
|
||||
extern crate base64;
|
||||
extern crate cookie;
|
||||
extern crate icu_segmenter;
|
||||
#[macro_use]
|
||||
extern crate log;
|
||||
extern crate http;
|
||||
|
@ -16,7 +17,6 @@ extern crate serde_json;
|
|||
extern crate time;
|
||||
#[cfg(feature = "server")]
|
||||
extern crate tokio;
|
||||
extern crate unicode_segmentation;
|
||||
extern crate url;
|
||||
#[cfg(feature = "server")]
|
||||
extern crate warp;
|
||||
|
|
|
@ -1 +0,0 @@
|
|||
{"files":{"COPYRIGHT":"23860c2a7b5d96b21569afedf033469bab9fe14a1b24a35068b8641c578ce24d","Cargo.toml":"55e5a65c91693dd47a27409e54ad6d5ce805ce003b822e4a568bfd070725e956","LICENSE-APACHE":"a60eea817514531668d7e00765731449fe14d059d3249e0bc93b36de45f759f2","LICENSE-MIT":"7b63ecd5f1902af1b63729947373683c32745c16a10e8e6292e2e2dcd7e90ae0","README.md":"efe7aa058e004e12d683039dbc4440e2fec3088364201a620703acedbeef8cb2","benches/graphemes.rs":"88a9f672ea7a03cc15fae36ce544a6e7234e532359402483978858ccda47db3d","benches/unicode_words.rs":"95c3a178ebe07c8cb2c560546ee911bfc4f1e1db81a6cd2c1cef1c99ed2a421a","benches/word_bounds.rs":"66acf40c0a4b06cdb6dd97c1759aba8dea961bb30cd7f223de3ebff8198520b2","scripts/unicode.py":"d4ba970a0419f33d20f3deb888be12427bfbb40aa25a5719968600d45cf4dadb","scripts/unicode_gen_breaktests.py":"ee96982d8959bec75c2382233cfca7e239f12a89a1be5fbf942601a215bb9283","src/grapheme.rs":"b5a32bdbb529e9417e8ada8d92656339b6ffb4e9bed8e6d32a0409c13a03050b","src/lib.rs":"572789173717edd0fe037ae656530663406951636c548e6793711b7d5caad910","src/sentence.rs":"aac52f69207e0b68925ab0c6c18cc36ed3da8e918006d96d724f0f19d4d9d643","src/tables.rs":"ba9fa1774b6294ed14565ec6be0f2ec316759d54e3af7c002b6848973d7b1f3c","src/test.rs":"f039fa285d510244672a067bdbe98ce7ff940e4f2ff82926466e012ac48ad95a","src/testdata.rs":"533c02ecace1bec3d46b65d101c7619bc83a2fb2c187a2c960346533c09a0e3e","src/word.rs":"6eeea9351c12f0a4404606596a487e0e8aa948ba4b134c7cb827ee41557a39fe"},"package":"0fdbf052a0783de01e944a6ce7a8cb939e295b1e7be835a1112c3b9a7f047a5a"}
|
|
@ -1,7 +0,0 @@
|
|||
Licensed under the Apache License, Version 2.0
|
||||
<LICENSE-APACHE or
|
||||
http://www.apache.org/licenses/LICENSE-2.0> or the MIT
|
||||
license <LICENSE-MIT or http://opensource.org/licenses/MIT>,
|
||||
at your option. All files in the project carrying such
|
||||
notice may not be copied, modified, or distributed except
|
||||
according to those terms.
|
|
@ -1,63 +0,0 @@
|
|||
# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
|
||||
#
|
||||
# When uploading crates to the registry Cargo will automatically
|
||||
# "normalize" Cargo.toml files for maximal compatibility
|
||||
# with all versions of Cargo and also rewrite `path` dependencies
|
||||
# to registry (e.g., crates.io) dependencies.
|
||||
#
|
||||
# If you are reading this file be aware that the original Cargo.toml
|
||||
# will likely look very different (and much more reasonable).
|
||||
# See Cargo.toml.orig for the original contents.
|
||||
|
||||
[package]
|
||||
edition = "2018"
|
||||
name = "unicode-segmentation"
|
||||
version = "1.10.0"
|
||||
authors = [
|
||||
"kwantam <kwantam@gmail.com>",
|
||||
"Manish Goregaokar <manishsmail@gmail.com>",
|
||||
]
|
||||
exclude = [
|
||||
"target/*",
|
||||
"Cargo.lock",
|
||||
"scripts/tmp",
|
||||
"benches/texts/*",
|
||||
"*.txt",
|
||||
]
|
||||
description = """
|
||||
This crate provides Grapheme Cluster, Word and Sentence boundaries
|
||||
according to Unicode Standard Annex #29 rules.
|
||||
"""
|
||||
homepage = "https://github.com/unicode-rs/unicode-segmentation"
|
||||
documentation = "https://unicode-rs.github.io/unicode-segmentation"
|
||||
readme = "README.md"
|
||||
keywords = [
|
||||
"text",
|
||||
"unicode",
|
||||
"grapheme",
|
||||
"word",
|
||||
"boundary",
|
||||
]
|
||||
license = "MIT/Apache-2.0"
|
||||
repository = "https://github.com/unicode-rs/unicode-segmentation"
|
||||
|
||||
[[bench]]
|
||||
name = "graphemes"
|
||||
harness = false
|
||||
|
||||
[[bench]]
|
||||
name = "unicode_words"
|
||||
harness = false
|
||||
|
||||
[[bench]]
|
||||
name = "word_bounds"
|
||||
harness = false
|
||||
|
||||
[dev-dependencies.criterion]
|
||||
version = "0.3"
|
||||
|
||||
[dev-dependencies.quickcheck]
|
||||
version = "0.7"
|
||||
|
||||
[features]
|
||||
no_std = []
|
|
@ -1,201 +0,0 @@
|
|||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright [yyyy] [name of copyright owner]
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
|
@ -1,25 +0,0 @@
|
|||
Copyright (c) 2015 The Rust Project Developers
|
||||
|
||||
Permission is hereby granted, free of charge, to any
|
||||
person obtaining a copy of this software and associated
|
||||
documentation files (the "Software"), to deal in the
|
||||
Software without restriction, including without
|
||||
limitation the rights to use, copy, modify, merge,
|
||||
publish, distribute, sublicense, and/or sell copies of
|
||||
the Software, and to permit persons to whom the Software
|
||||
is furnished to do so, subject to the following
|
||||
conditions:
|
||||
|
||||
The above copyright notice and this permission notice
|
||||
shall be included in all copies or substantial portions
|
||||
of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
|
||||
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
|
||||
TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
|
||||
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
|
||||
SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
||||
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
|
||||
IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
DEALINGS IN THE SOFTWARE.
|
|
@ -1,99 +0,0 @@
|
|||
Iterators which split strings on Grapheme Cluster or Word boundaries, according
|
||||
to the [Unicode Standard Annex #29](http://www.unicode.org/reports/tr29/) rules.
|
||||
|
||||
[![Build Status](https://travis-ci.org/unicode-rs/unicode-segmentation.svg)](https://travis-ci.org/unicode-rs/unicode-segmentation)
|
||||
|
||||
[Documentation](https://unicode-rs.github.io/unicode-segmentation/unicode_segmentation/index.html)
|
||||
|
||||
```rust
|
||||
use unicode_segmentation::UnicodeSegmentation;
|
||||
|
||||
fn main() {
|
||||
let s = "a̐éö̲\r\n";
|
||||
let g = s.graphemes(true).collect::<Vec<&str>>();
|
||||
let b: &[_] = &["a̐", "é", "ö̲", "\r\n"];
|
||||
assert_eq!(g, b);
|
||||
|
||||
let s = "The quick (\"brown\") fox can't jump 32.3 feet, right?";
|
||||
let w = s.unicode_words().collect::<Vec<&str>>();
|
||||
let b: &[_] = &["The", "quick", "brown", "fox", "can't", "jump", "32.3", "feet", "right"];
|
||||
assert_eq!(w, b);
|
||||
|
||||
let s = "The quick (\"brown\") fox";
|
||||
let w = s.split_word_bounds().collect::<Vec<&str>>();
|
||||
let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", " ", "fox"];
|
||||
assert_eq!(w, b);
|
||||
}
|
||||
```
|
||||
|
||||
# no_std
|
||||
|
||||
unicode-segmentation does not depend on libstd, so it can be used in crates
|
||||
with the `#![no_std]` attribute.
|
||||
|
||||
# crates.io
|
||||
|
||||
You can use this package in your project by adding the following
|
||||
to your `Cargo.toml`:
|
||||
|
||||
```toml
|
||||
[dependencies]
|
||||
unicode-segmentation = "1.9.0"
|
||||
```
|
||||
|
||||
# Change Log
|
||||
|
||||
## 1.7.1
|
||||
|
||||
* Update docs on version number
|
||||
|
||||
## 1.7.0
|
||||
|
||||
* [#87](https://github.com/unicode-rs/unicode-segmentation/pull/87) Upgrade to Unicode 13
|
||||
* [#79](https://github.com/unicode-rs/unicode-segmentation/pull/79) Implement a special-case lookup for ascii grapheme categories
|
||||
* [#77](https://github.com/unicode-rs/unicode-segmentation/pull/77) Optimization for grapheme iteration
|
||||
|
||||
## 1.6.0
|
||||
|
||||
* [#72](https://github.com/unicode-rs/unicode-segmentation/pull/72) Upgrade to Unicode 12
|
||||
|
||||
## 1.5.0
|
||||
|
||||
* [#68](https://github.com/unicode-rs/unicode-segmentation/pull/68) Upgrade to Unicode 11
|
||||
|
||||
## 1.4.0
|
||||
|
||||
* [#56](https://github.com/unicode-rs/unicode-segmentation/pull/56) Upgrade to Unicode 10
|
||||
|
||||
## 1.3.0
|
||||
|
||||
* [#24](https://github.com/unicode-rs/unicode-segmentation/pull/24) Add support for sentence boundaries
|
||||
* [#44](https://github.com/unicode-rs/unicode-segmentation/pull/44) Treat `gc=No` as a subset of `gc=N`
|
||||
|
||||
## 1.2.1
|
||||
|
||||
* [#37](https://github.com/unicode-rs/unicode-segmentation/pull/37):
|
||||
Fix panic in `provide_context`.
|
||||
* [#40](https://github.com/unicode-rs/unicode-segmentation/pull/40):
|
||||
Fix crash in `prev_boundary`.
|
||||
|
||||
## 1.2.0
|
||||
|
||||
* New `GraphemeCursor` API allows random access and bidirectional iteration.
|
||||
* Fixed incorrect splitting of certain emoji modifier sequences.
|
||||
|
||||
## 1.1.0
|
||||
|
||||
* Add `as_str` methods to the iterator types.
|
||||
|
||||
## 1.0.3
|
||||
|
||||
* Code cleanup and additional tests.
|
||||
|
||||
## 1.0.1
|
||||
|
||||
* Fix a bug affecting some grapheme clusters containing Prepend characters.
|
||||
|
||||
## 1.0.0
|
||||
|
||||
* Upgrade to Unicode 9.0.0.
|
|
@ -1,63 +0,0 @@
|
|||
use criterion::{black_box, criterion_group, criterion_main, Criterion};
|
||||
use unicode_segmentation;
|
||||
|
||||
use std::fs;
|
||||
use unicode_segmentation::UnicodeSegmentation;
|
||||
|
||||
fn graphemes(c: &mut Criterion, lang: &str, path: &str) {
|
||||
let text = fs::read_to_string(path).unwrap();
|
||||
|
||||
c.bench_function(&format!("graphemes_{}", lang), |bench| {
|
||||
bench.iter(|| {
|
||||
for g in UnicodeSegmentation::graphemes(black_box(&*text), true) {
|
||||
black_box(g);
|
||||
}
|
||||
})
|
||||
});
|
||||
}
|
||||
|
||||
fn graphemes_arabic(c: &mut Criterion) {
|
||||
graphemes(c, "arabic", "benches/texts/arabic.txt");
|
||||
}
|
||||
|
||||
fn graphemes_english(c: &mut Criterion) {
|
||||
graphemes(c, "english", "benches/texts/english.txt");
|
||||
}
|
||||
|
||||
fn graphemes_hindi(c: &mut Criterion) {
|
||||
graphemes(c, "hindi", "benches/texts/hindi.txt");
|
||||
}
|
||||
|
||||
fn graphemes_japanese(c: &mut Criterion) {
|
||||
graphemes(c, "japanese", "benches/texts/japanese.txt");
|
||||
}
|
||||
|
||||
fn graphemes_korean(c: &mut Criterion) {
|
||||
graphemes(c, "korean", "benches/texts/korean.txt");
|
||||
}
|
||||
|
||||
fn graphemes_mandarin(c: &mut Criterion) {
|
||||
graphemes(c, "mandarin", "benches/texts/mandarin.txt");
|
||||
}
|
||||
|
||||
fn graphemes_russian(c: &mut Criterion) {
|
||||
graphemes(c, "russian", "benches/texts/russian.txt");
|
||||
}
|
||||
|
||||
fn graphemes_source_code(c: &mut Criterion) {
|
||||
graphemes(c, "source_code", "benches/texts/source_code.txt");
|
||||
}
|
||||
|
||||
criterion_group!(
|
||||
benches,
|
||||
graphemes_arabic,
|
||||
graphemes_english,
|
||||
graphemes_hindi,
|
||||
graphemes_japanese,
|
||||
graphemes_korean,
|
||||
graphemes_mandarin,
|
||||
graphemes_russian,
|
||||
graphemes_source_code,
|
||||
);
|
||||
|
||||
criterion_main!(benches);
|
|
@ -1,64 +0,0 @@
|
|||
#[macro_use]
|
||||
extern crate bencher;
|
||||
extern crate unicode_segmentation;
|
||||
|
||||
use bencher::Bencher;
|
||||
use std::fs;
|
||||
use unicode_segmentation::UnicodeSegmentation;
|
||||
|
||||
fn unicode_words(bench: &mut Bencher, path: &str) {
|
||||
let text = fs::read_to_string(path).unwrap();
|
||||
bench.iter(|| {
|
||||
for w in text.unicode_words() {
|
||||
bencher::black_box(w);
|
||||
}
|
||||
});
|
||||
|
||||
bench.bytes = text.len() as u64;
|
||||
}
|
||||
|
||||
fn unicode_words_arabic(bench: &mut Bencher) {
|
||||
unicode_words(bench, "benches/texts/arabic.txt");
|
||||
}
|
||||
|
||||
fn unicode_words_english(bench: &mut Bencher) {
|
||||
unicode_words(bench, "benches/texts/english.txt");
|
||||
}
|
||||
|
||||
fn unicode_words_hindi(bench: &mut Bencher) {
|
||||
unicode_words(bench, "benches/texts/hindi.txt");
|
||||
}
|
||||
|
||||
fn unicode_words_japanese(bench: &mut Bencher) {
|
||||
unicode_words(bench, "benches/texts/japanese.txt");
|
||||
}
|
||||
|
||||
fn unicode_words_korean(bench: &mut Bencher) {
|
||||
unicode_words(bench, "benches/texts/korean.txt");
|
||||
}
|
||||
|
||||
fn unicode_words_mandarin(bench: &mut Bencher) {
|
||||
unicode_words(bench, "benches/texts/mandarin.txt");
|
||||
}
|
||||
|
||||
fn unicode_words_russian(bench: &mut Bencher) {
|
||||
unicode_words(bench, "benches/texts/russian.txt");
|
||||
}
|
||||
|
||||
fn unicode_words_source_code(bench: &mut Bencher) {
|
||||
unicode_words(bench, "benches/texts/source_code.txt");
|
||||
}
|
||||
|
||||
benchmark_group!(
|
||||
benches,
|
||||
unicode_words_arabic,
|
||||
unicode_words_english,
|
||||
unicode_words_hindi,
|
||||
unicode_words_japanese,
|
||||
unicode_words_korean,
|
||||
unicode_words_mandarin,
|
||||
unicode_words_russian,
|
||||
unicode_words_source_code,
|
||||
);
|
||||
|
||||
benchmark_main!(benches);
|
|
@ -1,64 +0,0 @@
|
|||
#[macro_use]
|
||||
extern crate bencher;
|
||||
extern crate unicode_segmentation;
|
||||
|
||||
use bencher::Bencher;
|
||||
use std::fs;
|
||||
use unicode_segmentation::UnicodeSegmentation;
|
||||
|
||||
fn word_bounds(bench: &mut Bencher, path: &str) {
|
||||
let text = fs::read_to_string(path).unwrap();
|
||||
bench.iter(|| {
|
||||
for w in text.split_word_bounds() {
|
||||
bencher::black_box(w);
|
||||
}
|
||||
});
|
||||
|
||||
bench.bytes = text.len() as u64;
|
||||
}
|
||||
|
||||
fn word_bounds_arabic(bench: &mut Bencher) {
|
||||
word_bounds(bench, "benches/texts/arabic.txt");
|
||||
}
|
||||
|
||||
fn word_bounds_english(bench: &mut Bencher) {
|
||||
word_bounds(bench, "benches/texts/english.txt");
|
||||
}
|
||||
|
||||
fn word_bounds_hindi(bench: &mut Bencher) {
|
||||
word_bounds(bench, "benches/texts/hindi.txt");
|
||||
}
|
||||
|
||||
fn word_bounds_japanese(bench: &mut Bencher) {
|
||||
word_bounds(bench, "benches/texts/japanese.txt");
|
||||
}
|
||||
|
||||
fn word_bounds_korean(bench: &mut Bencher) {
|
||||
word_bounds(bench, "benches/texts/korean.txt");
|
||||
}
|
||||
|
||||
fn word_bounds_mandarin(bench: &mut Bencher) {
|
||||
word_bounds(bench, "benches/texts/mandarin.txt");
|
||||
}
|
||||
|
||||
fn word_bounds_russian(bench: &mut Bencher) {
|
||||
word_bounds(bench, "benches/texts/russian.txt");
|
||||
}
|
||||
|
||||
fn word_bounds_source_code(bench: &mut Bencher) {
|
||||
word_bounds(bench, "benches/texts/source_code.txt");
|
||||
}
|
||||
|
||||
benchmark_group!(
|
||||
benches,
|
||||
word_bounds_arabic,
|
||||
word_bounds_english,
|
||||
word_bounds_hindi,
|
||||
word_bounds_japanese,
|
||||
word_bounds_korean,
|
||||
word_bounds_mandarin,
|
||||
word_bounds_russian,
|
||||
word_bounds_source_code,
|
||||
);
|
||||
|
||||
benchmark_main!(benches);
|
|
@ -1,381 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
#
|
||||
# Copyright 2011-2015 The Rust Project Developers. See the COPYRIGHT
|
||||
# file at the top-level directory of this distribution and at
|
||||
# http://rust-lang.org/COPYRIGHT.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
|
||||
# option. This file may not be copied, modified, or distributed
|
||||
# except according to those terms.
|
||||
|
||||
# This script uses the following Unicode tables:
|
||||
# - DerivedCoreProperties.txt
|
||||
# - auxiliary/GraphemeBreakProperty.txt
|
||||
# - auxiliary/WordBreakProperty.txt
|
||||
# - ReadMe.txt
|
||||
# - UnicodeData.txt
|
||||
#
|
||||
# Since this should not require frequent updates, we just store this
|
||||
# out-of-line and check the unicode.rs file into git.
|
||||
|
||||
import fileinput, re, os, sys
|
||||
|
||||
preamble = '''// Copyright 2012-2018 The Rust Project Developers. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution and at
|
||||
// http://rust-lang.org/COPYRIGHT.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly
|
||||
|
||||
#![allow(missing_docs, non_upper_case_globals, non_snake_case)]
|
||||
'''
|
||||
|
||||
# Mapping taken from Table 12 from:
|
||||
# http://www.unicode.org/reports/tr44/#General_Category_Values
|
||||
expanded_categories = {
|
||||
'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'],
|
||||
'Lm': ['L'], 'Lo': ['L'],
|
||||
'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'],
|
||||
'Nd': ['N'], 'Nl': ['N'], 'No': ['N'],
|
||||
'Pc': ['P'], 'Pd': ['P'], 'Ps': ['P'], 'Pe': ['P'],
|
||||
'Pi': ['P'], 'Pf': ['P'], 'Po': ['P'],
|
||||
'Sm': ['S'], 'Sc': ['S'], 'Sk': ['S'], 'So': ['S'],
|
||||
'Zs': ['Z'], 'Zl': ['Z'], 'Zp': ['Z'],
|
||||
'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'],
|
||||
}
|
||||
|
||||
# these are the surrogate codepoints, which are not valid rust characters
|
||||
surrogate_codepoints = (0xd800, 0xdfff)
|
||||
|
||||
UNICODE_VERSION = (15, 0, 0)
|
||||
|
||||
UNICODE_VERSION_NUMBER = "%s.%s.%s" %UNICODE_VERSION
|
||||
|
||||
def is_surrogate(n):
|
||||
return surrogate_codepoints[0] <= n <= surrogate_codepoints[1]
|
||||
|
||||
def fetch(f):
|
||||
if not os.path.exists(os.path.basename(f)):
|
||||
if "emoji" in f:
|
||||
os.system("curl -O https://www.unicode.org/Public/%s/ucd/emoji/%s"
|
||||
% (UNICODE_VERSION_NUMBER, f))
|
||||
else:
|
||||
os.system("curl -O https://www.unicode.org/Public/%s/ucd/%s"
|
||||
% (UNICODE_VERSION_NUMBER, f))
|
||||
|
||||
if not os.path.exists(os.path.basename(f)):
|
||||
sys.stderr.write("cannot load %s" % f)
|
||||
exit(1)
|
||||
|
||||
def load_gencats(f):
|
||||
fetch(f)
|
||||
gencats = {}
|
||||
|
||||
udict = {};
|
||||
range_start = -1;
|
||||
for line in fileinput.input(f):
|
||||
data = line.split(';');
|
||||
if len(data) != 15:
|
||||
continue
|
||||
cp = int(data[0], 16);
|
||||
if is_surrogate(cp):
|
||||
continue
|
||||
if range_start >= 0:
|
||||
for i in range(range_start, cp):
|
||||
udict[i] = data;
|
||||
range_start = -1;
|
||||
if data[1].endswith(", First>"):
|
||||
range_start = cp;
|
||||
continue;
|
||||
udict[cp] = data;
|
||||
|
||||
for code in udict:
|
||||
[code_org, name, gencat, combine, bidi,
|
||||
decomp, deci, digit, num, mirror,
|
||||
old, iso, upcase, lowcase, titlecase ] = udict[code];
|
||||
|
||||
# place letter in categories as appropriate
|
||||
for cat in [gencat, "Assigned"] + expanded_categories.get(gencat, []):
|
||||
if cat not in gencats:
|
||||
gencats[cat] = []
|
||||
gencats[cat].append(code)
|
||||
|
||||
gencats = group_cats(gencats)
|
||||
return gencats
|
||||
|
||||
def group_cats(cats):
|
||||
cats_out = {}
|
||||
for cat in cats:
|
||||
cats_out[cat] = group_cat(cats[cat])
|
||||
return cats_out
|
||||
|
||||
def group_cat(cat):
|
||||
cat_out = []
|
||||
letters = sorted(set(cat))
|
||||
cur_start = letters.pop(0)
|
||||
cur_end = cur_start
|
||||
for letter in letters:
|
||||
assert letter > cur_end, \
|
||||
"cur_end: %s, letter: %s" % (hex(cur_end), hex(letter))
|
||||
if letter == cur_end + 1:
|
||||
cur_end = letter
|
||||
else:
|
||||
cat_out.append((cur_start, cur_end))
|
||||
cur_start = cur_end = letter
|
||||
cat_out.append((cur_start, cur_end))
|
||||
return cat_out
|
||||
|
||||
def ungroup_cat(cat):
|
||||
cat_out = []
|
||||
for (lo, hi) in cat:
|
||||
while lo <= hi:
|
||||
cat_out.append(lo)
|
||||
lo += 1
|
||||
return cat_out
|
||||
|
||||
def format_table_content(f, content, indent):
|
||||
line = " "*indent
|
||||
first = True
|
||||
for chunk in content.split(","):
|
||||
if len(line) + len(chunk) < 98:
|
||||
if first:
|
||||
line += chunk
|
||||
else:
|
||||
line += ", " + chunk
|
||||
first = False
|
||||
else:
|
||||
f.write(line + ",\n")
|
||||
line = " "*indent + chunk
|
||||
f.write(line)
|
||||
|
||||
def load_properties(f, interestingprops):
|
||||
fetch(f)
|
||||
props = {}
|
||||
re1 = re.compile(r"^ *([0-9A-F]+) *; *(\w+)")
|
||||
re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")
|
||||
|
||||
for line in fileinput.input(os.path.basename(f)):
|
||||
prop = None
|
||||
d_lo = 0
|
||||
d_hi = 0
|
||||
m = re1.match(line)
|
||||
if m:
|
||||
d_lo = m.group(1)
|
||||
d_hi = m.group(1)
|
||||
prop = m.group(2)
|
||||
else:
|
||||
m = re2.match(line)
|
||||
if m:
|
||||
d_lo = m.group(1)
|
||||
d_hi = m.group(2)
|
||||
prop = m.group(3)
|
||||
else:
|
||||
continue
|
||||
if interestingprops and prop not in interestingprops:
|
||||
continue
|
||||
d_lo = int(d_lo, 16)
|
||||
d_hi = int(d_hi, 16)
|
||||
if prop not in props:
|
||||
props[prop] = []
|
||||
props[prop].append((d_lo, d_hi))
|
||||
|
||||
# optimize if possible
|
||||
for prop in props:
|
||||
props[prop] = group_cat(ungroup_cat(props[prop]))
|
||||
|
||||
return props
|
||||
|
||||
def escape_char(c):
|
||||
return "'\\u{%x}'" % c
|
||||
|
||||
def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
|
||||
pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1])), is_const=True):
|
||||
pub_string = "const"
|
||||
if not is_const:
|
||||
pub_string = "let"
|
||||
if is_pub:
|
||||
pub_string = "pub " + pub_string
|
||||
f.write(" %s %s: %s = &[\n" % (pub_string, name, t_type))
|
||||
data = ""
|
||||
first = True
|
||||
for dat in t_data:
|
||||
if not first:
|
||||
data += ","
|
||||
first = False
|
||||
data += pfun(dat)
|
||||
format_table_content(f, data, 8)
|
||||
f.write("\n ];\n\n")
|
||||
|
||||
def emit_util_mod(f):
|
||||
f.write("""
|
||||
pub mod util {
|
||||
#[inline]
|
||||
pub fn bsearch_range_table(c: char, r: &'static [(char,char)]) -> bool {
|
||||
use core::cmp::Ordering::{Equal, Less, Greater};
|
||||
r.binary_search_by(|&(lo,hi)| {
|
||||
if lo <= c && c <= hi { Equal }
|
||||
else if hi < c { Less }
|
||||
else { Greater }
|
||||
}).is_ok()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn is_alphabetic(c: char) -> bool {
|
||||
match c {
|
||||
'a' ..= 'z' | 'A' ..= 'Z' => true,
|
||||
c if c > '\x7f' => super::derived_property::Alphabetic(c),
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn is_numeric(c: char) -> bool {
|
||||
match c {
|
||||
'0' ..= '9' => true,
|
||||
c if c > '\x7f' => super::general_category::N(c),
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn is_alphanumeric(c: char) -> bool {
|
||||
is_alphabetic(c) || is_numeric(c)
|
||||
}
|
||||
}
|
||||
|
||||
""")
|
||||
|
||||
def emit_property_module(f, mod, tbl, emit):
|
||||
f.write("mod %s {\n" % mod)
|
||||
for cat in sorted(emit):
|
||||
emit_table(f, "%s_table" % cat, tbl[cat], is_pub=False)
|
||||
f.write(" #[inline]\n")
|
||||
f.write(" pub fn %s(c: char) -> bool {\n" % cat)
|
||||
f.write(" super::util::bsearch_range_table(c, %s_table)\n" % cat)
|
||||
f.write(" }\n\n")
|
||||
f.write("}\n\n")
|
||||
|
||||
def emit_break_module(f, break_table, break_cats, name):
|
||||
Name = name.capitalize()
|
||||
f.write("""pub mod %s {
|
||||
use core::result::Result::{Ok, Err};
|
||||
|
||||
pub use self::%sCat::*;
|
||||
|
||||
#[allow(non_camel_case_types)]
|
||||
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
|
||||
pub enum %sCat {
|
||||
""" % (name, Name, Name))
|
||||
|
||||
break_cats.append("Any")
|
||||
break_cats.sort()
|
||||
for cat in break_cats:
|
||||
f.write((" %sC_" % Name[0]) + cat + ",\n")
|
||||
f.write(""" }
|
||||
|
||||
fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)]) -> (u32, u32, %sCat) {
|
||||
use core::cmp::Ordering::{Equal, Less, Greater};
|
||||
match r.binary_search_by(|&(lo, hi, _)| {
|
||||
if lo <= c && c <= hi { Equal }
|
||||
else if hi < c { Less }
|
||||
else { Greater }
|
||||
}) {
|
||||
Ok(idx) => {
|
||||
let (lower, upper, cat) = r[idx];
|
||||
(lower as u32, upper as u32, cat)
|
||||
}
|
||||
Err(idx) => {
|
||||
(
|
||||
if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 },
|
||||
r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX),
|
||||
%sC_Any,
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn %s_category(c: char) -> (u32, u32, %sCat) {
|
||||
bsearch_range_value_table(c, %s_cat_table)
|
||||
}
|
||||
|
||||
""" % (Name, Name, Name[0], name, Name, name))
|
||||
|
||||
emit_table(f, "%s_cat_table" % name, break_table, "&'static [(char, char, %sCat)]" % Name,
|
||||
pfun=lambda x: "(%s,%s,%sC_%s)" % (escape_char(x[0]), escape_char(x[1]), Name[0], x[2]),
|
||||
is_pub=False, is_const=True)
|
||||
f.write("}\n")
|
||||
|
||||
if __name__ == "__main__":
|
||||
r = "tables.rs"
|
||||
if os.path.exists(r):
|
||||
os.remove(r)
|
||||
with open(r, "w") as rf:
|
||||
# write the file's preamble
|
||||
rf.write(preamble)
|
||||
rf.write("""
|
||||
/// The version of [Unicode](http://www.unicode.org/)
|
||||
/// that this version of unicode-segmentation is based on.
|
||||
pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s);
|
||||
""" % UNICODE_VERSION)
|
||||
|
||||
# download and parse all the data
|
||||
gencats = load_gencats("UnicodeData.txt")
|
||||
derived = load_properties("DerivedCoreProperties.txt", ["Alphabetic"])
|
||||
|
||||
emit_util_mod(rf)
|
||||
for (name, cat, pfuns) in ("general_category", gencats, ["N"]), \
|
||||
("derived_property", derived, ["Alphabetic"]):
|
||||
emit_property_module(rf, name, cat, pfuns)
|
||||
|
||||
### grapheme cluster module
|
||||
# from http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Break_Property_Values
|
||||
grapheme_cats = load_properties("auxiliary/GraphemeBreakProperty.txt", [])
|
||||
|
||||
# Control
|
||||
# Note:
|
||||
# This category also includes Cs (surrogate codepoints), but Rust's `char`s are
|
||||
# Unicode Scalar Values only, and surrogates are thus invalid `char`s.
|
||||
# Thus, we have to remove Cs from the Control category
|
||||
grapheme_cats["Control"] = group_cat(list(
|
||||
set(ungroup_cat(grapheme_cats["Control"]))
|
||||
- set(ungroup_cat([surrogate_codepoints]))))
|
||||
|
||||
grapheme_table = []
|
||||
for cat in grapheme_cats:
|
||||
grapheme_table.extend([(x, y, cat) for (x, y) in grapheme_cats[cat]])
|
||||
emoji_props = load_properties("emoji-data.txt", ["Extended_Pictographic"])
|
||||
grapheme_table.extend([(x, y, "Extended_Pictographic") for (x, y) in emoji_props["Extended_Pictographic"]])
|
||||
grapheme_table.sort(key=lambda w: w[0])
|
||||
last = -1
|
||||
for chars in grapheme_table:
|
||||
if chars[0] <= last:
|
||||
raise "Grapheme tables and Extended_Pictographic values overlap; need to store these separately!"
|
||||
last = chars[1]
|
||||
emit_break_module(rf, grapheme_table, list(grapheme_cats.keys()) + ["Extended_Pictographic"], "grapheme")
|
||||
rf.write("\n")
|
||||
|
||||
word_cats = load_properties("auxiliary/WordBreakProperty.txt", [])
|
||||
word_table = []
|
||||
for cat in word_cats:
|
||||
word_table.extend([(x, y, cat) for (x, y) in word_cats[cat]])
|
||||
word_table.sort(key=lambda w: w[0])
|
||||
emit_break_module(rf, word_table, list(word_cats.keys()), "word")
|
||||
|
||||
# There are some emoji which are also ALetter, so this needs to be stored separately
|
||||
# For efficiency, we could still merge the two tables and produce an ALetterEP state
|
||||
emoji_table = [(x, y, "Extended_Pictographic") for (x, y) in emoji_props["Extended_Pictographic"]]
|
||||
emit_break_module(rf, emoji_table, ["Extended_Pictographic"], "emoji")
|
||||
|
||||
sentence_cats = load_properties("auxiliary/SentenceBreakProperty.txt", [])
|
||||
sentence_table = []
|
||||
for cat in sentence_cats:
|
||||
sentence_table.extend([(x, y, cat) for (x, y) in sentence_cats[cat]])
|
||||
sentence_table.sort(key=lambda w: w[0])
|
||||
emit_break_module(rf, sentence_table, list(sentence_cats.keys()), "sentence")
|
|
@ -1,212 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8
|
||||
#
|
||||
# Copyright 2015 The Rust Project Developers. See the COPYRIGHT
|
||||
# file at the top-level directory of this distribution and at
|
||||
# http://rust-lang.org/COPYRIGHT.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
|
||||
# option. This file may not be copied, modified, or distributed
|
||||
# except according to those terms.
|
||||
|
||||
# This script uses the following Unicode tables:
|
||||
# - auxiliary/GraphemeBreakTest.txt
|
||||
# - auxiliary/WordBreakTest.txt
|
||||
#
|
||||
# Since this should not require frequent updates, we just store this
|
||||
# out-of-line and check the unicode.rs file into git.
|
||||
from __future__ import print_function
|
||||
|
||||
import unicode, re, os, fileinput
|
||||
|
||||
def load_test_data(f, optsplit=[]):
|
||||
testRe1 = re.compile(r"^÷\s+([^\s].*[^\s])\s+÷\s+#\s+÷\s+\[0.2\].*?([÷×].*)\s+÷\s+\[0.3\]\s*$")
|
||||
|
||||
unicode.fetch(f)
|
||||
data = []
|
||||
for line in fileinput.input(os.path.basename(f)):
|
||||
# lines that include a test start with the ÷ character
|
||||
if len(line) < 2 or not line.startswith('÷'):
|
||||
continue
|
||||
|
||||
m = testRe1.match(line)
|
||||
if not m:
|
||||
print("error: no match on line where test was expected: %s" % line)
|
||||
continue
|
||||
|
||||
# process the characters in this test case
|
||||
chars = process_split_string(m.group(1))
|
||||
# skip test case if it contains invalid characters (viz., surrogates)
|
||||
if not chars:
|
||||
continue
|
||||
|
||||
# now process test cases
|
||||
(chars, info) = process_split_info(m.group(2), chars, optsplit)
|
||||
|
||||
# make sure that we have break info for each break!
|
||||
assert len(chars) - 1 == len(info)
|
||||
|
||||
data.append((chars, info))
|
||||
|
||||
return data
|
||||
|
||||
def process_split_info(s, c, o):
|
||||
outcs = []
|
||||
outis = []
|
||||
workcs = c.pop(0)
|
||||
|
||||
# are we on a × or a ÷?
|
||||
isX = False
|
||||
if s.startswith('×'):
|
||||
isX = True
|
||||
|
||||
# find each instance of '(÷|×) [x.y] '
|
||||
while s:
|
||||
# find the currently considered rule number
|
||||
sInd = s.index('[') + 1
|
||||
eInd = s.index(']')
|
||||
|
||||
# if it's '× [a.b]' where 'a.b' is in o, then
|
||||
# we consider it a split even though it's not
|
||||
# marked as one
|
||||
# if it's ÷ then it's always a split
|
||||
if not isX or s[sInd:eInd] in o:
|
||||
outis.append(s[sInd:eInd])
|
||||
outcs.append(workcs)
|
||||
workcs = c.pop(0)
|
||||
else:
|
||||
workcs.extend(c.pop(0))
|
||||
|
||||
idx = 1
|
||||
while idx < len(s):
|
||||
if s[idx:].startswith('×'):
|
||||
isX = True
|
||||
break
|
||||
if s[idx:].startswith('÷'):
|
||||
isX = False
|
||||
break
|
||||
idx += 1
|
||||
s = s[idx:]
|
||||
|
||||
outcs.append(workcs)
|
||||
return (outcs, outis)
|
||||
|
||||
def process_split_string(s):
|
||||
outls = []
|
||||
workls = []
|
||||
|
||||
inls = s.split()
|
||||
|
||||
for i in inls:
|
||||
if i == '÷' or i == '×':
|
||||
outls.append(workls)
|
||||
workls = []
|
||||
continue
|
||||
|
||||
ival = int(i,16)
|
||||
|
||||
if unicode.is_surrogate(ival):
|
||||
return []
|
||||
|
||||
workls.append(ival)
|
||||
|
||||
if workls:
|
||||
outls.append(workls)
|
||||
|
||||
return outls
|
||||
|
||||
def showfun(x):
|
||||
outstr = '("'
|
||||
for c in x[0]:
|
||||
outstr += "\\u{%x}" % c
|
||||
outstr += '",&['
|
||||
xfirst = True
|
||||
for xx in x[1:]:
|
||||
if not xfirst:
|
||||
outstr += '],&['
|
||||
xfirst = False
|
||||
sfirst = True
|
||||
for sp in xx:
|
||||
if not sfirst:
|
||||
outstr += ','
|
||||
sfirst = False
|
||||
outstr += '"'
|
||||
for c in sp:
|
||||
outstr += "\\u{%x}" % c
|
||||
outstr += '"'
|
||||
outstr += '])'
|
||||
return outstr
|
||||
|
||||
def create_grapheme_data(f):
|
||||
# rules 9.1 and 9.2 are for extended graphemes only
|
||||
optsplits = ['9.1','9.2']
|
||||
d = load_test_data("auxiliary/GraphemeBreakTest.txt", optsplits)
|
||||
|
||||
test_same = []
|
||||
test_diff = []
|
||||
|
||||
for (c, i) in d:
|
||||
allchars = [cn for s in c for cn in s]
|
||||
extgraphs = []
|
||||
extwork = []
|
||||
|
||||
extwork.extend(c[0])
|
||||
for n in range(0,len(i)):
|
||||
if i[n] in optsplits:
|
||||
extwork.extend(c[n+1])
|
||||
else:
|
||||
extgraphs.append(extwork)
|
||||
extwork = []
|
||||
extwork.extend(c[n+1])
|
||||
|
||||
# these are the extended grapheme clusters
|
||||
extgraphs.append(extwork)
|
||||
|
||||
if extgraphs == c:
|
||||
test_same.append((allchars, c))
|
||||
else:
|
||||
test_diff.append((allchars, extgraphs, c))
|
||||
|
||||
stype = "&'static [(&'static str, &'static [&'static str])]"
|
||||
dtype = "&'static [(&'static str, &'static [&'static str], &'static [&'static str])]"
|
||||
f.write(" // official Unicode test data\n")
|
||||
f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/GraphemeBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER)
|
||||
unicode.emit_table(f, "TEST_SAME", test_same, stype, True, showfun, True)
|
||||
unicode.emit_table(f, "TEST_DIFF", test_diff, dtype, True, showfun, True)
|
||||
|
||||
def create_words_data(f):
|
||||
d = load_test_data("auxiliary/WordBreakTest.txt")
|
||||
|
||||
test = []
|
||||
|
||||
for (c, i) in d:
|
||||
allchars = [cn for s in c for cn in s]
|
||||
test.append((allchars, c))
|
||||
|
||||
wtype = "&'static [(&'static str, &'static [&'static str])]"
|
||||
f.write(" // official Unicode test data\n")
|
||||
f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/WordBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER)
|
||||
unicode.emit_table(f, "TEST_WORD", test, wtype, True, showfun, True)
|
||||
|
||||
def create_sentence_data(f):
|
||||
d = load_test_data("auxiliary/SentenceBreakTest.txt")
|
||||
|
||||
test = []
|
||||
|
||||
for (c, i) in d:
|
||||
allchars = [cn for s in c for cn in s]
|
||||
test.append((allchars, c))
|
||||
|
||||
wtype = "&'static [(&'static str, &'static [&'static str])]"
|
||||
f.write(" // official Unicode test data\n")
|
||||
f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/SentenceBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER)
|
||||
unicode.emit_table(f, "TEST_SENTENCE", test, wtype, True, showfun, True)
|
||||
|
||||
if __name__ == "__main__":
|
||||
with open("testdata.rs", "w") as rf:
|
||||
rf.write(unicode.preamble)
|
||||
create_grapheme_data(rf)
|
||||
create_words_data(rf)
|
||||
create_sentence_data(rf)
|
|
@ -1,801 +0,0 @@
|
|||
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution and at
|
||||
// http://rust-lang.org/COPYRIGHT.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
use core::cmp;
|
||||
|
||||
use crate::tables::grapheme::GraphemeCat;
|
||||
|
||||
/// External iterator for grapheme clusters and byte offsets.
|
||||
///
|
||||
/// This struct is created by the [`grapheme_indices`] method on the [`UnicodeSegmentation`]
|
||||
/// trait. See its documentation for more.
|
||||
///
|
||||
/// [`grapheme_indices`]: trait.UnicodeSegmentation.html#tymethod.grapheme_indices
|
||||
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
|
||||
#[derive(Clone)]
|
||||
pub struct GraphemeIndices<'a> {
|
||||
start_offset: usize,
|
||||
iter: Graphemes<'a>,
|
||||
}
|
||||
|
||||
impl<'a> GraphemeIndices<'a> {
|
||||
#[inline]
|
||||
/// View the underlying data (the part yet to be iterated) as a slice of the original string.
|
||||
///
|
||||
/// ```rust
|
||||
/// # use unicode_segmentation::UnicodeSegmentation;
|
||||
/// let mut iter = "abc".grapheme_indices(true);
|
||||
/// assert_eq!(iter.as_str(), "abc");
|
||||
/// iter.next();
|
||||
/// assert_eq!(iter.as_str(), "bc");
|
||||
/// iter.next();
|
||||
/// iter.next();
|
||||
/// assert_eq!(iter.as_str(), "");
|
||||
/// ```
|
||||
pub fn as_str(&self) -> &'a str {
|
||||
self.iter.as_str()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Iterator for GraphemeIndices<'a> {
|
||||
type Item = (usize, &'a str);
|
||||
|
||||
#[inline]
|
||||
fn next(&mut self) -> Option<(usize, &'a str)> {
|
||||
self.iter
|
||||
.next()
|
||||
.map(|s| (s.as_ptr() as usize - self.start_offset, s))
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn size_hint(&self) -> (usize, Option<usize>) {
|
||||
self.iter.size_hint()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> DoubleEndedIterator for GraphemeIndices<'a> {
|
||||
#[inline]
|
||||
fn next_back(&mut self) -> Option<(usize, &'a str)> {
|
||||
self.iter
|
||||
.next_back()
|
||||
.map(|s| (s.as_ptr() as usize - self.start_offset, s))
|
||||
}
|
||||
}
|
||||
|
||||
/// External iterator for a string's
|
||||
/// [grapheme clusters](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries).
|
||||
///
|
||||
/// This struct is created by the [`graphemes`] method on the [`UnicodeSegmentation`] trait. See its
|
||||
/// documentation for more.
|
||||
///
|
||||
/// [`graphemes`]: trait.UnicodeSegmentation.html#tymethod.graphemes
|
||||
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct Graphemes<'a> {
|
||||
string: &'a str,
|
||||
cursor: GraphemeCursor,
|
||||
cursor_back: GraphemeCursor,
|
||||
}
|
||||
|
||||
impl<'a> Graphemes<'a> {
|
||||
#[inline]
|
||||
/// View the underlying data (the part yet to be iterated) as a slice of the original string.
|
||||
///
|
||||
/// ```rust
|
||||
/// # use unicode_segmentation::UnicodeSegmentation;
|
||||
/// let mut iter = "abc".graphemes(true);
|
||||
/// assert_eq!(iter.as_str(), "abc");
|
||||
/// iter.next();
|
||||
/// assert_eq!(iter.as_str(), "bc");
|
||||
/// iter.next();
|
||||
/// iter.next();
|
||||
/// assert_eq!(iter.as_str(), "");
|
||||
/// ```
|
||||
pub fn as_str(&self) -> &'a str {
|
||||
&self.string[self.cursor.cur_cursor()..self.cursor_back.cur_cursor()]
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Iterator for Graphemes<'a> {
|
||||
type Item = &'a str;
|
||||
|
||||
#[inline]
|
||||
fn size_hint(&self) -> (usize, Option<usize>) {
|
||||
let slen = self.cursor_back.cur_cursor() - self.cursor.cur_cursor();
|
||||
(cmp::min(slen, 1), Some(slen))
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn next(&mut self) -> Option<&'a str> {
|
||||
let start = self.cursor.cur_cursor();
|
||||
if start == self.cursor_back.cur_cursor() {
|
||||
return None;
|
||||
}
|
||||
let next = self.cursor.next_boundary(self.string, 0).unwrap().unwrap();
|
||||
Some(&self.string[start..next])
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> DoubleEndedIterator for Graphemes<'a> {
|
||||
#[inline]
|
||||
fn next_back(&mut self) -> Option<&'a str> {
|
||||
let end = self.cursor_back.cur_cursor();
|
||||
if end == self.cursor.cur_cursor() {
|
||||
return None;
|
||||
}
|
||||
let prev = self
|
||||
.cursor_back
|
||||
.prev_boundary(self.string, 0)
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
Some(&self.string[prev..end])
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn new_graphemes<'b>(s: &'b str, is_extended: bool) -> Graphemes<'b> {
|
||||
let len = s.len();
|
||||
Graphemes {
|
||||
string: s,
|
||||
cursor: GraphemeCursor::new(0, len, is_extended),
|
||||
cursor_back: GraphemeCursor::new(len, len, is_extended),
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn new_grapheme_indices<'b>(s: &'b str, is_extended: bool) -> GraphemeIndices<'b> {
|
||||
GraphemeIndices {
|
||||
start_offset: s.as_ptr() as usize,
|
||||
iter: new_graphemes(s, is_extended),
|
||||
}
|
||||
}
|
||||
|
||||
// maybe unify with PairResult?
|
||||
// An enum describing information about a potential boundary.
|
||||
#[derive(PartialEq, Eq, Clone, Debug)]
|
||||
enum GraphemeState {
|
||||
// No information is known.
|
||||
Unknown,
|
||||
// It is known to not be a boundary.
|
||||
NotBreak,
|
||||
// It is known to be a boundary.
|
||||
Break,
|
||||
// The codepoint after is a Regional Indicator Symbol, so a boundary iff
|
||||
// it is preceded by an even number of RIS codepoints. (GB12, GB13)
|
||||
Regional,
|
||||
// The codepoint after is Extended_Pictographic,
|
||||
// so whether it's a boundary depends on pre-context according to GB11.
|
||||
Emoji,
|
||||
}
|
||||
|
||||
/// Cursor-based segmenter for grapheme clusters.
|
||||
///
|
||||
/// This allows working with ropes and other datastructures where the string is not contiguous or
|
||||
/// fully known at initialization time.
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct GraphemeCursor {
|
||||
// Current cursor position.
|
||||
offset: usize,
|
||||
// Total length of the string.
|
||||
len: usize,
|
||||
// A config flag indicating whether this cursor computes legacy or extended
|
||||
// grapheme cluster boundaries (enables GB9a and GB9b if set).
|
||||
is_extended: bool,
|
||||
// Information about the potential boundary at `offset`
|
||||
state: GraphemeState,
|
||||
// Category of codepoint immediately preceding cursor, if known.
|
||||
cat_before: Option<GraphemeCat>,
|
||||
// Category of codepoint immediately after cursor, if known.
|
||||
cat_after: Option<GraphemeCat>,
|
||||
// If set, at least one more codepoint immediately preceding this offset
|
||||
// is needed to resolve whether there's a boundary at `offset`.
|
||||
pre_context_offset: Option<usize>,
|
||||
// The number of RIS codepoints preceding `offset`. If `pre_context_offset`
|
||||
// is set, then counts the number of RIS between that and `offset`, otherwise
|
||||
// is an accurate count relative to the string.
|
||||
ris_count: Option<usize>,
|
||||
// Set if a call to `prev_boundary` or `next_boundary` was suspended due
|
||||
// to needing more input.
|
||||
resuming: bool,
|
||||
// Cached grapheme category and associated scalar value range.
|
||||
grapheme_cat_cache: (u32, u32, GraphemeCat),
|
||||
}
|
||||
|
||||
/// An error return indicating that not enough content was available in the
|
||||
/// provided chunk to satisfy the query, and that more content must be provided.
|
||||
#[derive(PartialEq, Eq, Debug)]
|
||||
pub enum GraphemeIncomplete {
|
||||
/// More pre-context is needed. The caller should call `provide_context`
|
||||
/// with a chunk ending at the offset given, then retry the query. This
|
||||
/// will only be returned if the `chunk_start` parameter is nonzero.
|
||||
PreContext(usize),
|
||||
|
||||
/// When requesting `prev_boundary`, the cursor is moving past the beginning
|
||||
/// of the current chunk, so the chunk before that is requested. This will
|
||||
/// only be returned if the `chunk_start` parameter is nonzero.
|
||||
PrevChunk,
|
||||
|
||||
/// When requesting `next_boundary`, the cursor is moving past the end of the
|
||||
/// current chunk, so the chunk after that is requested. This will only be
|
||||
/// returned if the chunk ends before the `len` parameter provided on
|
||||
/// creation of the cursor.
|
||||
NextChunk, // requesting chunk following the one given
|
||||
|
||||
/// An error returned when the chunk given does not contain the cursor position.
|
||||
InvalidOffset,
|
||||
}
|
||||
|
||||
// An enum describing the result from lookup of a pair of categories.
|
||||
#[derive(PartialEq, Eq)]
|
||||
enum PairResult {
|
||||
NotBreak, // definitely not a break
|
||||
Break, // definitely a break
|
||||
Extended, // a break iff not in extended mode
|
||||
Regional, // a break if preceded by an even number of RIS
|
||||
Emoji, // a break if preceded by emoji base and (Extend)*
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn check_pair(before: GraphemeCat, after: GraphemeCat) -> PairResult {
|
||||
use self::PairResult::*;
|
||||
use crate::tables::grapheme::GraphemeCat::*;
|
||||
match (before, after) {
|
||||
(GC_CR, GC_LF) => NotBreak, // GB3
|
||||
(GC_Control, _) => Break, // GB4
|
||||
(GC_CR, _) => Break, // GB4
|
||||
(GC_LF, _) => Break, // GB4
|
||||
(_, GC_Control) => Break, // GB5
|
||||
(_, GC_CR) => Break, // GB5
|
||||
(_, GC_LF) => Break, // GB5
|
||||
(GC_L, GC_L) => NotBreak, // GB6
|
||||
(GC_L, GC_V) => NotBreak, // GB6
|
||||
(GC_L, GC_LV) => NotBreak, // GB6
|
||||
(GC_L, GC_LVT) => NotBreak, // GB6
|
||||
(GC_LV, GC_V) => NotBreak, // GB7
|
||||
(GC_LV, GC_T) => NotBreak, // GB7
|
||||
(GC_V, GC_V) => NotBreak, // GB7
|
||||
(GC_V, GC_T) => NotBreak, // GB7
|
||||
(GC_LVT, GC_T) => NotBreak, // GB8
|
||||
(GC_T, GC_T) => NotBreak, // GB8
|
||||
(_, GC_Extend) => NotBreak, // GB9
|
||||
(_, GC_ZWJ) => NotBreak, // GB9
|
||||
(_, GC_SpacingMark) => Extended, // GB9a
|
||||
(GC_Prepend, _) => Extended, // GB9b
|
||||
(GC_ZWJ, GC_Extended_Pictographic) => Emoji, // GB11
|
||||
(GC_Regional_Indicator, GC_Regional_Indicator) => Regional, // GB12, GB13
|
||||
(_, _) => Break, // GB999
|
||||
}
|
||||
}
|
||||
|
||||
impl GraphemeCursor {
|
||||
/// Create a new cursor. The string and initial offset are given at creation
|
||||
/// time, but the contents of the string are not. The `is_extended` parameter
|
||||
/// controls whether extended grapheme clusters are selected.
|
||||
///
|
||||
/// The `offset` parameter must be on a codepoint boundary.
|
||||
///
|
||||
/// ```rust
|
||||
/// # use unicode_segmentation::GraphemeCursor;
|
||||
/// let s = "हिन्दी";
|
||||
/// let mut legacy = GraphemeCursor::new(0, s.len(), false);
|
||||
/// assert_eq!(legacy.next_boundary(s, 0), Ok(Some("ह".len())));
|
||||
/// let mut extended = GraphemeCursor::new(0, s.len(), true);
|
||||
/// assert_eq!(extended.next_boundary(s, 0), Ok(Some("हि".len())));
|
||||
/// ```
|
||||
pub fn new(offset: usize, len: usize, is_extended: bool) -> GraphemeCursor {
|
||||
let state = if offset == 0 || offset == len {
|
||||
GraphemeState::Break
|
||||
} else {
|
||||
GraphemeState::Unknown
|
||||
};
|
||||
GraphemeCursor {
|
||||
offset: offset,
|
||||
len: len,
|
||||
state: state,
|
||||
is_extended: is_extended,
|
||||
cat_before: None,
|
||||
cat_after: None,
|
||||
pre_context_offset: None,
|
||||
ris_count: None,
|
||||
resuming: false,
|
||||
grapheme_cat_cache: (0, 0, GraphemeCat::GC_Control),
|
||||
}
|
||||
}
|
||||
|
||||
fn grapheme_category(&mut self, ch: char) -> GraphemeCat {
|
||||
use crate::tables::grapheme as gr;
|
||||
use crate::tables::grapheme::GraphemeCat::*;
|
||||
|
||||
if ch <= '\u{7e}' {
|
||||
// Special-case optimization for ascii, except U+007F. This
|
||||
// improves performance even for many primarily non-ascii texts,
|
||||
// due to use of punctuation and white space characters from the
|
||||
// ascii range.
|
||||
if ch >= '\u{20}' {
|
||||
GC_Any
|
||||
} else if ch == '\n' {
|
||||
GC_LF
|
||||
} else if ch == '\r' {
|
||||
GC_CR
|
||||
} else {
|
||||
GC_Control
|
||||
}
|
||||
} else {
|
||||
// If this char isn't within the cached range, update the cache to the
|
||||
// range that includes it.
|
||||
if (ch as u32) < self.grapheme_cat_cache.0 || (ch as u32) > self.grapheme_cat_cache.1 {
|
||||
self.grapheme_cat_cache = gr::grapheme_category(ch);
|
||||
}
|
||||
self.grapheme_cat_cache.2
|
||||
}
|
||||
}
|
||||
|
||||
// Not sure I'm gonna keep this, the advantage over new() seems thin.
|
||||
|
||||
/// Set the cursor to a new location in the same string.
|
||||
///
|
||||
/// ```rust
|
||||
/// # use unicode_segmentation::GraphemeCursor;
|
||||
/// let s = "abcd";
|
||||
/// let mut cursor = GraphemeCursor::new(0, s.len(), false);
|
||||
/// assert_eq!(cursor.cur_cursor(), 0);
|
||||
/// cursor.set_cursor(2);
|
||||
/// assert_eq!(cursor.cur_cursor(), 2);
|
||||
/// ```
|
||||
pub fn set_cursor(&mut self, offset: usize) {
|
||||
if offset != self.offset {
|
||||
self.offset = offset;
|
||||
self.state = if offset == 0 || offset == self.len {
|
||||
GraphemeState::Break
|
||||
} else {
|
||||
GraphemeState::Unknown
|
||||
};
|
||||
// reset state derived from text around cursor
|
||||
self.cat_before = None;
|
||||
self.cat_after = None;
|
||||
self.ris_count = None;
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
/// The current offset of the cursor. Equal to the last value provided to
|
||||
/// `new()` or `set_cursor()`, or returned from `next_boundary()` or
|
||||
/// `prev_boundary()`.
|
||||
///
|
||||
/// ```rust
|
||||
/// # use unicode_segmentation::GraphemeCursor;
|
||||
/// // Two flags (🇷🇸🇮🇴), each flag is two RIS codepoints, each RIS is 4 bytes.
|
||||
/// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
|
||||
/// let mut cursor = GraphemeCursor::new(4, flags.len(), false);
|
||||
/// assert_eq!(cursor.cur_cursor(), 4);
|
||||
/// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(8)));
|
||||
/// assert_eq!(cursor.cur_cursor(), 8);
|
||||
/// ```
|
||||
pub fn cur_cursor(&self) -> usize {
|
||||
self.offset
|
||||
}
|
||||
|
||||
/// Provide additional pre-context when it is needed to decide a boundary.
|
||||
/// The end of the chunk must coincide with the value given in the
|
||||
/// `GraphemeIncomplete::PreContext` request.
|
||||
///
|
||||
/// ```rust
|
||||
/// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
|
||||
/// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
|
||||
/// let mut cursor = GraphemeCursor::new(8, flags.len(), false);
|
||||
/// // Not enough pre-context to decide if there's a boundary between the two flags.
|
||||
/// assert_eq!(cursor.is_boundary(&flags[8..], 8), Err(GraphemeIncomplete::PreContext(8)));
|
||||
/// // Provide one more Regional Indicator Symbol of pre-context
|
||||
/// cursor.provide_context(&flags[4..8], 4);
|
||||
/// // Still not enough context to decide.
|
||||
/// assert_eq!(cursor.is_boundary(&flags[8..], 8), Err(GraphemeIncomplete::PreContext(4)));
|
||||
/// // Provide additional requested context.
|
||||
/// cursor.provide_context(&flags[0..4], 0);
|
||||
/// // That's enough to decide (it always is when context goes to the start of the string)
|
||||
/// assert_eq!(cursor.is_boundary(&flags[8..], 8), Ok(true));
|
||||
/// ```
|
||||
pub fn provide_context(&mut self, chunk: &str, chunk_start: usize) {
|
||||
use crate::tables::grapheme as gr;
|
||||
assert!(chunk_start + chunk.len() == self.pre_context_offset.unwrap());
|
||||
self.pre_context_offset = None;
|
||||
if self.is_extended && chunk_start + chunk.len() == self.offset {
|
||||
let ch = chunk.chars().rev().next().unwrap();
|
||||
if self.grapheme_category(ch) == gr::GC_Prepend {
|
||||
self.decide(false); // GB9b
|
||||
return;
|
||||
}
|
||||
}
|
||||
match self.state {
|
||||
GraphemeState::Regional => self.handle_regional(chunk, chunk_start),
|
||||
GraphemeState::Emoji => self.handle_emoji(chunk, chunk_start),
|
||||
_ => {
|
||||
if self.cat_before.is_none() && self.offset == chunk.len() + chunk_start {
|
||||
let ch = chunk.chars().rev().next().unwrap();
|
||||
self.cat_before = Some(self.grapheme_category(ch));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn decide(&mut self, is_break: bool) {
|
||||
self.state = if is_break {
|
||||
GraphemeState::Break
|
||||
} else {
|
||||
GraphemeState::NotBreak
|
||||
};
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn decision(&mut self, is_break: bool) -> Result<bool, GraphemeIncomplete> {
|
||||
self.decide(is_break);
|
||||
Ok(is_break)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn is_boundary_result(&self) -> Result<bool, GraphemeIncomplete> {
|
||||
if self.state == GraphemeState::Break {
|
||||
Ok(true)
|
||||
} else if self.state == GraphemeState::NotBreak {
|
||||
Ok(false)
|
||||
} else if let Some(pre_context_offset) = self.pre_context_offset {
|
||||
Err(GraphemeIncomplete::PreContext(pre_context_offset))
|
||||
} else {
|
||||
unreachable!("inconsistent state");
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn handle_regional(&mut self, chunk: &str, chunk_start: usize) {
|
||||
use crate::tables::grapheme as gr;
|
||||
let mut ris_count = self.ris_count.unwrap_or(0);
|
||||
for ch in chunk.chars().rev() {
|
||||
if self.grapheme_category(ch) != gr::GC_Regional_Indicator {
|
||||
self.ris_count = Some(ris_count);
|
||||
self.decide((ris_count % 2) == 0);
|
||||
return;
|
||||
}
|
||||
ris_count += 1;
|
||||
}
|
||||
self.ris_count = Some(ris_count);
|
||||
if chunk_start == 0 {
|
||||
self.decide((ris_count % 2) == 0);
|
||||
return;
|
||||
}
|
||||
self.pre_context_offset = Some(chunk_start);
|
||||
self.state = GraphemeState::Regional;
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn handle_emoji(&mut self, chunk: &str, chunk_start: usize) {
|
||||
use crate::tables::grapheme as gr;
|
||||
let mut iter = chunk.chars().rev();
|
||||
if let Some(ch) = iter.next() {
|
||||
if self.grapheme_category(ch) != gr::GC_ZWJ {
|
||||
self.decide(true);
|
||||
return;
|
||||
}
|
||||
}
|
||||
for ch in iter {
|
||||
match self.grapheme_category(ch) {
|
||||
gr::GC_Extend => (),
|
||||
gr::GC_Extended_Pictographic => {
|
||||
self.decide(false);
|
||||
return;
|
||||
}
|
||||
_ => {
|
||||
self.decide(true);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
if chunk_start == 0 {
|
||||
self.decide(true);
|
||||
return;
|
||||
}
|
||||
self.pre_context_offset = Some(chunk_start);
|
||||
self.state = GraphemeState::Emoji;
|
||||
}
|
||||
|
||||
#[inline]
|
||||
/// Determine whether the current cursor location is a grapheme cluster boundary.
|
||||
/// Only a part of the string need be supplied. If `chunk_start` is nonzero or
|
||||
/// the length of `chunk` is not equal to `len` on creation, then this method
|
||||
/// may return `GraphemeIncomplete::PreContext`. The caller should then
|
||||
/// call `provide_context` with the requested chunk, then retry calling this
|
||||
/// method.
|
||||
///
|
||||
/// For partial chunks, if the cursor is not at the beginning or end of the
|
||||
/// string, the chunk should contain at least the codepoint following the cursor.
|
||||
/// If the string is nonempty, the chunk must be nonempty.
|
||||
///
|
||||
/// All calls should have consistent chunk contents (ie, if a chunk provides
|
||||
/// content for a given slice, all further chunks covering that slice must have
|
||||
/// the same content for it).
|
||||
///
|
||||
/// ```rust
|
||||
/// # use unicode_segmentation::GraphemeCursor;
|
||||
/// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
|
||||
/// let mut cursor = GraphemeCursor::new(8, flags.len(), false);
|
||||
/// assert_eq!(cursor.is_boundary(flags, 0), Ok(true));
|
||||
/// cursor.set_cursor(12);
|
||||
/// assert_eq!(cursor.is_boundary(flags, 0), Ok(false));
|
||||
/// ```
|
||||
pub fn is_boundary(
|
||||
&mut self,
|
||||
chunk: &str,
|
||||
chunk_start: usize,
|
||||
) -> Result<bool, GraphemeIncomplete> {
|
||||
use crate::tables::grapheme as gr;
|
||||
if self.state == GraphemeState::Break {
|
||||
return Ok(true);
|
||||
}
|
||||
if self.state == GraphemeState::NotBreak {
|
||||
return Ok(false);
|
||||
}
|
||||
if self.offset < chunk_start || self.offset >= chunk_start + chunk.len() {
|
||||
if self.offset > chunk_start + chunk.len() || self.cat_after.is_none() {
|
||||
return Err(GraphemeIncomplete::InvalidOffset);
|
||||
}
|
||||
}
|
||||
if let Some(pre_context_offset) = self.pre_context_offset {
|
||||
return Err(GraphemeIncomplete::PreContext(pre_context_offset));
|
||||
}
|
||||
let offset_in_chunk = self.offset - chunk_start;
|
||||
if self.cat_after.is_none() {
|
||||
let ch = chunk[offset_in_chunk..].chars().next().unwrap();
|
||||
self.cat_after = Some(self.grapheme_category(ch));
|
||||
}
|
||||
if self.offset == chunk_start {
|
||||
let mut need_pre_context = true;
|
||||
match self.cat_after.unwrap() {
|
||||
gr::GC_Regional_Indicator => self.state = GraphemeState::Regional,
|
||||
gr::GC_Extended_Pictographic => self.state = GraphemeState::Emoji,
|
||||
_ => need_pre_context = self.cat_before.is_none(),
|
||||
}
|
||||
if need_pre_context {
|
||||
self.pre_context_offset = Some(chunk_start);
|
||||
return Err(GraphemeIncomplete::PreContext(chunk_start));
|
||||
}
|
||||
}
|
||||
if self.cat_before.is_none() {
|
||||
let ch = chunk[..offset_in_chunk].chars().rev().next().unwrap();
|
||||
self.cat_before = Some(self.grapheme_category(ch));
|
||||
}
|
||||
match check_pair(self.cat_before.unwrap(), self.cat_after.unwrap()) {
|
||||
PairResult::NotBreak => return self.decision(false),
|
||||
PairResult::Break => return self.decision(true),
|
||||
PairResult::Extended => {
|
||||
let is_extended = self.is_extended;
|
||||
return self.decision(!is_extended);
|
||||
}
|
||||
PairResult::Regional => {
|
||||
if let Some(ris_count) = self.ris_count {
|
||||
return self.decision((ris_count % 2) == 0);
|
||||
}
|
||||
self.handle_regional(&chunk[..offset_in_chunk], chunk_start);
|
||||
self.is_boundary_result()
|
||||
}
|
||||
PairResult::Emoji => {
|
||||
self.handle_emoji(&chunk[..offset_in_chunk], chunk_start);
|
||||
self.is_boundary_result()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
/// Find the next boundary after the current cursor position. Only a part of
|
||||
/// the string need be supplied. If the chunk is incomplete, then this
|
||||
/// method might return `GraphemeIncomplete::PreContext` or
|
||||
/// `GraphemeIncomplete::NextChunk`. In the former case, the caller should
|
||||
/// call `provide_context` with the requested chunk, then retry. In the
|
||||
/// latter case, the caller should provide the chunk following the one
|
||||
/// given, then retry.
|
||||
///
|
||||
/// See `is_boundary` for expectations on the provided chunk.
|
||||
///
|
||||
/// ```rust
|
||||
/// # use unicode_segmentation::GraphemeCursor;
|
||||
/// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
|
||||
/// let mut cursor = GraphemeCursor::new(4, flags.len(), false);
|
||||
/// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(8)));
|
||||
/// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(16)));
|
||||
/// assert_eq!(cursor.next_boundary(flags, 0), Ok(None));
|
||||
/// ```
|
||||
///
|
||||
/// And an example that uses partial strings:
|
||||
///
|
||||
/// ```rust
|
||||
/// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
|
||||
/// let s = "abcd";
|
||||
/// let mut cursor = GraphemeCursor::new(0, s.len(), false);
|
||||
/// assert_eq!(cursor.next_boundary(&s[..2], 0), Ok(Some(1)));
|
||||
/// assert_eq!(cursor.next_boundary(&s[..2], 0), Err(GraphemeIncomplete::NextChunk));
|
||||
/// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(2)));
|
||||
/// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(3)));
|
||||
/// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(4)));
|
||||
/// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(None));
|
||||
/// ```
|
||||
pub fn next_boundary(
|
||||
&mut self,
|
||||
chunk: &str,
|
||||
chunk_start: usize,
|
||||
) -> Result<Option<usize>, GraphemeIncomplete> {
|
||||
if self.offset == self.len {
|
||||
return Ok(None);
|
||||
}
|
||||
let mut iter = chunk[self.offset - chunk_start..].chars();
|
||||
let mut ch = iter.next().unwrap();
|
||||
loop {
|
||||
if self.resuming {
|
||||
if self.cat_after.is_none() {
|
||||
self.cat_after = Some(self.grapheme_category(ch));
|
||||
}
|
||||
} else {
|
||||
self.offset += ch.len_utf8();
|
||||
self.state = GraphemeState::Unknown;
|
||||
self.cat_before = self.cat_after.take();
|
||||
if self.cat_before.is_none() {
|
||||
self.cat_before = Some(self.grapheme_category(ch));
|
||||
}
|
||||
if self.cat_before.unwrap() == GraphemeCat::GC_Regional_Indicator {
|
||||
self.ris_count = self.ris_count.map(|c| c + 1);
|
||||
} else {
|
||||
self.ris_count = Some(0);
|
||||
}
|
||||
if let Some(next_ch) = iter.next() {
|
||||
ch = next_ch;
|
||||
self.cat_after = Some(self.grapheme_category(ch));
|
||||
} else if self.offset == self.len {
|
||||
self.decide(true);
|
||||
} else {
|
||||
self.resuming = true;
|
||||
return Err(GraphemeIncomplete::NextChunk);
|
||||
}
|
||||
}
|
||||
self.resuming = true;
|
||||
if self.is_boundary(chunk, chunk_start)? {
|
||||
self.resuming = false;
|
||||
return Ok(Some(self.offset));
|
||||
}
|
||||
self.resuming = false;
|
||||
}
|
||||
}
|
||||
|
||||
/// Find the previous boundary after the current cursor position. Only a part
|
||||
/// of the string need be supplied. If the chunk is incomplete, then this
|
||||
/// method might return `GraphemeIncomplete::PreContext` or
|
||||
/// `GraphemeIncomplete::PrevChunk`. In the former case, the caller should
|
||||
/// call `provide_context` with the requested chunk, then retry. In the
|
||||
/// latter case, the caller should provide the chunk preceding the one
|
||||
/// given, then retry.
|
||||
///
|
||||
/// See `is_boundary` for expectations on the provided chunk.
|
||||
///
|
||||
/// ```rust
|
||||
/// # use unicode_segmentation::GraphemeCursor;
|
||||
/// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
|
||||
/// let mut cursor = GraphemeCursor::new(12, flags.len(), false);
|
||||
/// assert_eq!(cursor.prev_boundary(flags, 0), Ok(Some(8)));
|
||||
/// assert_eq!(cursor.prev_boundary(flags, 0), Ok(Some(0)));
|
||||
/// assert_eq!(cursor.prev_boundary(flags, 0), Ok(None));
|
||||
/// ```
|
||||
///
|
||||
/// And an example that uses partial strings (note the exact return is not
|
||||
/// guaranteed, and may be `PrevChunk` or `PreContext` arbitrarily):
|
||||
///
|
||||
/// ```rust
|
||||
/// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
|
||||
/// let s = "abcd";
|
||||
/// let mut cursor = GraphemeCursor::new(4, s.len(), false);
|
||||
/// assert_eq!(cursor.prev_boundary(&s[2..4], 2), Ok(Some(3)));
|
||||
/// assert_eq!(cursor.prev_boundary(&s[2..4], 2), Err(GraphemeIncomplete::PrevChunk));
|
||||
/// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(2)));
|
||||
/// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(1)));
|
||||
/// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(0)));
|
||||
/// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(None));
|
||||
/// ```
|
||||
pub fn prev_boundary(
|
||||
&mut self,
|
||||
chunk: &str,
|
||||
chunk_start: usize,
|
||||
) -> Result<Option<usize>, GraphemeIncomplete> {
|
||||
if self.offset == 0 {
|
||||
return Ok(None);
|
||||
}
|
||||
if self.offset == chunk_start {
|
||||
return Err(GraphemeIncomplete::PrevChunk);
|
||||
}
|
||||
let mut iter = chunk[..self.offset - chunk_start].chars().rev();
|
||||
let mut ch = iter.next().unwrap();
|
||||
loop {
|
||||
if self.offset == chunk_start {
|
||||
self.resuming = true;
|
||||
return Err(GraphemeIncomplete::PrevChunk);
|
||||
}
|
||||
if self.resuming {
|
||||
self.cat_before = Some(self.grapheme_category(ch));
|
||||
} else {
|
||||
self.offset -= ch.len_utf8();
|
||||
self.cat_after = self.cat_before.take();
|
||||
self.state = GraphemeState::Unknown;
|
||||
if let Some(ris_count) = self.ris_count {
|
||||
self.ris_count = if ris_count > 0 {
|
||||
Some(ris_count - 1)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
}
|
||||
if let Some(prev_ch) = iter.next() {
|
||||
ch = prev_ch;
|
||||
self.cat_before = Some(self.grapheme_category(ch));
|
||||
} else if self.offset == 0 {
|
||||
self.decide(true);
|
||||
} else {
|
||||
self.resuming = true;
|
||||
self.cat_after = Some(self.grapheme_category(ch));
|
||||
return Err(GraphemeIncomplete::PrevChunk);
|
||||
}
|
||||
}
|
||||
self.resuming = true;
|
||||
if self.is_boundary(chunk, chunk_start)? {
|
||||
self.resuming = false;
|
||||
return Ok(Some(self.offset));
|
||||
}
|
||||
self.resuming = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_grapheme_cursor_ris_precontext() {
|
||||
let s = "\u{1f1fa}\u{1f1f8}\u{1f1fa}\u{1f1f8}\u{1f1fa}\u{1f1f8}";
|
||||
let mut c = GraphemeCursor::new(8, s.len(), true);
|
||||
assert_eq!(
|
||||
c.is_boundary(&s[4..], 4),
|
||||
Err(GraphemeIncomplete::PreContext(4))
|
||||
);
|
||||
c.provide_context(&s[..4], 0);
|
||||
assert_eq!(c.is_boundary(&s[4..], 4), Ok(true));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_grapheme_cursor_chunk_start_require_precontext() {
|
||||
let s = "\r\n";
|
||||
let mut c = GraphemeCursor::new(1, s.len(), true);
|
||||
assert_eq!(
|
||||
c.is_boundary(&s[1..], 1),
|
||||
Err(GraphemeIncomplete::PreContext(1))
|
||||
);
|
||||
c.provide_context(&s[..1], 0);
|
||||
assert_eq!(c.is_boundary(&s[1..], 1), Ok(false));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_grapheme_cursor_prev_boundary() {
|
||||
let s = "abcd";
|
||||
let mut c = GraphemeCursor::new(3, s.len(), true);
|
||||
assert_eq!(
|
||||
c.prev_boundary(&s[2..], 2),
|
||||
Err(GraphemeIncomplete::PrevChunk)
|
||||
);
|
||||
assert_eq!(c.prev_boundary(&s[..2], 0), Ok(Some(2)));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_grapheme_cursor_prev_boundary_chunk_start() {
|
||||
let s = "abcd";
|
||||
let mut c = GraphemeCursor::new(2, s.len(), true);
|
||||
assert_eq!(
|
||||
c.prev_boundary(&s[2..], 2),
|
||||
Err(GraphemeIncomplete::PrevChunk)
|
||||
);
|
||||
assert_eq!(c.prev_boundary(&s[..2], 0), Ok(Some(1)));
|
||||
}
|
|
@ -1,307 +0,0 @@
|
|||
// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution and at
|
||||
// http://rust-lang.org/COPYRIGHT.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
//! Iterators which split strings on Grapheme Cluster, Word or Sentence boundaries, according
|
||||
//! to the [Unicode Standard Annex #29](http://www.unicode.org/reports/tr29/) rules.
|
||||
//!
|
||||
//! ```rust
|
||||
//! extern crate unicode_segmentation;
|
||||
//!
|
||||
//! use unicode_segmentation::UnicodeSegmentation;
|
||||
//!
|
||||
//! fn main() {
|
||||
//! let s = "a̐éö̲\r\n";
|
||||
//! let g = UnicodeSegmentation::graphemes(s, true).collect::<Vec<&str>>();
|
||||
//! let b: &[_] = &["a̐", "é", "ö̲", "\r\n"];
|
||||
//! assert_eq!(g, b);
|
||||
//!
|
||||
//! let s = "The quick (\"brown\") fox can't jump 32.3 feet, right?";
|
||||
//! let w = s.unicode_words().collect::<Vec<&str>>();
|
||||
//! let b: &[_] = &["The", "quick", "brown", "fox", "can't", "jump", "32.3", "feet", "right"];
|
||||
//! assert_eq!(w, b);
|
||||
//!
|
||||
//! let s = "The quick (\"brown\") fox";
|
||||
//! let w = s.split_word_bounds().collect::<Vec<&str>>();
|
||||
//! let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox"];
|
||||
//! assert_eq!(w, b);
|
||||
//! }
|
||||
//! ```
|
||||
//!
|
||||
//! # no_std
|
||||
//!
|
||||
//! unicode-segmentation does not depend on libstd, so it can be used in crates
|
||||
//! with the `#![no_std]` attribute.
|
||||
//!
|
||||
//! # crates.io
|
||||
//!
|
||||
//! You can use this package in your project by adding the following
|
||||
//! to your `Cargo.toml`:
|
||||
//!
|
||||
//! ```toml
|
||||
//! [dependencies]
|
||||
//! unicode-segmentation = "1.9.0"
|
||||
//! ```
|
||||
|
||||
#![deny(missing_docs, unsafe_code)]
|
||||
#![doc(
|
||||
html_logo_url = "https://unicode-rs.github.io/unicode-rs_sm.png",
|
||||
html_favicon_url = "https://unicode-rs.github.io/unicode-rs_sm.png"
|
||||
)]
|
||||
#![no_std]
|
||||
|
||||
#[cfg(test)]
|
||||
#[macro_use]
|
||||
extern crate std;
|
||||
|
||||
#[cfg(test)]
|
||||
#[macro_use]
|
||||
extern crate quickcheck;
|
||||
|
||||
pub use grapheme::{GraphemeCursor, GraphemeIncomplete};
|
||||
pub use grapheme::{GraphemeIndices, Graphemes};
|
||||
pub use sentence::{USentenceBoundIndices, USentenceBounds, UnicodeSentences};
|
||||
pub use tables::UNICODE_VERSION;
|
||||
pub use word::{UWordBoundIndices, UWordBounds, UnicodeWordIndices, UnicodeWords};
|
||||
|
||||
mod grapheme;
|
||||
#[rustfmt::skip]
|
||||
mod tables;
|
||||
mod sentence;
|
||||
mod word;
|
||||
|
||||
#[cfg(test)]
|
||||
mod test;
|
||||
#[cfg(test)]
|
||||
mod testdata;
|
||||
|
||||
/// Methods for segmenting strings according to
|
||||
/// [Unicode Standard Annex #29](http://www.unicode.org/reports/tr29/).
|
||||
pub trait UnicodeSegmentation {
|
||||
/// Returns an iterator over the [grapheme clusters][graphemes] of `self`.
|
||||
///
|
||||
/// [graphemes]: http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
|
||||
///
|
||||
/// If `is_extended` is true, the iterator is over the
|
||||
/// *extended grapheme clusters*;
|
||||
/// otherwise, the iterator is over the *legacy grapheme clusters*.
|
||||
/// [UAX#29](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries)
|
||||
/// recommends extended grapheme cluster boundaries for general processing.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// ```
|
||||
/// # use self::unicode_segmentation::UnicodeSegmentation;
|
||||
/// let gr1 = UnicodeSegmentation::graphemes("a\u{310}e\u{301}o\u{308}\u{332}", true)
|
||||
/// .collect::<Vec<&str>>();
|
||||
/// let b: &[_] = &["a\u{310}", "e\u{301}", "o\u{308}\u{332}"];
|
||||
///
|
||||
/// assert_eq!(&gr1[..], b);
|
||||
///
|
||||
/// let gr2 = UnicodeSegmentation::graphemes("a\r\nb🇷🇺🇸🇹", true).collect::<Vec<&str>>();
|
||||
/// let b: &[_] = &["a", "\r\n", "b", "🇷🇺", "🇸🇹"];
|
||||
///
|
||||
/// assert_eq!(&gr2[..], b);
|
||||
/// ```
|
||||
fn graphemes<'a>(&'a self, is_extended: bool) -> Graphemes<'a>;
|
||||
|
||||
/// Returns an iterator over the grapheme clusters of `self` and their
|
||||
/// byte offsets. See `graphemes()` for more information.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// ```
|
||||
/// # use self::unicode_segmentation::UnicodeSegmentation;
|
||||
/// let gr_inds = UnicodeSegmentation::grapheme_indices("a̐éö̲\r\n", true)
|
||||
/// .collect::<Vec<(usize, &str)>>();
|
||||
/// let b: &[_] = &[(0, "a̐"), (3, "é"), (6, "ö̲"), (11, "\r\n")];
|
||||
///
|
||||
/// assert_eq!(&gr_inds[..], b);
|
||||
/// ```
|
||||
fn grapheme_indices<'a>(&'a self, is_extended: bool) -> GraphemeIndices<'a>;
|
||||
|
||||
/// Returns an iterator over the words of `self`, separated on
|
||||
/// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
|
||||
///
|
||||
/// Here, "words" are just those substrings which, after splitting on
|
||||
/// UAX#29 word boundaries, contain any alphanumeric characters. That is, the
|
||||
/// substring must contain at least one character with the
|
||||
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
|
||||
/// property, or with
|
||||
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```
|
||||
/// # use self::unicode_segmentation::UnicodeSegmentation;
|
||||
/// let uws = "The quick (\"brown\") fox can't jump 32.3 feet, right?";
|
||||
/// let uw1 = uws.unicode_words().collect::<Vec<&str>>();
|
||||
/// let b: &[_] = &["The", "quick", "brown", "fox", "can't", "jump", "32.3", "feet", "right"];
|
||||
///
|
||||
/// assert_eq!(&uw1[..], b);
|
||||
/// ```
|
||||
fn unicode_words<'a>(&'a self) -> UnicodeWords<'a>;
|
||||
|
||||
/// Returns an iterator over the words of `self`, separated on
|
||||
/// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), and their
|
||||
/// offsets.
|
||||
///
|
||||
/// Here, "words" are just those substrings which, after splitting on
|
||||
/// UAX#29 word boundaries, contain any alphanumeric characters. That is, the
|
||||
/// substring must contain at least one character with the
|
||||
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
|
||||
/// property, or with
|
||||
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```
|
||||
/// # use self::unicode_segmentation::UnicodeSegmentation;
|
||||
/// let uwis = "The quick (\"brown\") fox can't jump 32.3 feet, right?";
|
||||
/// let uwi1 = uwis.unicode_word_indices().collect::<Vec<(usize, &str)>>();
|
||||
/// let b: &[_] = &[(0, "The"), (4, "quick"), (12, "brown"), (20, "fox"), (24, "can't"),
|
||||
/// (30, "jump"), (35, "32.3"), (40, "feet"), (46, "right")];
|
||||
///
|
||||
/// assert_eq!(&uwi1[..], b);
|
||||
/// ```
|
||||
fn unicode_word_indices<'a>(&'a self) -> UnicodeWordIndices<'a>;
|
||||
|
||||
/// Returns an iterator over substrings of `self` separated on
|
||||
/// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
|
||||
///
|
||||
/// The concatenation of the substrings returned by this function is just the original string.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```
|
||||
/// # use self::unicode_segmentation::UnicodeSegmentation;
|
||||
/// let swu1 = "The quick (\"brown\") fox".split_word_bounds().collect::<Vec<&str>>();
|
||||
/// let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox"];
|
||||
///
|
||||
/// assert_eq!(&swu1[..], b);
|
||||
/// ```
|
||||
fn split_word_bounds<'a>(&'a self) -> UWordBounds<'a>;
|
||||
|
||||
/// Returns an iterator over substrings of `self`, split on UAX#29 word boundaries,
|
||||
/// and their offsets. See `split_word_bounds()` for more information.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```
|
||||
/// # use self::unicode_segmentation::UnicodeSegmentation;
|
||||
/// let swi1 = "Brr, it's 29.3°F!".split_word_bound_indices().collect::<Vec<(usize, &str)>>();
|
||||
/// let b: &[_] = &[(0, "Brr"), (3, ","), (4, " "), (5, "it's"), (9, " "), (10, "29.3"),
|
||||
/// (14, "°"), (16, "F"), (17, "!")];
|
||||
///
|
||||
/// assert_eq!(&swi1[..], b);
|
||||
/// ```
|
||||
fn split_word_bound_indices<'a>(&'a self) -> UWordBoundIndices<'a>;
|
||||
|
||||
/// Returns an iterator over substrings of `self` separated on
|
||||
/// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
|
||||
///
|
||||
/// Here, "sentences" are just those substrings which, after splitting on
|
||||
/// UAX#29 sentence boundaries, contain any alphanumeric characters. That is, the
|
||||
/// substring must contain at least one character with the
|
||||
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
|
||||
/// property, or with
|
||||
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```
|
||||
/// # use self::unicode_segmentation::UnicodeSegmentation;
|
||||
/// let uss = "Mr. Fox jumped. [...] The dog was too lazy.";
|
||||
/// let us1 = uss.unicode_sentences().collect::<Vec<&str>>();
|
||||
/// let b: &[_] = &["Mr. ", "Fox jumped. ", "The dog was too lazy."];
|
||||
///
|
||||
/// assert_eq!(&us1[..], b);
|
||||
/// ```
|
||||
fn unicode_sentences<'a>(&'a self) -> UnicodeSentences<'a>;
|
||||
|
||||
/// Returns an iterator over substrings of `self` separated on
|
||||
/// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
|
||||
///
|
||||
/// The concatenation of the substrings returned by this function is just the original string.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```
|
||||
/// # use self::unicode_segmentation::UnicodeSegmentation;
|
||||
/// let ssbs = "Mr. Fox jumped. [...] The dog was too lazy.";
|
||||
/// let ssb1 = ssbs.split_sentence_bounds().collect::<Vec<&str>>();
|
||||
/// let b: &[_] = &["Mr. ", "Fox jumped. ", "[...] ", "The dog was too lazy."];
|
||||
///
|
||||
/// assert_eq!(&ssb1[..], b);
|
||||
/// ```
|
||||
fn split_sentence_bounds<'a>(&'a self) -> USentenceBounds<'a>;
|
||||
|
||||
/// Returns an iterator over substrings of `self`, split on UAX#29 sentence boundaries,
|
||||
/// and their offsets. See `split_sentence_bounds()` for more information.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```
|
||||
/// # use self::unicode_segmentation::UnicodeSegmentation;
|
||||
/// let ssis = "Mr. Fox jumped. [...] The dog was too lazy.";
|
||||
/// let ssi1 = ssis.split_sentence_bound_indices().collect::<Vec<(usize, &str)>>();
|
||||
/// let b: &[_] = &[(0, "Mr. "), (4, "Fox jumped. "), (16, "[...] "),
|
||||
/// (22, "The dog was too lazy.")];
|
||||
///
|
||||
/// assert_eq!(&ssi1[..], b);
|
||||
/// ```
|
||||
fn split_sentence_bound_indices<'a>(&'a self) -> USentenceBoundIndices<'a>;
|
||||
}
|
||||
|
||||
impl UnicodeSegmentation for str {
|
||||
#[inline]
|
||||
fn graphemes(&self, is_extended: bool) -> Graphemes {
|
||||
grapheme::new_graphemes(self, is_extended)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn grapheme_indices(&self, is_extended: bool) -> GraphemeIndices {
|
||||
grapheme::new_grapheme_indices(self, is_extended)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn unicode_words(&self) -> UnicodeWords {
|
||||
word::new_unicode_words(self)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn unicode_word_indices(&self) -> UnicodeWordIndices {
|
||||
word::new_unicode_word_indices(self)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn split_word_bounds(&self) -> UWordBounds {
|
||||
word::new_word_bounds(self)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn split_word_bound_indices(&self) -> UWordBoundIndices {
|
||||
word::new_word_bound_indices(self)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn unicode_sentences(&self) -> UnicodeSentences {
|
||||
sentence::new_unicode_sentences(self)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn split_sentence_bounds(&self) -> USentenceBounds {
|
||||
sentence::new_sentence_bounds(self)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn split_sentence_bound_indices(&self) -> USentenceBoundIndices {
|
||||
sentence::new_sentence_bound_indices(self)
|
||||
}
|
||||
}
|
|
@ -1,415 +0,0 @@
|
|||
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution and at
|
||||
// http://rust-lang.org/COPYRIGHT.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
use core::cmp;
|
||||
use core::iter::Filter;
|
||||
|
||||
// All of the logic for forward iteration over sentences
|
||||
mod fwd {
|
||||
use crate::tables::sentence::SentenceCat;
|
||||
use core::cmp;
|
||||
|
||||
// Describe a parsed part of source string as described in this table:
|
||||
// https://unicode.org/reports/tr29/#Default_Sentence_Boundaries
|
||||
#[derive(Clone, Copy, PartialEq, Eq)]
|
||||
enum StatePart {
|
||||
Sot,
|
||||
Eot,
|
||||
Other,
|
||||
CR,
|
||||
LF,
|
||||
Sep,
|
||||
ATerm,
|
||||
UpperLower,
|
||||
ClosePlus,
|
||||
SpPlus,
|
||||
STerm,
|
||||
}
|
||||
|
||||
#[derive(Clone, PartialEq, Eq)]
|
||||
struct SentenceBreaksState(pub [StatePart; 4]);
|
||||
|
||||
const INITIAL_STATE: SentenceBreaksState = SentenceBreaksState([
|
||||
StatePart::Sot,
|
||||
StatePart::Sot,
|
||||
StatePart::Sot,
|
||||
StatePart::Sot,
|
||||
]);
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct SentenceBreaks<'a> {
|
||||
pub string: &'a str,
|
||||
pos: usize,
|
||||
state: SentenceBreaksState,
|
||||
}
|
||||
|
||||
impl SentenceBreaksState {
|
||||
// Attempt to advance the internal state by one part
|
||||
// Whitespace and some punctutation will be collapsed
|
||||
fn next(&self, cat: SentenceCat) -> SentenceBreaksState {
|
||||
let &SentenceBreaksState(parts) = self;
|
||||
let parts = match (parts[3], cat) {
|
||||
(StatePart::ClosePlus, SentenceCat::SC_Close) => parts,
|
||||
(StatePart::SpPlus, SentenceCat::SC_Sp) => parts,
|
||||
_ => [
|
||||
parts[1],
|
||||
parts[2],
|
||||
parts[3],
|
||||
match cat {
|
||||
SentenceCat::SC_CR => StatePart::CR,
|
||||
SentenceCat::SC_LF => StatePart::LF,
|
||||
SentenceCat::SC_Sep => StatePart::Sep,
|
||||
SentenceCat::SC_ATerm => StatePart::ATerm,
|
||||
SentenceCat::SC_Upper | SentenceCat::SC_Lower => StatePart::UpperLower,
|
||||
SentenceCat::SC_Close => StatePart::ClosePlus,
|
||||
SentenceCat::SC_Sp => StatePart::SpPlus,
|
||||
SentenceCat::SC_STerm => StatePart::STerm,
|
||||
_ => StatePart::Other,
|
||||
},
|
||||
],
|
||||
};
|
||||
SentenceBreaksState(parts)
|
||||
}
|
||||
|
||||
fn end(&self) -> SentenceBreaksState {
|
||||
let &SentenceBreaksState(parts) = self;
|
||||
SentenceBreaksState([parts[1], parts[2], parts[3], StatePart::Eot])
|
||||
}
|
||||
|
||||
// Helper function to check if state head matches a single `StatePart`
|
||||
fn match1(&self, part: StatePart) -> bool {
|
||||
let &SentenceBreaksState(parts) = self;
|
||||
part == parts[3]
|
||||
}
|
||||
|
||||
// Helper function to check if first two `StateParts` in state match
|
||||
// the given two
|
||||
fn match2(&self, part1: StatePart, part2: StatePart) -> bool {
|
||||
let &SentenceBreaksState(parts) = self;
|
||||
part1 == parts[2] && part2 == parts[3]
|
||||
}
|
||||
}
|
||||
|
||||
// https://unicode.org/reports/tr29/#SB8
|
||||
// TODO cache this, it is currently quadratic
|
||||
fn match_sb8(state: &SentenceBreaksState, ahead: &str) -> bool {
|
||||
let &SentenceBreaksState(parts) = state;
|
||||
let mut idx = if parts[3] == StatePart::SpPlus { 2 } else { 3 };
|
||||
if parts[idx] == StatePart::ClosePlus {
|
||||
idx -= 1
|
||||
}
|
||||
|
||||
if parts[idx] == StatePart::ATerm {
|
||||
use crate::tables::sentence as se;
|
||||
|
||||
for next_char in ahead.chars() {
|
||||
//( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )* Lower
|
||||
match se::sentence_category(next_char).2 {
|
||||
se::SC_Lower => return true,
|
||||
se::SC_OLetter
|
||||
| se::SC_Upper
|
||||
| se::SC_Sep
|
||||
| se::SC_CR
|
||||
| se::SC_LF
|
||||
| se::SC_STerm
|
||||
| se::SC_ATerm => return false,
|
||||
_ => continue,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
false
|
||||
}
|
||||
|
||||
// https://unicode.org/reports/tr29/#SB8a
|
||||
fn match_sb8a(state: &SentenceBreaksState) -> bool {
|
||||
// SATerm Close* Sp*
|
||||
let &SentenceBreaksState(parts) = state;
|
||||
let mut idx = if parts[3] == StatePart::SpPlus { 2 } else { 3 };
|
||||
if parts[idx] == StatePart::ClosePlus {
|
||||
idx -= 1
|
||||
}
|
||||
parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm
|
||||
}
|
||||
|
||||
// https://unicode.org/reports/tr29/#SB9
|
||||
fn match_sb9(state: &SentenceBreaksState) -> bool {
|
||||
// SATerm Close*
|
||||
let &SentenceBreaksState(parts) = state;
|
||||
let idx = if parts[3] == StatePart::ClosePlus {
|
||||
2
|
||||
} else {
|
||||
3
|
||||
};
|
||||
parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm
|
||||
}
|
||||
|
||||
// https://unicode.org/reports/tr29/#SB11
|
||||
fn match_sb11(state: &SentenceBreaksState) -> bool {
|
||||
// SATerm Close* Sp* ParaSep?
|
||||
let &SentenceBreaksState(parts) = state;
|
||||
let mut idx = match parts[3] {
|
||||
StatePart::Sep | StatePart::CR | StatePart::LF => 2,
|
||||
_ => 3,
|
||||
};
|
||||
|
||||
if parts[idx] == StatePart::SpPlus {
|
||||
idx -= 1
|
||||
}
|
||||
if parts[idx] == StatePart::ClosePlus {
|
||||
idx -= 1
|
||||
}
|
||||
|
||||
parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm
|
||||
}
|
||||
|
||||
impl<'a> Iterator for SentenceBreaks<'a> {
|
||||
// Returns the index of the character which follows a break
|
||||
type Item = usize;
|
||||
|
||||
#[inline]
|
||||
fn size_hint(&self) -> (usize, Option<usize>) {
|
||||
let slen = self.string.len();
|
||||
// A sentence could be one character
|
||||
(cmp::min(slen, 2), Some(slen + 1))
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn next(&mut self) -> Option<usize> {
|
||||
use crate::tables::sentence as se;
|
||||
|
||||
for next_char in self.string[self.pos..].chars() {
|
||||
let position_before = self.pos;
|
||||
let state_before = self.state.clone();
|
||||
|
||||
let next_cat = se::sentence_category(next_char).2;
|
||||
|
||||
self.pos += next_char.len_utf8();
|
||||
self.state = self.state.next(next_cat);
|
||||
|
||||
match next_cat {
|
||||
// SB1 https://unicode.org/reports/tr29/#SB1
|
||||
_ if state_before.match1(StatePart::Sot) => return Some(position_before),
|
||||
|
||||
// SB2 is handled when inner iterator (chars) is finished
|
||||
|
||||
// SB3 https://unicode.org/reports/tr29/#SB3
|
||||
SentenceCat::SC_LF if state_before.match1(StatePart::CR) => continue,
|
||||
|
||||
// SB4 https://unicode.org/reports/tr29/#SB4
|
||||
_ if state_before.match1(StatePart::Sep)
|
||||
|| state_before.match1(StatePart::CR)
|
||||
|| state_before.match1(StatePart::LF) =>
|
||||
{
|
||||
return Some(position_before)
|
||||
}
|
||||
|
||||
// SB5 https://unicode.org/reports/tr29/#SB5
|
||||
SentenceCat::SC_Extend | SentenceCat::SC_Format => self.state = state_before,
|
||||
|
||||
// SB6 https://unicode.org/reports/tr29/#SB6
|
||||
SentenceCat::SC_Numeric if state_before.match1(StatePart::ATerm) => continue,
|
||||
|
||||
// SB7 https://unicode.org/reports/tr29/#SB7
|
||||
SentenceCat::SC_Upper
|
||||
if state_before.match2(StatePart::UpperLower, StatePart::ATerm) =>
|
||||
{
|
||||
continue
|
||||
}
|
||||
|
||||
// SB8 https://unicode.org/reports/tr29/#SB8
|
||||
_ if match_sb8(&state_before, &self.string[position_before..]) => continue,
|
||||
|
||||
// SB8a https://unicode.org/reports/tr29/#SB8a
|
||||
SentenceCat::SC_SContinue | SentenceCat::SC_STerm | SentenceCat::SC_ATerm
|
||||
if match_sb8a(&state_before) =>
|
||||
{
|
||||
continue
|
||||
}
|
||||
|
||||
// SB9 https://unicode.org/reports/tr29/#SB9
|
||||
SentenceCat::SC_Close
|
||||
| SentenceCat::SC_Sp
|
||||
| SentenceCat::SC_Sep
|
||||
| SentenceCat::SC_CR
|
||||
| SentenceCat::SC_LF
|
||||
if match_sb9(&state_before) =>
|
||||
{
|
||||
continue
|
||||
}
|
||||
|
||||
// SB10 https://unicode.org/reports/tr29/#SB10
|
||||
SentenceCat::SC_Sp
|
||||
| SentenceCat::SC_Sep
|
||||
| SentenceCat::SC_CR
|
||||
| SentenceCat::SC_LF
|
||||
if match_sb8a(&state_before) =>
|
||||
{
|
||||
continue
|
||||
}
|
||||
|
||||
// SB11 https://unicode.org/reports/tr29/#SB11
|
||||
_ if match_sb11(&state_before) => return Some(position_before),
|
||||
|
||||
// SB998 https://unicode.org/reports/tr29/#SB998
|
||||
_ => continue,
|
||||
}
|
||||
}
|
||||
|
||||
// SB2 https://unicode.org/reports/tr29/#SB2
|
||||
if self.state.match1(StatePart::Sot) {
|
||||
None
|
||||
} else if self.state.match1(StatePart::Eot) {
|
||||
None
|
||||
} else {
|
||||
self.state = self.state.end();
|
||||
Some(self.pos)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn new_sentence_breaks<'a>(source: &'a str) -> SentenceBreaks<'a> {
|
||||
SentenceBreaks {
|
||||
string: source,
|
||||
pos: 0,
|
||||
state: INITIAL_STATE,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// An iterator over the substrings of a string which, after splitting the string on
|
||||
/// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries),
|
||||
/// contain any characters with the
|
||||
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
|
||||
/// property, or with
|
||||
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
|
||||
///
|
||||
/// This struct is created by the [`unicode_sentences`] method on the [`UnicodeSegmentation`]
|
||||
/// trait. See its documentation for more.
|
||||
///
|
||||
/// [`unicode_sentences`]: trait.UnicodeSegmentation.html#tymethod.unicode_sentences
|
||||
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
|
||||
#[derive(Clone)]
|
||||
pub struct UnicodeSentences<'a> {
|
||||
inner: Filter<USentenceBounds<'a>, fn(&&str) -> bool>,
|
||||
}
|
||||
|
||||
/// External iterator for a string's
|
||||
/// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
|
||||
///
|
||||
/// This struct is created by the [`split_sentence_bounds`] method on the [`UnicodeSegmentation`]
|
||||
/// trait. See its documentation for more.
|
||||
///
|
||||
/// [`split_sentence_bounds`]: trait.UnicodeSegmentation.html#tymethod.split_sentence_bounds
|
||||
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
|
||||
#[derive(Clone)]
|
||||
pub struct USentenceBounds<'a> {
|
||||
iter: fwd::SentenceBreaks<'a>,
|
||||
sentence_start: Option<usize>,
|
||||
}
|
||||
|
||||
/// External iterator for sentence boundaries and byte offsets.
|
||||
///
|
||||
/// This struct is created by the [`split_sentence_bound_indices`] method on the
|
||||
/// [`UnicodeSegmentation`] trait. See its documentation for more.
|
||||
///
|
||||
/// [`split_sentence_bound_indices`]: trait.UnicodeSegmentation.html#tymethod.split_sentence_bound_indices
|
||||
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
|
||||
#[derive(Clone)]
|
||||
pub struct USentenceBoundIndices<'a> {
|
||||
start_offset: usize,
|
||||
iter: USentenceBounds<'a>,
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn new_sentence_bounds<'a>(source: &'a str) -> USentenceBounds<'a> {
|
||||
USentenceBounds {
|
||||
iter: fwd::new_sentence_breaks(source),
|
||||
sentence_start: None,
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn new_sentence_bound_indices<'a>(source: &'a str) -> USentenceBoundIndices<'a> {
|
||||
USentenceBoundIndices {
|
||||
start_offset: source.as_ptr() as usize,
|
||||
iter: new_sentence_bounds(source),
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn new_unicode_sentences<'b>(s: &'b str) -> UnicodeSentences<'b> {
|
||||
use super::UnicodeSegmentation;
|
||||
use crate::tables::util::is_alphanumeric;
|
||||
|
||||
fn has_alphanumeric(s: &&str) -> bool {
|
||||
s.chars().any(|c| is_alphanumeric(c))
|
||||
}
|
||||
let has_alphanumeric: fn(&&str) -> bool = has_alphanumeric; // coerce to fn pointer
|
||||
|
||||
UnicodeSentences {
|
||||
inner: s.split_sentence_bounds().filter(has_alphanumeric),
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Iterator for UnicodeSentences<'a> {
|
||||
type Item = &'a str;
|
||||
|
||||
#[inline]
|
||||
fn next(&mut self) -> Option<&'a str> {
|
||||
self.inner.next()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Iterator for USentenceBounds<'a> {
|
||||
type Item = &'a str;
|
||||
|
||||
#[inline]
|
||||
fn size_hint(&self) -> (usize, Option<usize>) {
|
||||
let (lower, upper) = self.iter.size_hint();
|
||||
(cmp::max(0, lower - 1), upper.map(|u| cmp::max(0, u - 1)))
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn next(&mut self) -> Option<&'a str> {
|
||||
if self.sentence_start == None {
|
||||
if let Some(start_pos) = self.iter.next() {
|
||||
self.sentence_start = Some(start_pos)
|
||||
} else {
|
||||
return None;
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(break_pos) = self.iter.next() {
|
||||
let start_pos = self.sentence_start.unwrap();
|
||||
let sentence = &self.iter.string[start_pos..break_pos];
|
||||
self.sentence_start = Some(break_pos);
|
||||
Some(sentence)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Iterator for USentenceBoundIndices<'a> {
|
||||
type Item = (usize, &'a str);
|
||||
|
||||
#[inline]
|
||||
fn next(&mut self) -> Option<(usize, &'a str)> {
|
||||
self.iter
|
||||
.next()
|
||||
.map(|s| (s.as_ptr() as usize - self.start_offset, s))
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn size_hint(&self) -> (usize, Option<usize>) {
|
||||
self.iter.size_hint()
|
||||
}
|
||||
}
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -1,247 +0,0 @@
|
|||
// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution and at
|
||||
// http://rust-lang.org/COPYRIGHT.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
use super::UnicodeSegmentation;
|
||||
|
||||
use std::prelude::v1::*;
|
||||
|
||||
#[test]
|
||||
fn test_graphemes() {
|
||||
use crate::testdata::{TEST_DIFF, TEST_SAME};
|
||||
|
||||
pub const EXTRA_DIFF: &'static [(
|
||||
&'static str,
|
||||
&'static [&'static str],
|
||||
&'static [&'static str],
|
||||
)] = &[
|
||||
// Official test suite doesn't include two Prepend chars between two other chars.
|
||||
(
|
||||
"\u{20}\u{600}\u{600}\u{20}",
|
||||
&["\u{20}", "\u{600}\u{600}\u{20}"],
|
||||
&["\u{20}", "\u{600}", "\u{600}", "\u{20}"],
|
||||
),
|
||||
// Test for Prepend followed by two Any chars
|
||||
(
|
||||
"\u{600}\u{20}\u{20}",
|
||||
&["\u{600}\u{20}", "\u{20}"],
|
||||
&["\u{600}", "\u{20}", "\u{20}"],
|
||||
),
|
||||
];
|
||||
|
||||
pub const EXTRA_SAME: &'static [(&'static str, &'static [&'static str])] = &[
|
||||
// family emoji (more than two emoji joined by ZWJ)
|
||||
(
|
||||
"\u{1f468}\u{200d}\u{1f467}\u{200d}\u{1f466}",
|
||||
&["\u{1f468}\u{200d}\u{1f467}\u{200d}\u{1f466}"],
|
||||
),
|
||||
// cartwheel emoji followed by two fitzpatrick skin tone modifiers
|
||||
// (test case from issue #19)
|
||||
(
|
||||
"\u{1F938}\u{1F3FE}\u{1F3FE}",
|
||||
&["\u{1F938}\u{1F3FE}\u{1F3FE}"],
|
||||
),
|
||||
];
|
||||
|
||||
for &(s, g) in TEST_SAME.iter().chain(EXTRA_SAME) {
|
||||
// test forward iterator
|
||||
assert!(UnicodeSegmentation::graphemes(s, true).eq(g.iter().cloned()));
|
||||
assert!(UnicodeSegmentation::graphemes(s, false).eq(g.iter().cloned()));
|
||||
|
||||
// test reverse iterator
|
||||
assert!(UnicodeSegmentation::graphemes(s, true)
|
||||
.rev()
|
||||
.eq(g.iter().rev().cloned()));
|
||||
assert!(UnicodeSegmentation::graphemes(s, false)
|
||||
.rev()
|
||||
.eq(g.iter().rev().cloned()));
|
||||
}
|
||||
|
||||
for &(s, gt, gf) in TEST_DIFF.iter().chain(EXTRA_DIFF) {
|
||||
// test forward iterator
|
||||
assert!(UnicodeSegmentation::graphemes(s, true).eq(gt.iter().cloned()));
|
||||
assert!(UnicodeSegmentation::graphemes(s, false).eq(gf.iter().cloned()));
|
||||
|
||||
// test reverse iterator
|
||||
assert!(UnicodeSegmentation::graphemes(s, true)
|
||||
.rev()
|
||||
.eq(gt.iter().rev().cloned()));
|
||||
assert!(UnicodeSegmentation::graphemes(s, false)
|
||||
.rev()
|
||||
.eq(gf.iter().rev().cloned()));
|
||||
}
|
||||
|
||||
// test the indices iterators
|
||||
let s = "a̐éö̲\r\n";
|
||||
let gr_inds = UnicodeSegmentation::grapheme_indices(s, true).collect::<Vec<(usize, &str)>>();
|
||||
let b: &[_] = &[(0, "a̐"), (3, "é"), (6, "ö̲"), (11, "\r\n")];
|
||||
assert_eq!(gr_inds, b);
|
||||
let gr_inds = UnicodeSegmentation::grapheme_indices(s, true)
|
||||
.rev()
|
||||
.collect::<Vec<(usize, &str)>>();
|
||||
let b: &[_] = &[(11, "\r\n"), (6, "ö̲"), (3, "é"), (0, "a̐")];
|
||||
assert_eq!(gr_inds, b);
|
||||
let mut gr_inds_iter = UnicodeSegmentation::grapheme_indices(s, true);
|
||||
{
|
||||
let gr_inds = gr_inds_iter.by_ref();
|
||||
let e1 = gr_inds.size_hint();
|
||||
assert_eq!(e1, (1, Some(13)));
|
||||
let c = gr_inds.count();
|
||||
assert_eq!(c, 4);
|
||||
}
|
||||
let e2 = gr_inds_iter.size_hint();
|
||||
assert_eq!(e2, (0, Some(0)));
|
||||
|
||||
// make sure the reverse iterator does the right thing with "\n" at beginning of string
|
||||
let s = "\n\r\n\r";
|
||||
let gr = UnicodeSegmentation::graphemes(s, true)
|
||||
.rev()
|
||||
.collect::<Vec<&str>>();
|
||||
let b: &[_] = &["\r", "\r\n", "\n"];
|
||||
assert_eq!(gr, b);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_words() {
|
||||
use crate::testdata::TEST_WORD;
|
||||
|
||||
// Unicode's official tests don't really test longer chains of flag emoji
|
||||
// TODO This could be improved with more tests like flag emoji with interspersed Extend chars and ZWJ
|
||||
const EXTRA_TESTS: &'static [(&'static str, &'static [&'static str])] = &[
|
||||
(
|
||||
"🇦🇫🇦🇽🇦🇱🇩🇿🇦🇸🇦🇩🇦🇴",
|
||||
&["🇦🇫", "🇦🇽", "🇦🇱", "🇩🇿", "🇦🇸", "🇦🇩", "🇦🇴"],
|
||||
),
|
||||
("🇦🇫🇦🇽🇦🇱🇩🇿🇦🇸🇦🇩🇦", &["🇦🇫", "🇦🇽", "🇦🇱", "🇩🇿", "🇦🇸", "🇦🇩", "🇦"]),
|
||||
(
|
||||
"🇦a🇫🇦🇽a🇦🇱🇩🇿🇦🇸🇦🇩🇦",
|
||||
&["🇦", "a", "🇫🇦", "🇽", "a", "🇦🇱", "🇩🇿", "🇦🇸", "🇦🇩", "🇦"],
|
||||
),
|
||||
(
|
||||
"\u{1f468}\u{200d}\u{1f468}\u{200d}\u{1f466}",
|
||||
&["\u{1f468}\u{200d}\u{1f468}\u{200d}\u{1f466}"],
|
||||
),
|
||||
("😌👎🏼", &["😌", "👎🏼"]),
|
||||
// perhaps wrong, spaces should not be included?
|
||||
("hello world", &["hello", " ", "world"]),
|
||||
("🇨🇦🇨🇭🇿🇲🇿 hi", &["🇨🇦", "🇨🇭", "🇿🇲", "🇿", " ", "hi"]),
|
||||
];
|
||||
for &(s, w) in TEST_WORD.iter().chain(EXTRA_TESTS.iter()) {
|
||||
macro_rules! assert_ {
|
||||
($test:expr, $exp:expr, $name:expr) => {
|
||||
// collect into vector for better diagnostics in failure case
|
||||
let testing = $test.collect::<Vec<_>>();
|
||||
let expected = $exp.collect::<Vec<_>>();
|
||||
assert_eq!(
|
||||
testing, expected,
|
||||
"{} test for testcase ({:?}, {:?}) failed.",
|
||||
$name, s, w
|
||||
)
|
||||
};
|
||||
}
|
||||
// test forward iterator
|
||||
assert_!(
|
||||
s.split_word_bounds(),
|
||||
w.iter().cloned(),
|
||||
"Forward word boundaries"
|
||||
);
|
||||
|
||||
// test reverse iterator
|
||||
assert_!(
|
||||
s.split_word_bounds().rev(),
|
||||
w.iter().rev().cloned(),
|
||||
"Reverse word boundaries"
|
||||
);
|
||||
|
||||
// generate offsets from word string lengths
|
||||
let mut indices = vec![0];
|
||||
for i in w.iter().cloned().map(|s| s.len()).scan(0, |t, n| {
|
||||
*t += n;
|
||||
Some(*t)
|
||||
}) {
|
||||
indices.push(i);
|
||||
}
|
||||
indices.pop();
|
||||
let indices = indices;
|
||||
|
||||
// test forward indices iterator
|
||||
assert_!(
|
||||
s.split_word_bound_indices().map(|(l, _)| l),
|
||||
indices.iter().cloned(),
|
||||
"Forward word indices"
|
||||
);
|
||||
|
||||
// test backward indices iterator
|
||||
assert_!(
|
||||
s.split_word_bound_indices().rev().map(|(l, _)| l),
|
||||
indices.iter().rev().cloned(),
|
||||
"Reverse word indices"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_sentences() {
|
||||
use crate::testdata::TEST_SENTENCE;
|
||||
|
||||
for &(s, w) in TEST_SENTENCE.iter() {
|
||||
macro_rules! assert_ {
|
||||
($test:expr, $exp:expr, $name:expr) => {
|
||||
// collect into vector for better diagnostics in failure case
|
||||
let testing = $test.collect::<Vec<_>>();
|
||||
let expected = $exp.collect::<Vec<_>>();
|
||||
assert_eq!(
|
||||
testing, expected,
|
||||
"{} test for testcase ({:?}, {:?}) failed.",
|
||||
$name, s, w
|
||||
)
|
||||
};
|
||||
}
|
||||
|
||||
assert_!(
|
||||
s.split_sentence_bounds(),
|
||||
w.iter().cloned(),
|
||||
"Forward sentence boundaries"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
quickcheck! {
|
||||
fn quickcheck_forward_reverse_graphemes_extended(s: String) -> bool {
|
||||
let a = s.graphemes(true).collect::<Vec<_>>();
|
||||
let mut b = s.graphemes(true).rev().collect::<Vec<_>>();
|
||||
b.reverse();
|
||||
a == b
|
||||
}
|
||||
|
||||
fn quickcheck_forward_reverse_graphemes_legacy(s: String) -> bool {
|
||||
let a = s.graphemes(false).collect::<Vec<_>>();
|
||||
let mut b = s.graphemes(false).rev().collect::<Vec<_>>();
|
||||
b.reverse();
|
||||
a == b
|
||||
}
|
||||
|
||||
fn quickcheck_join_graphemes(s: String) -> bool {
|
||||
let a = s.graphemes(true).collect::<String>();
|
||||
let b = s.graphemes(false).collect::<String>();
|
||||
a == s && b == s
|
||||
}
|
||||
|
||||
fn quickcheck_forward_reverse_words(s: String) -> bool {
|
||||
let a = s.split_word_bounds().collect::<Vec<_>>();
|
||||
let mut b = s.split_word_bounds().rev().collect::<Vec<_>>();
|
||||
b.reverse();
|
||||
a == b
|
||||
}
|
||||
|
||||
fn quickcheck_join_words(s: String) -> bool {
|
||||
let a = s.split_word_bounds().collect::<String>();
|
||||
a == s
|
||||
}
|
||||
}
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -1,754 +0,0 @@
|
|||
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution and at
|
||||
// http://rust-lang.org/COPYRIGHT.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
use core::cmp;
|
||||
use core::iter::Filter;
|
||||
|
||||
use crate::tables::word::WordCat;
|
||||
|
||||
/// An iterator over the substrings of a string which, after splitting the string on
|
||||
/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries),
|
||||
/// contain any characters with the
|
||||
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
|
||||
/// property, or with
|
||||
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
|
||||
///
|
||||
/// This struct is created by the [`unicode_words`] method on the [`UnicodeSegmentation`] trait. See
|
||||
/// its documentation for more.
|
||||
///
|
||||
/// [`unicode_words`]: trait.UnicodeSegmentation.html#tymethod.unicode_words
|
||||
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
|
||||
pub struct UnicodeWords<'a> {
|
||||
inner: Filter<UWordBounds<'a>, fn(&&str) -> bool>,
|
||||
}
|
||||
|
||||
impl<'a> Iterator for UnicodeWords<'a> {
|
||||
type Item = &'a str;
|
||||
|
||||
#[inline]
|
||||
fn next(&mut self) -> Option<&'a str> {
|
||||
self.inner.next()
|
||||
}
|
||||
}
|
||||
impl<'a> DoubleEndedIterator for UnicodeWords<'a> {
|
||||
#[inline]
|
||||
fn next_back(&mut self) -> Option<&'a str> {
|
||||
self.inner.next_back()
|
||||
}
|
||||
}
|
||||
|
||||
/// An iterator over the substrings of a string which, after splitting the string on
|
||||
/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries),
|
||||
/// contain any characters with the
|
||||
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
|
||||
/// property, or with
|
||||
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
|
||||
/// This iterator also provides the byte offsets for each substring.
|
||||
///
|
||||
/// This struct is created by the [`unicode_word_indices`] method on the [`UnicodeSegmentation`] trait. See
|
||||
/// its documentation for more.
|
||||
///
|
||||
/// [`unicode_word_indices`]: trait.UnicodeSegmentation.html#tymethod.unicode_word_indices
|
||||
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
|
||||
pub struct UnicodeWordIndices<'a> {
|
||||
inner: Filter<UWordBoundIndices<'a>, fn(&(usize, &str)) -> bool>,
|
||||
}
|
||||
|
||||
impl<'a> Iterator for UnicodeWordIndices<'a> {
|
||||
type Item = (usize, &'a str);
|
||||
|
||||
#[inline]
|
||||
fn next(&mut self) -> Option<(usize, &'a str)> {
|
||||
self.inner.next()
|
||||
}
|
||||
}
|
||||
impl<'a> DoubleEndedIterator for UnicodeWordIndices<'a> {
|
||||
#[inline]
|
||||
fn next_back(&mut self) -> Option<(usize, &'a str)> {
|
||||
self.inner.next_back()
|
||||
}
|
||||
}
|
||||
|
||||
/// External iterator for a string's
|
||||
/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
|
||||
///
|
||||
/// This struct is created by the [`split_word_bounds`] method on the [`UnicodeSegmentation`]
|
||||
/// trait. See its documentation for more.
|
||||
///
|
||||
/// [`split_word_bounds`]: trait.UnicodeSegmentation.html#tymethod.split_word_bounds
|
||||
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
|
||||
#[derive(Clone)]
|
||||
pub struct UWordBounds<'a> {
|
||||
string: &'a str,
|
||||
cat: Option<WordCat>,
|
||||
catb: Option<WordCat>,
|
||||
}
|
||||
|
||||
/// External iterator for word boundaries and byte offsets.
|
||||
///
|
||||
/// This struct is created by the [`split_word_bound_indices`] method on the
|
||||
/// [`UnicodeSegmentation`] trait. See its documentation for more.
|
||||
///
|
||||
/// [`split_word_bound_indices`]: trait.UnicodeSegmentation.html#tymethod.split_word_bound_indices
|
||||
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
|
||||
#[derive(Clone)]
|
||||
pub struct UWordBoundIndices<'a> {
|
||||
start_offset: usize,
|
||||
iter: UWordBounds<'a>,
|
||||
}
|
||||
|
||||
impl<'a> UWordBoundIndices<'a> {
|
||||
#[inline]
|
||||
/// View the underlying data (the part yet to be iterated) as a slice of the original string.
|
||||
///
|
||||
/// ```rust
|
||||
/// # use unicode_segmentation::UnicodeSegmentation;
|
||||
/// let mut iter = "Hello world".split_word_bound_indices();
|
||||
/// assert_eq!(iter.as_str(), "Hello world");
|
||||
/// iter.next();
|
||||
/// assert_eq!(iter.as_str(), " world");
|
||||
/// iter.next();
|
||||
/// assert_eq!(iter.as_str(), "world");
|
||||
/// ```
|
||||
pub fn as_str(&self) -> &'a str {
|
||||
self.iter.as_str()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Iterator for UWordBoundIndices<'a> {
|
||||
type Item = (usize, &'a str);
|
||||
|
||||
#[inline]
|
||||
fn next(&mut self) -> Option<(usize, &'a str)> {
|
||||
self.iter
|
||||
.next()
|
||||
.map(|s| (s.as_ptr() as usize - self.start_offset, s))
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn size_hint(&self) -> (usize, Option<usize>) {
|
||||
self.iter.size_hint()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> DoubleEndedIterator for UWordBoundIndices<'a> {
|
||||
#[inline]
|
||||
fn next_back(&mut self) -> Option<(usize, &'a str)> {
|
||||
self.iter
|
||||
.next_back()
|
||||
.map(|s| (s.as_ptr() as usize - self.start_offset, s))
|
||||
}
|
||||
}
|
||||
|
||||
// state machine for word boundary rules
|
||||
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
|
||||
enum UWordBoundsState {
|
||||
Start,
|
||||
Letter,
|
||||
HLetter,
|
||||
Numeric,
|
||||
Katakana,
|
||||
ExtendNumLet,
|
||||
Regional(RegionalState),
|
||||
FormatExtend(FormatExtendType),
|
||||
Zwj,
|
||||
Emoji,
|
||||
WSegSpace,
|
||||
}
|
||||
|
||||
// subtypes for FormatExtend state in UWordBoundsState
|
||||
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
|
||||
enum FormatExtendType {
|
||||
AcceptAny,
|
||||
AcceptNone,
|
||||
RequireLetter,
|
||||
RequireHLetter,
|
||||
AcceptQLetter,
|
||||
RequireNumeric,
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
|
||||
enum RegionalState {
|
||||
Half,
|
||||
Full,
|
||||
Unknown,
|
||||
}
|
||||
|
||||
fn is_emoji(ch: char) -> bool {
|
||||
use crate::tables::emoji;
|
||||
emoji::emoji_category(ch).2 == emoji::EmojiCat::EC_Extended_Pictographic
|
||||
}
|
||||
|
||||
impl<'a> Iterator for UWordBounds<'a> {
|
||||
type Item = &'a str;
|
||||
|
||||
#[inline]
|
||||
fn size_hint(&self) -> (usize, Option<usize>) {
|
||||
let slen = self.string.len();
|
||||
(cmp::min(slen, 1), Some(slen))
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn next(&mut self) -> Option<&'a str> {
|
||||
use self::FormatExtendType::*;
|
||||
use self::UWordBoundsState::*;
|
||||
use crate::tables::word as wd;
|
||||
if self.string.len() == 0 {
|
||||
return None;
|
||||
}
|
||||
|
||||
let mut take_curr = true;
|
||||
let mut take_cat = true;
|
||||
let mut idx = 0;
|
||||
let mut saveidx = 0;
|
||||
let mut state = Start;
|
||||
let mut cat = wd::WC_Any;
|
||||
let mut savecat = wd::WC_Any;
|
||||
|
||||
// If extend/format/zwj were skipped. Handles precedence of WB3d over WB4
|
||||
let mut skipped_format_extend = false;
|
||||
for (curr, ch) in self.string.char_indices() {
|
||||
idx = curr;
|
||||
// Whether or not the previous category was ZWJ
|
||||
// ZWJs get collapsed, so this handles precedence of WB3c over WB4
|
||||
let prev_zwj = cat == wd::WC_ZWJ;
|
||||
// if there's a category cached, grab it
|
||||
cat = match self.cat {
|
||||
None => wd::word_category(ch).2,
|
||||
_ => self.cat.take().unwrap(),
|
||||
};
|
||||
take_cat = true;
|
||||
|
||||
// handle rule WB4
|
||||
// just skip all format, extend, and zwj chars
|
||||
// note that Start is a special case: if there's a bunch of Format | Extend
|
||||
// characters at the beginning of a block of text, dump them out as one unit.
|
||||
//
|
||||
// (This is not obvious from the wording of UAX#29, but if you look at the
|
||||
// test cases http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt
|
||||
// then the "correct" interpretation of WB4 becomes apparent.)
|
||||
if state != Start {
|
||||
match cat {
|
||||
wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => {
|
||||
skipped_format_extend = true;
|
||||
continue;
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
// rule WB3c
|
||||
// WB4 makes all ZWJs collapse into the previous state
|
||||
// but you can still be in a Zwj state if you started with Zwj
|
||||
//
|
||||
// This means that an EP + Zwj will collapse into EP, which is wrong,
|
||||
// since EP+EP is not a boundary but EP+ZWJ+EP is
|
||||
//
|
||||
// Thus, we separately keep track of whether or not the last character
|
||||
// was a ZWJ. This is an additional bit of state tracked outside of the
|
||||
// state enum; the state enum represents the last non-zwj state encountered.
|
||||
// When prev_zwj is true, for the purposes of WB3c, we are in the Zwj state,
|
||||
// however we are in the previous state for the purposes of all other rules.
|
||||
if prev_zwj {
|
||||
if is_emoji(ch) {
|
||||
state = Emoji;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
// Don't use `continue` in this match without updating `cat`
|
||||
state = match state {
|
||||
Start if cat == wd::WC_CR => {
|
||||
idx += match self.get_next_cat(idx) {
|
||||
Some(ncat) if ncat == wd::WC_LF => 1, // rule WB3
|
||||
_ => 0,
|
||||
};
|
||||
break; // rule WB3a
|
||||
}
|
||||
Start => match cat {
|
||||
wd::WC_ALetter => Letter, // rule WB5, WB6, WB9, WB13a
|
||||
wd::WC_Hebrew_Letter => HLetter, // rule WB5, WB6, WB7a, WB7b, WB9, WB13a
|
||||
wd::WC_Numeric => Numeric, // rule WB8, WB10, WB12, WB13a
|
||||
wd::WC_Katakana => Katakana, // rule WB13, WB13a
|
||||
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a, WB13b
|
||||
wd::WC_Regional_Indicator => Regional(RegionalState::Half), // rule WB13c
|
||||
wd::WC_LF | wd::WC_Newline => break, // rule WB3a
|
||||
wd::WC_ZWJ => Zwj, // rule WB3c
|
||||
wd::WC_WSegSpace => WSegSpace, // rule WB3d
|
||||
_ => {
|
||||
if let Some(ncat) = self.get_next_cat(idx) {
|
||||
// rule WB4
|
||||
if ncat == wd::WC_Format || ncat == wd::WC_Extend || ncat == wd::WC_ZWJ
|
||||
{
|
||||
state = FormatExtend(AcceptNone);
|
||||
self.cat = Some(ncat);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
break; // rule WB999
|
||||
}
|
||||
},
|
||||
WSegSpace => match cat {
|
||||
wd::WC_WSegSpace if !skipped_format_extend => WSegSpace,
|
||||
_ => {
|
||||
take_curr = false;
|
||||
break;
|
||||
}
|
||||
},
|
||||
Zwj => {
|
||||
// We already handle WB3c above.
|
||||
take_curr = false;
|
||||
break;
|
||||
}
|
||||
Letter | HLetter => match cat {
|
||||
wd::WC_ALetter => Letter, // rule WB5
|
||||
wd::WC_Hebrew_Letter => HLetter, // rule WB5
|
||||
wd::WC_Numeric => Numeric, // rule WB9
|
||||
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
|
||||
wd::WC_Double_Quote if state == HLetter => {
|
||||
savecat = cat;
|
||||
saveidx = idx;
|
||||
FormatExtend(RequireHLetter) // rule WB7b
|
||||
}
|
||||
wd::WC_Single_Quote if state == HLetter => {
|
||||
FormatExtend(AcceptQLetter) // rule WB7a
|
||||
}
|
||||
wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => {
|
||||
savecat = cat;
|
||||
saveidx = idx;
|
||||
FormatExtend(RequireLetter) // rule WB6
|
||||
}
|
||||
_ => {
|
||||
take_curr = false;
|
||||
break;
|
||||
}
|
||||
},
|
||||
Numeric => match cat {
|
||||
wd::WC_Numeric => Numeric, // rule WB8
|
||||
wd::WC_ALetter => Letter, // rule WB10
|
||||
wd::WC_Hebrew_Letter => HLetter, // rule WB10
|
||||
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
|
||||
wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => {
|
||||
savecat = cat;
|
||||
saveidx = idx;
|
||||
FormatExtend(RequireNumeric) // rule WB12
|
||||
}
|
||||
_ => {
|
||||
take_curr = false;
|
||||
break;
|
||||
}
|
||||
},
|
||||
Katakana => match cat {
|
||||
wd::WC_Katakana => Katakana, // rule WB13
|
||||
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
|
||||
_ => {
|
||||
take_curr = false;
|
||||
break;
|
||||
}
|
||||
},
|
||||
ExtendNumLet => match cat {
|
||||
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
|
||||
wd::WC_ALetter => Letter, // rule WB13b
|
||||
wd::WC_Hebrew_Letter => HLetter, // rule WB13b
|
||||
wd::WC_Numeric => Numeric, // rule WB13b
|
||||
wd::WC_Katakana => Katakana, // rule WB13b
|
||||
_ => {
|
||||
take_curr = false;
|
||||
break;
|
||||
}
|
||||
},
|
||||
Regional(RegionalState::Full) => {
|
||||
// if it reaches here we've gone too far,
|
||||
// a full flag can only compose with ZWJ/Extend/Format
|
||||
// proceeding it.
|
||||
take_curr = false;
|
||||
break;
|
||||
}
|
||||
Regional(RegionalState::Half) => match cat {
|
||||
wd::WC_Regional_Indicator => Regional(RegionalState::Full), // rule WB13c
|
||||
_ => {
|
||||
take_curr = false;
|
||||
break;
|
||||
}
|
||||
},
|
||||
Regional(_) => {
|
||||
unreachable!("RegionalState::Unknown should not occur on forward iteration")
|
||||
}
|
||||
Emoji => {
|
||||
// We already handle WB3c above. If you've reached this point, the emoji sequence is over.
|
||||
take_curr = false;
|
||||
break;
|
||||
}
|
||||
FormatExtend(t) => match t {
|
||||
// handle FormatExtends depending on what type
|
||||
RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB11
|
||||
RequireLetter | AcceptQLetter if cat == wd::WC_ALetter => Letter, // rule WB7
|
||||
RequireLetter | AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // WB7a
|
||||
RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7b
|
||||
AcceptNone | AcceptQLetter => {
|
||||
take_curr = false; // emit all the Format|Extend characters
|
||||
take_cat = false;
|
||||
break;
|
||||
}
|
||||
_ => break, // rewind (in if statement below)
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
if let FormatExtend(t) = state {
|
||||
// we were looking for something and didn't find it; we have to back up
|
||||
if t == RequireLetter || t == RequireHLetter || t == RequireNumeric {
|
||||
idx = saveidx;
|
||||
cat = savecat;
|
||||
take_curr = false;
|
||||
}
|
||||
}
|
||||
|
||||
self.cat = if take_curr {
|
||||
idx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
|
||||
None
|
||||
} else if take_cat {
|
||||
Some(cat)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let retstr = &self.string[..idx];
|
||||
self.string = &self.string[idx..];
|
||||
Some(retstr)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> DoubleEndedIterator for UWordBounds<'a> {
    #[inline]
    /// Walks the UAX #29 word-boundary state machine backwards over the tail
    /// of the remaining string and splits off the trailing segment.
    ///
    /// Mirrors the forward `next`, with extra bookkeeping: `self.catb` caches
    /// the word category of the character just before the cut point so the
    /// next call does not have to re-derive it.
    fn next_back(&mut self) -> Option<&'a str> {
        use self::FormatExtendType::*;
        use self::UWordBoundsState::*;
        use crate::tables::word as wd;
        if self.string.len() == 0 {
            return None;
        }

        // `take_curr`: include the character that terminated the scan in the
        // returned segment; `take_cat`: when it is excluded, cache its
        // category in `self.catb` for the following call.
        let mut take_curr = true;
        let mut take_cat = true;
        let mut idx = self.string.len();
        idx -= self.string.chars().next_back().unwrap().len_utf8();
        let mut previdx = idx;
        let mut saveidx = idx;
        let mut state = Start;
        let mut savestate = Start;
        let mut cat = wd::WC_Any;

        let mut skipped_format_extend = false;

        for (curr, ch) in self.string.char_indices().rev() {
            previdx = idx;
            idx = curr;

            // if there's a category cached, grab it
            cat = match self.catb {
                None => wd::word_category(ch).2,
                _ => self.catb.take().unwrap(),
            };
            take_cat = true;

            // backward iterator over word boundaries. Mostly the same as the forward
            // iterator, with two weirdnesses:
            // (1) If we encounter a single quote in the Start state, we have to check for a
            //     Hebrew Letter immediately before it.
            // (2) Format and Extend char handling takes some gymnastics.

            if cat == wd::WC_Extend || cat == wd::WC_Format || (cat == wd::WC_ZWJ && state != Zwj) {
                // WB3c has more priority so we should not
                // fold in that case
                if match state {
                    FormatExtend(_) | Start => false,
                    _ => true,
                } {
                    // Remember where we were so this run of Format/Extend
                    // characters can be folded into the surrounding state
                    // afterwards (rule WB4).
                    saveidx = previdx;
                    savestate = state;
                    state = FormatExtend(AcceptNone);
                }

                if state != Start {
                    continue;
                }
            } else if state == FormatExtend(AcceptNone) {
                // finished a scan of some Format|Extend chars, restore previous state
                state = savestate;
                previdx = saveidx;
                take_cat = false;
                skipped_format_extend = true;
            }

            // Don't use `continue` in this match without updating `catb`
            state = match state {
                Start | FormatExtend(AcceptAny) => match cat {
                    _ if is_emoji(ch) => Zwj,
                    wd::WC_ALetter => Letter,            // rule WB5, WB7, WB10, WB13b
                    wd::WC_Hebrew_Letter => HLetter,     // rule WB5, WB7, WB7c, WB10, WB13b
                    wd::WC_Numeric => Numeric,           // rule WB8, WB9, WB11, WB13b
                    wd::WC_Katakana => Katakana,         // rule WB13, WB13b
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
                    wd::WC_Regional_Indicator => Regional(RegionalState::Unknown), // rule WB13c
                    // rule WB4:
                    wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => FormatExtend(AcceptAny),
                    wd::WC_Single_Quote => {
                        saveidx = idx;
                        FormatExtend(AcceptQLetter) // rule WB7a
                    }
                    wd::WC_WSegSpace => WSegSpace,
                    wd::WC_CR | wd::WC_LF | wd::WC_Newline => {
                        if state == Start {
                            if cat == wd::WC_LF {
                                // Pull a preceding CR into the segment so
                                // CRLF is never split (rule WB3).
                                idx -= match self.get_prev_cat(idx) {
                                    Some(pcat) if pcat == wd::WC_CR => 1, // rule WB3
                                    _ => 0,
                                };
                            }
                        } else {
                            take_curr = false;
                        }
                        break; // rule WB3a
                    }
                    _ => break, // rule WB999
                },
                Zwj => match cat {
                    // rule WB3c
                    wd::WC_ZWJ => FormatExtend(AcceptAny),
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                WSegSpace => match cat {
                    // rule WB3d
                    wd::WC_WSegSpace if !skipped_format_extend => WSegSpace,
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Letter | HLetter => match cat {
                    wd::WC_ALetter => Letter,            // rule WB5
                    wd::WC_Hebrew_Letter => HLetter,     // rule WB5
                    wd::WC_Numeric => Numeric,           // rule WB10
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
                    wd::WC_Double_Quote if state == HLetter => {
                        saveidx = previdx;
                        FormatExtend(RequireHLetter) // rule WB7c
                    }
                    wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => {
                        saveidx = previdx;
                        FormatExtend(RequireLetter) // rule WB7
                    }
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Numeric => match cat {
                    wd::WC_Numeric => Numeric,           // rule WB8
                    wd::WC_ALetter => Letter,            // rule WB9
                    wd::WC_Hebrew_Letter => HLetter,     // rule WB9
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
                    wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => {
                        saveidx = previdx;
                        FormatExtend(RequireNumeric) // rule WB11
                    }
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Katakana => match cat {
                    wd::WC_Katakana => Katakana,         // rule WB13
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                ExtendNumLet => match cat {
                    wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
                    wd::WC_ALetter => Letter,            // rule WB13a
                    wd::WC_Hebrew_Letter => HLetter,     // rule WB13a
                    wd::WC_Numeric => Numeric,           // rule WB13a
                    wd::WC_Katakana => Katakana,         // rule WB13a
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Regional(mut regional_state) => match cat {
                    // rule WB13c
                    wd::WC_Regional_Indicator => {
                        if regional_state == RegionalState::Unknown {
                            // Going backwards we can't tell locally whether
                            // this RI opens or closes a flag pair; count the
                            // contiguous RIs before it (skipping folded
                            // Format/Extend/ZWJ per WB4) and use the parity.
                            let count = self.string[..previdx]
                                .chars()
                                .rev()
                                .map(|c| wd::word_category(c).2)
                                .filter(|&c| {
                                    !(c == wd::WC_ZWJ || c == wd::WC_Extend || c == wd::WC_Format)
                                })
                                .take_while(|&c| c == wd::WC_Regional_Indicator)
                                .count();
                            regional_state = if count % 2 == 0 {
                                RegionalState::Full
                            } else {
                                RegionalState::Half
                            };
                        }
                        if regional_state == RegionalState::Full {
                            take_curr = false;
                            break;
                        } else {
                            Regional(RegionalState::Full)
                        }
                    }
                    _ => {
                        take_curr = false;
                        break;
                    }
                },
                Emoji => {
                    if is_emoji(ch) {
                        // rule WB3c
                        Zwj
                    } else {
                        take_curr = false;
                        break;
                    }
                }
                FormatExtend(t) => match t {
                    RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB12
                    RequireLetter if cat == wd::WC_ALetter => Letter,   // rule WB6
                    RequireLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB6
                    AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7a
                    RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7b
                    _ => break, // backtrack will happen (in the if below)
                },
            }
        }

        if let FormatExtend(t) = state {
            // if we required something but didn't find it, backtrack
            if t == RequireLetter
                || t == RequireHLetter
                || t == RequireNumeric
                || t == AcceptNone
                || t == AcceptQLetter
            {
                previdx = saveidx;
                take_cat = false;
                take_curr = false;
            }
        }

        self.catb = if take_curr {
            None
        } else {
            // Terminating char is excluded: cut before it, and optionally
            // cache its category so the next call doesn't recompute it.
            idx = previdx;
            if take_cat {
                Some(cat)
            } else {
                None
            }
        };

        let retstr = &self.string[idx..];
        self.string = &self.string[..idx];
        Some(retstr)
    }
}
|
||||
|
||||
impl<'a> UWordBounds<'a> {
|
||||
#[inline]
|
||||
/// View the underlying data (the part yet to be iterated) as a slice of the original string.
|
||||
///
|
||||
/// ```rust
|
||||
/// # use unicode_segmentation::UnicodeSegmentation;
|
||||
/// let mut iter = "Hello world".split_word_bounds();
|
||||
/// assert_eq!(iter.as_str(), "Hello world");
|
||||
/// iter.next();
|
||||
/// assert_eq!(iter.as_str(), " world");
|
||||
/// iter.next();
|
||||
/// assert_eq!(iter.as_str(), "world");
|
||||
/// ```
|
||||
pub fn as_str(&self) -> &'a str {
|
||||
self.string
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn get_next_cat(&self, idx: usize) -> Option<WordCat> {
|
||||
use crate::tables::word as wd;
|
||||
let nidx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
|
||||
if nidx < self.string.len() {
|
||||
let nch = self.string[nidx..].chars().next().unwrap();
|
||||
Some(wd::word_category(nch).2)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn get_prev_cat(&self, idx: usize) -> Option<WordCat> {
|
||||
use crate::tables::word as wd;
|
||||
if idx > 0 {
|
||||
let nch = self.string[..idx].chars().next_back().unwrap();
|
||||
Some(wd::word_category(nch).2)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn new_word_bounds<'b>(s: &'b str) -> UWordBounds<'b> {
|
||||
UWordBounds {
|
||||
string: s,
|
||||
cat: None,
|
||||
catb: None,
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn new_word_bound_indices<'b>(s: &'b str) -> UWordBoundIndices<'b> {
|
||||
UWordBoundIndices {
|
||||
start_offset: s.as_ptr() as usize,
|
||||
iter: new_word_bounds(s),
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn has_alphanumeric(s: &&str) -> bool {
|
||||
use crate::tables::util::is_alphanumeric;
|
||||
|
||||
s.chars().any(|c| is_alphanumeric(c))
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn new_unicode_words<'b>(s: &'b str) -> UnicodeWords<'b> {
|
||||
use super::UnicodeSegmentation;
|
||||
|
||||
UnicodeWords {
|
||||
inner: s.split_word_bounds().filter(has_alphanumeric),
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn new_unicode_word_indices<'b>(s: &'b str) -> UnicodeWordIndices<'b> {
|
||||
use super::UnicodeSegmentation;
|
||||
|
||||
UnicodeWordIndices {
|
||||
inner: s
|
||||
.split_word_bound_indices()
|
||||
.filter(|(_, c)| has_alphanumeric(c)),
|
||||
}
|
||||
}
|
Загрузка…
Ссылка в новой задаче