Bug 1847521 - Replace unicode-segmentation with ICU4X in WebDriver. r=webdriver-reviewers,supply-chain-reviewers,jgraham

Differential Revision: https://phabricator.services.mozilla.com/D198132
This commit is contained in:
Makoto Kato 2024-01-18 02:29:19 +00:00
Parent 34342fa8e7
Commit 33707d378d
25 changed files with 17 additions and 11651 deletions
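For orientation, a minimal sketch of the API shift this commit performs, assuming icu_segmenter 1.4 with the compiled_data feature (which bakes the segmentation data into the binary, so GraphemeClusterSegmenter::new() needs no external data provider). Where unicode-segmentation's graphemes(true) yields the cluster substrings directly, icu_segmenter's segment_str yields byte-index boundaries, beginning at 0 and ending at the string's length:

```rust
use icu_segmenter::GraphemeClusterSegmenter;

fn main() {
    let s = "a̐éö̲";
    // segment_str reports grapheme-cluster boundaries as byte indices,
    // always beginning with 0 and ending with s.len().
    let breakpoints: Vec<usize> = GraphemeClusterSegmenter::new()
        .segment_str(s)
        .collect();
    // Adjacent boundary pairs delimit the clusters themselves.
    let clusters: Vec<&str> = breakpoints
        .windows(2)
        .map(|w| &s[w[0]..w[1]])
        .collect();
    assert_eq!(clusters, ["a̐", "é", "ö̲"]);
}
```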

Cargo.lock (generated)
View file

@@ -2054,6 +2054,7 @@ dependencies = [
"chrono",
"clap",
"hyper",
"icu_segmenter",
"lazy_static",
"log",
"marionette",
@@ -2069,7 +2070,6 @@ dependencies = [
"serde_yaml",
"tempfile",
"thiserror",
"unicode-segmentation",
"url",
"uuid",
"webdriver",
@@ -5810,12 +5810,6 @@ dependencies = [
"tinyvec",
]
-[[package]]
-name = "unicode-segmentation"
-version = "1.10.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0fdbf052a0783de01e944a6ce7a8cb939e295b1e7be835a1112c3b9a7f047a5a"
[[package]]
name = "unicode-width"
version = "0.1.10"
@@ -6229,6 +6223,7 @@ dependencies = [
"bytes",
"cookie",
"http",
"icu_segmenter",
"log",
"serde",
"serde_derive",
@@ -6237,7 +6232,6 @@ dependencies = [
"time 0.3.23",
"tokio",
"tokio-stream",
"unicode-segmentation",
"url",
"warp",
]

View file

@@ -710,13 +710,6 @@ user-id = 3618
user-login = "dtolnay"
user-name = "David Tolnay"
-[[publisher.unicode-segmentation]]
-version = "1.10.0"
-when = "2022-09-13"
-user-id = 1139
-user-login = "Manishearth"
-user-name = "Manish Goregaokar"
[[publisher.unicode-width]]
version = "0.1.10"
when = "2022-09-13"

View file

@@ -26,6 +26,7 @@ base64 = "0.21"
chrono = "0.4.6"
clap = { version = "4", default-features = false, features = ["cargo", "std", "suggestions", "wrap_help", "string"] }
hyper = "0.14"
+icu_segmenter = { version = "1.4", default-features = false, features = ["auto", "compiled_data"] }
lazy_static = "1.0"
log = { version = "0.4", features = ["std"] }
marionette = { path = "./marionette", version="0.5.0" }
@@ -40,7 +41,6 @@ serde_json = "1.0"
serde_yaml = "0.8"
tempfile = "3"
thiserror = "1"
unicode-segmentation = "1.9"
url = "2.4"
uuid = { version = "1.0", features = ["v4"] }
webdriver = { path = "../webdriver", version="0.50.0" }

View file

@@ -31,12 +31,12 @@
//! [`init`]: fn.init.html
//! [`init_with_level`]: fn.init_with_level.html
+use icu_segmenter::GraphemeClusterSegmenter;
use std::fmt;
use std::io;
use std::io::Write;
use std::str;
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
-use unicode_segmentation::UnicodeSegmentation;
use mozprofile::preferences::Pref;
@@ -246,7 +246,15 @@ fn truncate_message(args: &fmt::Arguments) -> Option<(String, String)> {
}
let message = format!("{}", args);
-let chars = message.graphemes(true).collect::<Vec<&str>>();
+if message.is_empty() || message.len() < MAX_STRING_LENGTH {
+return None;
+}
+let chars = GraphemeClusterSegmenter::new()
+.segment_str(&message)
+.collect::<Vec<_>>()
+.windows(2)
+.map(|i| &message[i[0]..i[1]])
+.collect::<Vec<&str>>();
if chars.len() > MAX_STRING_LENGTH {
let middle: usize = MAX_STRING_LENGTH / 2;
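A note on this hunk: message.len() is the UTF-8 byte length, which is always at least the number of grapheme clusters, so the new early return skips segmentation for messages that cannot possibly exceed MAX_STRING_LENGTH clusters; the windows(2) idiom then rebuilds cluster slices from adjacent boundary indices. A self-contained sketch of the same pattern, where truncate_by_graphemes is a hypothetical stand-in for truncate_message with the limit passed explicitly:

```rust
use icu_segmenter::GraphemeClusterSegmenter;

/// Hypothetical helper: returns the first and last `max / 2` grapheme
/// clusters of an over-long message, or None if the message fits.
fn truncate_by_graphemes(message: &str, max: usize) -> Option<(String, String)> {
    // Byte length bounds the cluster count from above, so short messages
    // skip segmentation entirely.
    if message.is_empty() || message.len() < max {
        return None;
    }
    let breakpoints: Vec<usize> = GraphemeClusterSegmenter::new()
        .segment_str(message)
        .collect();
    let chars: Vec<&str> = breakpoints
        .windows(2)
        .map(|w| &message[w[0]..w[1]])
        .collect();
    if chars.len() <= max {
        return None;
    }
    let middle = max / 2;
    Some((chars[..middle].concat(), chars[chars.len() - middle..].concat()))
}
```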

View file

@@ -26,6 +26,7 @@ base64 = "0.21"
bytes = "1.0"
cookie = { version = "0.16", default-features = false }
http = "0.2"
+icu_segmenter = { version = "1.4", default-features = false, features = ["auto", "compiled_data"] }
log = "0.4"
serde = "1.0"
serde_json = "1.0"
@@ -33,7 +34,6 @@ serde_derive = "1.0"
time = "0.3"
tokio = { version = "1.0", features = ["rt", "net"], optional = true}
tokio-stream = { version = "0.1", features = ["net"], optional = true}
unicode-segmentation = "1.2"
url = "2.4"
thiserror = "1"
warp = { version = "0.3", default-features = false, optional = true }

View file

@@ -3,12 +3,12 @@
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
use crate::common::{WebElement, ELEMENT_KEY};
+use icu_segmenter::GraphemeClusterSegmenter;
use serde::de::{self, Deserialize, Deserializer};
use serde::ser::{Serialize, Serializer};
use serde_json::Value;
use std::default::Default;
use std::f64;
-use unicode_segmentation::UnicodeSegmentation;
#[derive(Debug, PartialEq, Serialize, Deserialize)]
pub struct ActionSequence {
@@ -91,7 +91,7 @@ where
{
String::deserialize(deserializer).map(|value| {
// Only a single Unicode grapheme cluster is allowed
-if value.graphemes(true).count() != 1 {
+if GraphemeClusterSegmenter::new().segment_str(&value).count() != 2 {
return Err(de::Error::custom(format!(
"'{}' should only contain a single Unicode code point",
value
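Why the check becomes != 2 rather than != 1: segment_str always yields the leading boundary at index 0 in addition to each cluster-final boundary, so a string holding exactly one grapheme cluster yields exactly two boundaries, [0, value.len()]. A minimal sketch of that equivalence (is_single_grapheme is a hypothetical helper, assuming icu_segmenter 1.4):

```rust
use icu_segmenter::GraphemeClusterSegmenter;

// One boundary more than there are clusters (the leading 0), so
// "exactly one grapheme cluster" means exactly two boundaries.
fn is_single_grapheme(value: &str) -> bool {
    GraphemeClusterSegmenter::new().segment_str(value).count() == 2
}

fn main() {
    assert!(is_single_grapheme("é")); // boundaries [0, 2]
    assert!(is_single_grapheme("🇷🇸")); // one flag cluster: [0, 8]
    assert!(!is_single_grapheme("ab")); // boundaries [0, 1, 2]
    assert!(!is_single_grapheme("")); // just [0]
}
```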

View file

@@ -6,6 +6,7 @@
extern crate base64;
extern crate cookie;
+extern crate icu_segmenter;
#[macro_use]
extern crate log;
extern crate http;
@@ -16,7 +17,6 @@ extern crate serde_json;
extern crate time;
#[cfg(feature = "server")]
extern crate tokio;
-extern crate unicode_segmentation;
extern crate url;
#[cfg(feature = "server")]
extern crate warp;

View file

@@ -1 +0,0 @@
{"files":{"COPYRIGHT":"23860c2a7b5d96b21569afedf033469bab9fe14a1b24a35068b8641c578ce24d","Cargo.toml":"55e5a65c91693dd47a27409e54ad6d5ce805ce003b822e4a568bfd070725e956","LICENSE-APACHE":"a60eea817514531668d7e00765731449fe14d059d3249e0bc93b36de45f759f2","LICENSE-MIT":"7b63ecd5f1902af1b63729947373683c32745c16a10e8e6292e2e2dcd7e90ae0","README.md":"efe7aa058e004e12d683039dbc4440e2fec3088364201a620703acedbeef8cb2","benches/graphemes.rs":"88a9f672ea7a03cc15fae36ce544a6e7234e532359402483978858ccda47db3d","benches/unicode_words.rs":"95c3a178ebe07c8cb2c560546ee911bfc4f1e1db81a6cd2c1cef1c99ed2a421a","benches/word_bounds.rs":"66acf40c0a4b06cdb6dd97c1759aba8dea961bb30cd7f223de3ebff8198520b2","scripts/unicode.py":"d4ba970a0419f33d20f3deb888be12427bfbb40aa25a5719968600d45cf4dadb","scripts/unicode_gen_breaktests.py":"ee96982d8959bec75c2382233cfca7e239f12a89a1be5fbf942601a215bb9283","src/grapheme.rs":"b5a32bdbb529e9417e8ada8d92656339b6ffb4e9bed8e6d32a0409c13a03050b","src/lib.rs":"572789173717edd0fe037ae656530663406951636c548e6793711b7d5caad910","src/sentence.rs":"aac52f69207e0b68925ab0c6c18cc36ed3da8e918006d96d724f0f19d4d9d643","src/tables.rs":"ba9fa1774b6294ed14565ec6be0f2ec316759d54e3af7c002b6848973d7b1f3c","src/test.rs":"f039fa285d510244672a067bdbe98ce7ff940e4f2ff82926466e012ac48ad95a","src/testdata.rs":"533c02ecace1bec3d46b65d101c7619bc83a2fb2c187a2c960346533c09a0e3e","src/word.rs":"6eeea9351c12f0a4404606596a487e0e8aa948ba4b134c7cb827ee41557a39fe"},"package":"0fdbf052a0783de01e944a6ce7a8cb939e295b1e7be835a1112c3b9a7f047a5a"}

View file

@@ -1,7 +0,0 @@
Licensed under the Apache License, Version 2.0
<LICENSE-APACHE or
http://www.apache.org/licenses/LICENSE-2.0> or the MIT
license <LICENSE-MIT or http://opensource.org/licenses/MIT>,
at your option. All files in the project carrying such
notice may not be copied, modified, or distributed except
according to those terms.

View file

@@ -1,63 +0,0 @@
# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
#
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
# to registry (e.g., crates.io) dependencies.
#
# If you are reading this file be aware that the original Cargo.toml
# will likely look very different (and much more reasonable).
# See Cargo.toml.orig for the original contents.
[package]
edition = "2018"
name = "unicode-segmentation"
version = "1.10.0"
authors = [
"kwantam <kwantam@gmail.com>",
"Manish Goregaokar <manishsmail@gmail.com>",
]
exclude = [
"target/*",
"Cargo.lock",
"scripts/tmp",
"benches/texts/*",
"*.txt",
]
description = """
This crate provides Grapheme Cluster, Word and Sentence boundaries
according to Unicode Standard Annex #29 rules.
"""
homepage = "https://github.com/unicode-rs/unicode-segmentation"
documentation = "https://unicode-rs.github.io/unicode-segmentation"
readme = "README.md"
keywords = [
"text",
"unicode",
"grapheme",
"word",
"boundary",
]
license = "MIT/Apache-2.0"
repository = "https://github.com/unicode-rs/unicode-segmentation"
[[bench]]
name = "graphemes"
harness = false
[[bench]]
name = "unicode_words"
harness = false
[[bench]]
name = "word_bounds"
harness = false
[dev-dependencies.criterion]
version = "0.3"
[dev-dependencies.quickcheck]
version = "0.7"
[features]
no_std = []

View file

@@ -1,201 +0,0 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

View file

@@ -1,25 +0,0 @@
Copyright (c) 2015 The Rust Project Developers
Permission is hereby granted, free of charge, to any
person obtaining a copy of this software and associated
documentation files (the "Software"), to deal in the
Software without restriction, including without
limitation the rights to use, copy, modify, merge,
publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software
is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice
shall be included in all copies or substantial portions
of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.

View file

@@ -1,99 +0,0 @@
Iterators which split strings on Grapheme Cluster or Word boundaries, according
to the [Unicode Standard Annex #29](http://www.unicode.org/reports/tr29/) rules.
[![Build Status](https://travis-ci.org/unicode-rs/unicode-segmentation.svg)](https://travis-ci.org/unicode-rs/unicode-segmentation)
[Documentation](https://unicode-rs.github.io/unicode-segmentation/unicode_segmentation/index.html)
```rust
use unicode_segmentation::UnicodeSegmentation;
fn main() {
let s = "a̐éö̲\r\n";
let g = s.graphemes(true).collect::<Vec<&str>>();
let b: &[_] = &["a̐", "é", "ö̲", "\r\n"];
assert_eq!(g, b);
let s = "The quick (\"brown\") fox can't jump 32.3 feet, right?";
let w = s.unicode_words().collect::<Vec<&str>>();
let b: &[_] = &["The", "quick", "brown", "fox", "can't", "jump", "32.3", "feet", "right"];
assert_eq!(w, b);
let s = "The quick (\"brown\") fox";
let w = s.split_word_bounds().collect::<Vec<&str>>();
let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", " ", "fox"];
assert_eq!(w, b);
}
```
# no_std
unicode-segmentation does not depend on libstd, so it can be used in crates
with the `#![no_std]` attribute.
# crates.io
You can use this package in your project by adding the following
to your `Cargo.toml`:
```toml
[dependencies]
unicode-segmentation = "1.9.0"
```
# Change Log
## 1.7.1
* Update docs on version number
## 1.7.0
* [#87](https://github.com/unicode-rs/unicode-segmentation/pull/87) Upgrade to Unicode 13
* [#79](https://github.com/unicode-rs/unicode-segmentation/pull/79) Implement a special-case lookup for ascii grapheme categories
* [#77](https://github.com/unicode-rs/unicode-segmentation/pull/77) Optimization for grapheme iteration
## 1.6.0
* [#72](https://github.com/unicode-rs/unicode-segmentation/pull/72) Upgrade to Unicode 12
## 1.5.0
* [#68](https://github.com/unicode-rs/unicode-segmentation/pull/68) Upgrade to Unicode 11
## 1.4.0
* [#56](https://github.com/unicode-rs/unicode-segmentation/pull/56) Upgrade to Unicode 10
## 1.3.0
* [#24](https://github.com/unicode-rs/unicode-segmentation/pull/24) Add support for sentence boundaries
* [#44](https://github.com/unicode-rs/unicode-segmentation/pull/44) Treat `gc=No` as a subset of `gc=N`
## 1.2.1
* [#37](https://github.com/unicode-rs/unicode-segmentation/pull/37):
Fix panic in `provide_context`.
* [#40](https://github.com/unicode-rs/unicode-segmentation/pull/40):
Fix crash in `prev_boundary`.
## 1.2.0
* New `GraphemeCursor` API allows random access and bidirectional iteration.
* Fixed incorrect splitting of certain emoji modifier sequences.
## 1.1.0
* Add `as_str` methods to the iterator types.
## 1.0.3
* Code cleanup and additional tests.
## 1.0.1
* Fix a bug affecting some grapheme clusters containing Prepend characters.
## 1.0.0
* Upgrade to Unicode 9.0.0.

View file

@@ -1,63 +0,0 @@
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use unicode_segmentation;
use std::fs;
use unicode_segmentation::UnicodeSegmentation;
fn graphemes(c: &mut Criterion, lang: &str, path: &str) {
let text = fs::read_to_string(path).unwrap();
c.bench_function(&format!("graphemes_{}", lang), |bench| {
bench.iter(|| {
for g in UnicodeSegmentation::graphemes(black_box(&*text), true) {
black_box(g);
}
})
});
}
fn graphemes_arabic(c: &mut Criterion) {
graphemes(c, "arabic", "benches/texts/arabic.txt");
}
fn graphemes_english(c: &mut Criterion) {
graphemes(c, "english", "benches/texts/english.txt");
}
fn graphemes_hindi(c: &mut Criterion) {
graphemes(c, "hindi", "benches/texts/hindi.txt");
}
fn graphemes_japanese(c: &mut Criterion) {
graphemes(c, "japanese", "benches/texts/japanese.txt");
}
fn graphemes_korean(c: &mut Criterion) {
graphemes(c, "korean", "benches/texts/korean.txt");
}
fn graphemes_mandarin(c: &mut Criterion) {
graphemes(c, "mandarin", "benches/texts/mandarin.txt");
}
fn graphemes_russian(c: &mut Criterion) {
graphemes(c, "russian", "benches/texts/russian.txt");
}
fn graphemes_source_code(c: &mut Criterion) {
graphemes(c, "source_code", "benches/texts/source_code.txt");
}
criterion_group!(
benches,
graphemes_arabic,
graphemes_english,
graphemes_hindi,
graphemes_japanese,
graphemes_korean,
graphemes_mandarin,
graphemes_russian,
graphemes_source_code,
);
criterion_main!(benches);

View file

@@ -1,64 +0,0 @@
#[macro_use]
extern crate bencher;
extern crate unicode_segmentation;
use bencher::Bencher;
use std::fs;
use unicode_segmentation::UnicodeSegmentation;
fn unicode_words(bench: &mut Bencher, path: &str) {
let text = fs::read_to_string(path).unwrap();
bench.iter(|| {
for w in text.unicode_words() {
bencher::black_box(w);
}
});
bench.bytes = text.len() as u64;
}
fn unicode_words_arabic(bench: &mut Bencher) {
unicode_words(bench, "benches/texts/arabic.txt");
}
fn unicode_words_english(bench: &mut Bencher) {
unicode_words(bench, "benches/texts/english.txt");
}
fn unicode_words_hindi(bench: &mut Bencher) {
unicode_words(bench, "benches/texts/hindi.txt");
}
fn unicode_words_japanese(bench: &mut Bencher) {
unicode_words(bench, "benches/texts/japanese.txt");
}
fn unicode_words_korean(bench: &mut Bencher) {
unicode_words(bench, "benches/texts/korean.txt");
}
fn unicode_words_mandarin(bench: &mut Bencher) {
unicode_words(bench, "benches/texts/mandarin.txt");
}
fn unicode_words_russian(bench: &mut Bencher) {
unicode_words(bench, "benches/texts/russian.txt");
}
fn unicode_words_source_code(bench: &mut Bencher) {
unicode_words(bench, "benches/texts/source_code.txt");
}
benchmark_group!(
benches,
unicode_words_arabic,
unicode_words_english,
unicode_words_hindi,
unicode_words_japanese,
unicode_words_korean,
unicode_words_mandarin,
unicode_words_russian,
unicode_words_source_code,
);
benchmark_main!(benches);

View file

@@ -1,64 +0,0 @@
#[macro_use]
extern crate bencher;
extern crate unicode_segmentation;
use bencher::Bencher;
use std::fs;
use unicode_segmentation::UnicodeSegmentation;
fn word_bounds(bench: &mut Bencher, path: &str) {
let text = fs::read_to_string(path).unwrap();
bench.iter(|| {
for w in text.split_word_bounds() {
bencher::black_box(w);
}
});
bench.bytes = text.len() as u64;
}
fn word_bounds_arabic(bench: &mut Bencher) {
word_bounds(bench, "benches/texts/arabic.txt");
}
fn word_bounds_english(bench: &mut Bencher) {
word_bounds(bench, "benches/texts/english.txt");
}
fn word_bounds_hindi(bench: &mut Bencher) {
word_bounds(bench, "benches/texts/hindi.txt");
}
fn word_bounds_japanese(bench: &mut Bencher) {
word_bounds(bench, "benches/texts/japanese.txt");
}
fn word_bounds_korean(bench: &mut Bencher) {
word_bounds(bench, "benches/texts/korean.txt");
}
fn word_bounds_mandarin(bench: &mut Bencher) {
word_bounds(bench, "benches/texts/mandarin.txt");
}
fn word_bounds_russian(bench: &mut Bencher) {
word_bounds(bench, "benches/texts/russian.txt");
}
fn word_bounds_source_code(bench: &mut Bencher) {
word_bounds(bench, "benches/texts/source_code.txt");
}
benchmark_group!(
benches,
word_bounds_arabic,
word_bounds_english,
word_bounds_hindi,
word_bounds_japanese,
word_bounds_korean,
word_bounds_mandarin,
word_bounds_russian,
word_bounds_source_code,
);
benchmark_main!(benches);

View file

@@ -1,381 +0,0 @@
#!/usr/bin/env python
#
# Copyright 2011-2015 The Rust Project Developers. See the COPYRIGHT
# file at the top-level directory of this distribution and at
# http://rust-lang.org/COPYRIGHT.
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.
# This script uses the following Unicode tables:
# - DerivedCoreProperties.txt
# - auxiliary/GraphemeBreakProperty.txt
# - auxiliary/WordBreakProperty.txt
# - ReadMe.txt
# - UnicodeData.txt
#
# Since this should not require frequent updates, we just store this
# out-of-line and check the unicode.rs file into git.
import fileinput, re, os, sys
preamble = '''// Copyright 2012-2018 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly
#![allow(missing_docs, non_upper_case_globals, non_snake_case)]
'''
# Mapping taken from Table 12 from:
# http://www.unicode.org/reports/tr44/#General_Category_Values
expanded_categories = {
'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'],
'Lm': ['L'], 'Lo': ['L'],
'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'],
'Nd': ['N'], 'Nl': ['N'], 'No': ['N'],
'Pc': ['P'], 'Pd': ['P'], 'Ps': ['P'], 'Pe': ['P'],
'Pi': ['P'], 'Pf': ['P'], 'Po': ['P'],
'Sm': ['S'], 'Sc': ['S'], 'Sk': ['S'], 'So': ['S'],
'Zs': ['Z'], 'Zl': ['Z'], 'Zp': ['Z'],
'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'],
}
# these are the surrogate codepoints, which are not valid rust characters
surrogate_codepoints = (0xd800, 0xdfff)
UNICODE_VERSION = (15, 0, 0)
UNICODE_VERSION_NUMBER = "%s.%s.%s" %UNICODE_VERSION
def is_surrogate(n):
return surrogate_codepoints[0] <= n <= surrogate_codepoints[1]
def fetch(f):
if not os.path.exists(os.path.basename(f)):
if "emoji" in f:
os.system("curl -O https://www.unicode.org/Public/%s/ucd/emoji/%s"
% (UNICODE_VERSION_NUMBER, f))
else:
os.system("curl -O https://www.unicode.org/Public/%s/ucd/%s"
% (UNICODE_VERSION_NUMBER, f))
if not os.path.exists(os.path.basename(f)):
sys.stderr.write("cannot load %s" % f)
exit(1)
def load_gencats(f):
fetch(f)
gencats = {}
udict = {};
range_start = -1;
for line in fileinput.input(f):
data = line.split(';');
if len(data) != 15:
continue
cp = int(data[0], 16);
if is_surrogate(cp):
continue
if range_start >= 0:
for i in range(range_start, cp):
udict[i] = data;
range_start = -1;
if data[1].endswith(", First>"):
range_start = cp;
continue;
udict[cp] = data;
for code in udict:
[code_org, name, gencat, combine, bidi,
decomp, deci, digit, num, mirror,
old, iso, upcase, lowcase, titlecase ] = udict[code];
# place letter in categories as appropriate
for cat in [gencat, "Assigned"] + expanded_categories.get(gencat, []):
if cat not in gencats:
gencats[cat] = []
gencats[cat].append(code)
gencats = group_cats(gencats)
return gencats
def group_cats(cats):
cats_out = {}
for cat in cats:
cats_out[cat] = group_cat(cats[cat])
return cats_out
def group_cat(cat):
cat_out = []
letters = sorted(set(cat))
cur_start = letters.pop(0)
cur_end = cur_start
for letter in letters:
assert letter > cur_end, \
"cur_end: %s, letter: %s" % (hex(cur_end), hex(letter))
if letter == cur_end + 1:
cur_end = letter
else:
cat_out.append((cur_start, cur_end))
cur_start = cur_end = letter
cat_out.append((cur_start, cur_end))
return cat_out
def ungroup_cat(cat):
cat_out = []
for (lo, hi) in cat:
while lo <= hi:
cat_out.append(lo)
lo += 1
return cat_out
def format_table_content(f, content, indent):
line = " "*indent
first = True
for chunk in content.split(","):
if len(line) + len(chunk) < 98:
if first:
line += chunk
else:
line += ", " + chunk
first = False
else:
f.write(line + ",\n")
line = " "*indent + chunk
f.write(line)
def load_properties(f, interestingprops):
fetch(f)
props = {}
re1 = re.compile(r"^ *([0-9A-F]+) *; *(\w+)")
re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")
for line in fileinput.input(os.path.basename(f)):
prop = None
d_lo = 0
d_hi = 0
m = re1.match(line)
if m:
d_lo = m.group(1)
d_hi = m.group(1)
prop = m.group(2)
else:
m = re2.match(line)
if m:
d_lo = m.group(1)
d_hi = m.group(2)
prop = m.group(3)
else:
continue
if interestingprops and prop not in interestingprops:
continue
d_lo = int(d_lo, 16)
d_hi = int(d_hi, 16)
if prop not in props:
props[prop] = []
props[prop].append((d_lo, d_hi))
# optimize if possible
for prop in props:
props[prop] = group_cat(ungroup_cat(props[prop]))
return props
def escape_char(c):
return "'\\u{%x}'" % c
def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1])), is_const=True):
pub_string = "const"
if not is_const:
pub_string = "let"
if is_pub:
pub_string = "pub " + pub_string
f.write(" %s %s: %s = &[\n" % (pub_string, name, t_type))
data = ""
first = True
for dat in t_data:
if not first:
data += ","
first = False
data += pfun(dat)
format_table_content(f, data, 8)
f.write("\n ];\n\n")
def emit_util_mod(f):
f.write("""
pub mod util {
#[inline]
pub fn bsearch_range_table(c: char, r: &'static [(char,char)]) -> bool {
use core::cmp::Ordering::{Equal, Less, Greater};
r.binary_search_by(|&(lo,hi)| {
if lo <= c && c <= hi { Equal }
else if hi < c { Less }
else { Greater }
}).is_ok()
}
#[inline]
fn is_alphabetic(c: char) -> bool {
match c {
'a' ..= 'z' | 'A' ..= 'Z' => true,
c if c > '\x7f' => super::derived_property::Alphabetic(c),
_ => false,
}
}
#[inline]
fn is_numeric(c: char) -> bool {
match c {
'0' ..= '9' => true,
c if c > '\x7f' => super::general_category::N(c),
_ => false,
}
}
#[inline]
pub fn is_alphanumeric(c: char) -> bool {
is_alphabetic(c) || is_numeric(c)
}
}
""")
def emit_property_module(f, mod, tbl, emit):
f.write("mod %s {\n" % mod)
for cat in sorted(emit):
emit_table(f, "%s_table" % cat, tbl[cat], is_pub=False)
f.write(" #[inline]\n")
f.write(" pub fn %s(c: char) -> bool {\n" % cat)
f.write(" super::util::bsearch_range_table(c, %s_table)\n" % cat)
f.write(" }\n\n")
f.write("}\n\n")
def emit_break_module(f, break_table, break_cats, name):
Name = name.capitalize()
f.write("""pub mod %s {
use core::result::Result::{Ok, Err};
pub use self::%sCat::*;
#[allow(non_camel_case_types)]
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub enum %sCat {
""" % (name, Name, Name))
break_cats.append("Any")
break_cats.sort()
for cat in break_cats:
f.write((" %sC_" % Name[0]) + cat + ",\n")
f.write(""" }
fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)]) -> (u32, u32, %sCat) {
use core::cmp::Ordering::{Equal, Less, Greater};
match r.binary_search_by(|&(lo, hi, _)| {
if lo <= c && c <= hi { Equal }
else if hi < c { Less }
else { Greater }
}) {
Ok(idx) => {
let (lower, upper, cat) = r[idx];
(lower as u32, upper as u32, cat)
}
Err(idx) => {
(
if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 },
r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX),
%sC_Any,
)
}
}
}
pub fn %s_category(c: char) -> (u32, u32, %sCat) {
bsearch_range_value_table(c, %s_cat_table)
}
""" % (Name, Name, Name[0], name, Name, name))
emit_table(f, "%s_cat_table" % name, break_table, "&'static [(char, char, %sCat)]" % Name,
pfun=lambda x: "(%s,%s,%sC_%s)" % (escape_char(x[0]), escape_char(x[1]), Name[0], x[2]),
is_pub=False, is_const=True)
f.write("}\n")
if __name__ == "__main__":
r = "tables.rs"
if os.path.exists(r):
os.remove(r)
with open(r, "w") as rf:
# write the file's preamble
rf.write(preamble)
rf.write("""
/// The version of [Unicode](http://www.unicode.org/)
/// that this version of unicode-segmentation is based on.
pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s);
""" % UNICODE_VERSION)
# download and parse all the data
gencats = load_gencats("UnicodeData.txt")
derived = load_properties("DerivedCoreProperties.txt", ["Alphabetic"])
emit_util_mod(rf)
for (name, cat, pfuns) in ("general_category", gencats, ["N"]), \
("derived_property", derived, ["Alphabetic"]):
emit_property_module(rf, name, cat, pfuns)
### grapheme cluster module
# from http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Break_Property_Values
grapheme_cats = load_properties("auxiliary/GraphemeBreakProperty.txt", [])
# Control
# Note:
# This category also includes Cs (surrogate codepoints), but Rust's `char`s are
# Unicode Scalar Values only, and surrogates are thus invalid `char`s.
# Thus, we have to remove Cs from the Control category
grapheme_cats["Control"] = group_cat(list(
set(ungroup_cat(grapheme_cats["Control"]))
- set(ungroup_cat([surrogate_codepoints]))))
grapheme_table = []
for cat in grapheme_cats:
grapheme_table.extend([(x, y, cat) for (x, y) in grapheme_cats[cat]])
emoji_props = load_properties("emoji-data.txt", ["Extended_Pictographic"])
grapheme_table.extend([(x, y, "Extended_Pictographic") for (x, y) in emoji_props["Extended_Pictographic"]])
grapheme_table.sort(key=lambda w: w[0])
last = -1
for chars in grapheme_table:
if chars[0] <= last:
raise "Grapheme tables and Extended_Pictographic values overlap; need to store these separately!"
last = chars[1]
emit_break_module(rf, grapheme_table, list(grapheme_cats.keys()) + ["Extended_Pictographic"], "grapheme")
rf.write("\n")
word_cats = load_properties("auxiliary/WordBreakProperty.txt", [])
word_table = []
for cat in word_cats:
word_table.extend([(x, y, cat) for (x, y) in word_cats[cat]])
word_table.sort(key=lambda w: w[0])
emit_break_module(rf, word_table, list(word_cats.keys()), "word")
# There are some emoji which are also ALetter, so this needs to be stored separately
# For efficiency, we could still merge the two tables and produce an ALetterEP state
emoji_table = [(x, y, "Extended_Pictographic") for (x, y) in emoji_props["Extended_Pictographic"]]
emit_break_module(rf, emoji_table, ["Extended_Pictographic"], "emoji")
sentence_cats = load_properties("auxiliary/SentenceBreakProperty.txt", [])
sentence_table = []
for cat in sentence_cats:
sentence_table.extend([(x, y, cat) for (x, y) in sentence_cats[cat]])
sentence_table.sort(key=lambda w: w[0])
emit_break_module(rf, sentence_table, list(sentence_cats.keys()), "sentence")

View file

@@ -1,212 +0,0 @@
#!/usr/bin/env python
# -*- coding: utf-8
#
# Copyright 2015 The Rust Project Developers. See the COPYRIGHT
# file at the top-level directory of this distribution and at
# http://rust-lang.org/COPYRIGHT.
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.
# This script uses the following Unicode tables:
# - auxiliary/GraphemeBreakTest.txt
# - auxiliary/WordBreakTest.txt
#
# Since this should not require frequent updates, we just store this
# out-of-line and check the unicode.rs file into git.
from __future__ import print_function
import unicode, re, os, fileinput
def load_test_data(f, optsplit=[]):
testRe1 = re.compile(r"\s+([^\s].*[^\s])\s+÷\s+#\s+÷\s+\[0.2\].*?([÷×].*)\s+÷\s+\[0.3\]\s*$")
unicode.fetch(f)
data = []
for line in fileinput.input(os.path.basename(f)):
# lines that include a test start with the ÷ character
if len(line) < 2 or not line.startswith('÷'):
continue
m = testRe1.match(line)
if not m:
print("error: no match on line where test was expected: %s" % line)
continue
# process the characters in this test case
chars = process_split_string(m.group(1))
# skip test case if it contains invalid characters (viz., surrogates)
if not chars:
continue
# now process test cases
(chars, info) = process_split_info(m.group(2), chars, optsplit)
# make sure that we have break info for each break!
assert len(chars) - 1 == len(info)
data.append((chars, info))
return data
def process_split_info(s, c, o):
outcs = []
outis = []
workcs = c.pop(0)
# are we on a × or a ÷?
isX = False
if s.startswith('×'):
isX = True
# find each instance of '(÷|×) [x.y] '
while s:
# find the currently considered rule number
sInd = s.index('[') + 1
eInd = s.index(']')
# if it's '× [a.b]' where 'a.b' is in o, then
# we consider it a split even though it's not
# marked as one
# if it's ÷ then it's always a split
if not isX or s[sInd:eInd] in o:
outis.append(s[sInd:eInd])
outcs.append(workcs)
workcs = c.pop(0)
else:
workcs.extend(c.pop(0))
idx = 1
while idx < len(s):
if s[idx:].startswith('×'):
isX = True
break
if s[idx:].startswith('÷'):
isX = False
break
idx += 1
s = s[idx:]
outcs.append(workcs)
return (outcs, outis)
def process_split_string(s):
outls = []
workls = []
inls = s.split()
for i in inls:
if i == '÷' or i == '×':
outls.append(workls)
workls = []
continue
ival = int(i,16)
if unicode.is_surrogate(ival):
return []
workls.append(ival)
if workls:
outls.append(workls)
return outls
def showfun(x):
outstr = '("'
for c in x[0]:
outstr += "\\u{%x}" % c
outstr += '",&['
xfirst = True
for xx in x[1:]:
if not xfirst:
outstr += '],&['
xfirst = False
sfirst = True
for sp in xx:
if not sfirst:
outstr += ','
sfirst = False
outstr += '"'
for c in sp:
outstr += "\\u{%x}" % c
outstr += '"'
outstr += '])'
return outstr
def create_grapheme_data(f):
# rules 9.1 and 9.2 are for extended graphemes only
optsplits = ['9.1','9.2']
d = load_test_data("auxiliary/GraphemeBreakTest.txt", optsplits)
test_same = []
test_diff = []
for (c, i) in d:
allchars = [cn for s in c for cn in s]
extgraphs = []
extwork = []
extwork.extend(c[0])
for n in range(0,len(i)):
if i[n] in optsplits:
extwork.extend(c[n+1])
else:
extgraphs.append(extwork)
extwork = []
extwork.extend(c[n+1])
# these are the extended grapheme clusters
extgraphs.append(extwork)
if extgraphs == c:
test_same.append((allchars, c))
else:
test_diff.append((allchars, extgraphs, c))
stype = "&'static [(&'static str, &'static [&'static str])]"
dtype = "&'static [(&'static str, &'static [&'static str], &'static [&'static str])]"
f.write(" // official Unicode test data\n")
f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/GraphemeBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER)
unicode.emit_table(f, "TEST_SAME", test_same, stype, True, showfun, True)
unicode.emit_table(f, "TEST_DIFF", test_diff, dtype, True, showfun, True)
def create_words_data(f):
d = load_test_data("auxiliary/WordBreakTest.txt")
test = []
for (c, i) in d:
allchars = [cn for s in c for cn in s]
test.append((allchars, c))
wtype = "&'static [(&'static str, &'static [&'static str])]"
f.write(" // official Unicode test data\n")
f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/WordBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER)
unicode.emit_table(f, "TEST_WORD", test, wtype, True, showfun, True)
def create_sentence_data(f):
d = load_test_data("auxiliary/SentenceBreakTest.txt")
test = []
for (c, i) in d:
allchars = [cn for s in c for cn in s]
test.append((allchars, c))
wtype = "&'static [(&'static str, &'static [&'static str])]"
f.write(" // official Unicode test data\n")
f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/SentenceBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER)
unicode.emit_table(f, "TEST_SENTENCE", test, wtype, True, showfun, True)
if __name__ == "__main__":
with open("testdata.rs", "w") as rf:
rf.write(unicode.preamble)
create_grapheme_data(rf)
create_words_data(rf)
create_sentence_data(rf)

View file

@@ -1,801 +0,0 @@
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use core::cmp;
use crate::tables::grapheme::GraphemeCat;
/// External iterator for grapheme clusters and byte offsets.
///
/// This struct is created by the [`grapheme_indices`] method on the [`UnicodeSegmentation`]
/// trait. See its documentation for more.
///
/// [`grapheme_indices`]: trait.UnicodeSegmentation.html#tymethod.grapheme_indices
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Clone)]
pub struct GraphemeIndices<'a> {
start_offset: usize,
iter: Graphemes<'a>,
}
impl<'a> GraphemeIndices<'a> {
#[inline]
/// View the underlying data (the part yet to be iterated) as a slice of the original string.
///
/// ```rust
/// # use unicode_segmentation::UnicodeSegmentation;
/// let mut iter = "abc".grapheme_indices(true);
/// assert_eq!(iter.as_str(), "abc");
/// iter.next();
/// assert_eq!(iter.as_str(), "bc");
/// iter.next();
/// iter.next();
/// assert_eq!(iter.as_str(), "");
/// ```
pub fn as_str(&self) -> &'a str {
self.iter.as_str()
}
}
impl<'a> Iterator for GraphemeIndices<'a> {
type Item = (usize, &'a str);
#[inline]
fn next(&mut self) -> Option<(usize, &'a str)> {
self.iter
.next()
.map(|s| (s.as_ptr() as usize - self.start_offset, s))
}
#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
self.iter.size_hint()
}
}
impl<'a> DoubleEndedIterator for GraphemeIndices<'a> {
#[inline]
fn next_back(&mut self) -> Option<(usize, &'a str)> {
self.iter
.next_back()
.map(|s| (s.as_ptr() as usize - self.start_offset, s))
}
}
/// External iterator for a string's
/// [grapheme clusters](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries).
///
/// This struct is created by the [`graphemes`] method on the [`UnicodeSegmentation`] trait. See its
/// documentation for more.
///
/// [`graphemes`]: trait.UnicodeSegmentation.html#tymethod.graphemes
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Clone, Debug)]
pub struct Graphemes<'a> {
string: &'a str,
cursor: GraphemeCursor,
cursor_back: GraphemeCursor,
}
impl<'a> Graphemes<'a> {
#[inline]
/// View the underlying data (the part yet to be iterated) as a slice of the original string.
///
/// ```rust
/// # use unicode_segmentation::UnicodeSegmentation;
/// let mut iter = "abc".graphemes(true);
/// assert_eq!(iter.as_str(), "abc");
/// iter.next();
/// assert_eq!(iter.as_str(), "bc");
/// iter.next();
/// iter.next();
/// assert_eq!(iter.as_str(), "");
/// ```
pub fn as_str(&self) -> &'a str {
&self.string[self.cursor.cur_cursor()..self.cursor_back.cur_cursor()]
}
}
impl<'a> Iterator for Graphemes<'a> {
type Item = &'a str;
#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
let slen = self.cursor_back.cur_cursor() - self.cursor.cur_cursor();
(cmp::min(slen, 1), Some(slen))
}
#[inline]
fn next(&mut self) -> Option<&'a str> {
let start = self.cursor.cur_cursor();
if start == self.cursor_back.cur_cursor() {
return None;
}
let next = self.cursor.next_boundary(self.string, 0).unwrap().unwrap();
Some(&self.string[start..next])
}
}
impl<'a> DoubleEndedIterator for Graphemes<'a> {
#[inline]
fn next_back(&mut self) -> Option<&'a str> {
let end = self.cursor_back.cur_cursor();
if end == self.cursor.cur_cursor() {
return None;
}
let prev = self
.cursor_back
.prev_boundary(self.string, 0)
.unwrap()
.unwrap();
Some(&self.string[prev..end])
}
}
#[inline]
pub fn new_graphemes<'b>(s: &'b str, is_extended: bool) -> Graphemes<'b> {
let len = s.len();
Graphemes {
string: s,
cursor: GraphemeCursor::new(0, len, is_extended),
cursor_back: GraphemeCursor::new(len, len, is_extended),
}
}
#[inline]
pub fn new_grapheme_indices<'b>(s: &'b str, is_extended: bool) -> GraphemeIndices<'b> {
GraphemeIndices {
start_offset: s.as_ptr() as usize,
iter: new_graphemes(s, is_extended),
}
}
// maybe unify with PairResult?
// An enum describing information about a potential boundary.
#[derive(PartialEq, Eq, Clone, Debug)]
enum GraphemeState {
// No information is known.
Unknown,
// It is known to not be a boundary.
NotBreak,
// It is known to be a boundary.
Break,
// The codepoint after is a Regional Indicator Symbol, so a boundary iff
// it is preceded by an even number of RIS codepoints. (GB12, GB13)
Regional,
// The codepoint after is Extended_Pictographic,
// so whether it's a boundary depends on pre-context according to GB11.
Emoji,
}
/// Cursor-based segmenter for grapheme clusters.
///
/// This allows working with ropes and other datastructures where the string is not contiguous or
/// fully known at initialization time.
#[derive(Clone, Debug)]
pub struct GraphemeCursor {
// Current cursor position.
offset: usize,
// Total length of the string.
len: usize,
// A config flag indicating whether this cursor computes legacy or extended
// grapheme cluster boundaries (enables GB9a and GB9b if set).
is_extended: bool,
// Information about the potential boundary at `offset`
state: GraphemeState,
// Category of codepoint immediately preceding cursor, if known.
cat_before: Option<GraphemeCat>,
// Category of codepoint immediately after cursor, if known.
cat_after: Option<GraphemeCat>,
// If set, at least one more codepoint immediately preceding this offset
// is needed to resolve whether there's a boundary at `offset`.
pre_context_offset: Option<usize>,
// The number of RIS codepoints preceding `offset`. If `pre_context_offset`
// is set, then counts the number of RIS between that and `offset`, otherwise
// is an accurate count relative to the string.
ris_count: Option<usize>,
// Set if a call to `prev_boundary` or `next_boundary` was suspended due
// to needing more input.
resuming: bool,
// Cached grapheme category and associated scalar value range.
grapheme_cat_cache: (u32, u32, GraphemeCat),
}
/// An error return indicating that not enough content was available in the
/// provided chunk to satisfy the query, and that more content must be provided.
#[derive(PartialEq, Eq, Debug)]
pub enum GraphemeIncomplete {
/// More pre-context is needed. The caller should call `provide_context`
/// with a chunk ending at the offset given, then retry the query. This
/// will only be returned if the `chunk_start` parameter is nonzero.
PreContext(usize),
/// When requesting `prev_boundary`, the cursor is moving past the beginning
/// of the current chunk, so the chunk before that is requested. This will
/// only be returned if the `chunk_start` parameter is nonzero.
PrevChunk,
/// When requesting `next_boundary`, the cursor is moving past the end of the
/// current chunk, so the chunk after that is requested. This will only be
/// returned if the chunk ends before the `len` parameter provided on
/// creation of the cursor.
NextChunk, // requesting chunk following the one given
/// An error returned when the chunk given does not contain the cursor position.
InvalidOffset,
}
// An enum describing the result from lookup of a pair of categories.
#[derive(PartialEq, Eq)]
enum PairResult {
NotBreak, // definitely not a break
Break, // definitely a break
Extended, // a break iff not in extended mode
Regional, // a break if preceded by an even number of RIS
Emoji, // a break if preceded by emoji base and (Extend)*
}
#[inline]
fn check_pair(before: GraphemeCat, after: GraphemeCat) -> PairResult {
use self::PairResult::*;
use crate::tables::grapheme::GraphemeCat::*;
match (before, after) {
(GC_CR, GC_LF) => NotBreak, // GB3
(GC_Control, _) => Break, // GB4
(GC_CR, _) => Break, // GB4
(GC_LF, _) => Break, // GB4
(_, GC_Control) => Break, // GB5
(_, GC_CR) => Break, // GB5
(_, GC_LF) => Break, // GB5
(GC_L, GC_L) => NotBreak, // GB6
(GC_L, GC_V) => NotBreak, // GB6
(GC_L, GC_LV) => NotBreak, // GB6
(GC_L, GC_LVT) => NotBreak, // GB6
(GC_LV, GC_V) => NotBreak, // GB7
(GC_LV, GC_T) => NotBreak, // GB7
(GC_V, GC_V) => NotBreak, // GB7
(GC_V, GC_T) => NotBreak, // GB7
(GC_LVT, GC_T) => NotBreak, // GB8
(GC_T, GC_T) => NotBreak, // GB8
(_, GC_Extend) => NotBreak, // GB9
(_, GC_ZWJ) => NotBreak, // GB9
(_, GC_SpacingMark) => Extended, // GB9a
(GC_Prepend, _) => Extended, // GB9b
(GC_ZWJ, GC_Extended_Pictographic) => Emoji, // GB11
(GC_Regional_Indicator, GC_Regional_Indicator) => Regional, // GB12, GB13
(_, _) => Break, // GB999
}
}
impl GraphemeCursor {
/// Create a new cursor. The string and initial offset are given at creation
/// time, but the contents of the string are not. The `is_extended` parameter
/// controls whether extended grapheme clusters are selected.
///
/// The `offset` parameter must be on a codepoint boundary.
///
/// ```rust
/// # use unicode_segmentation::GraphemeCursor;
/// let s = "हिन्दी";
/// let mut legacy = GraphemeCursor::new(0, s.len(), false);
/// assert_eq!(legacy.next_boundary(s, 0), Ok(Some("ह".len())));
/// let mut extended = GraphemeCursor::new(0, s.len(), true);
/// assert_eq!(extended.next_boundary(s, 0), Ok(Some("हि".len())));
/// ```
pub fn new(offset: usize, len: usize, is_extended: bool) -> GraphemeCursor {
let state = if offset == 0 || offset == len {
GraphemeState::Break
} else {
GraphemeState::Unknown
};
GraphemeCursor {
offset: offset,
len: len,
state: state,
is_extended: is_extended,
cat_before: None,
cat_after: None,
pre_context_offset: None,
ris_count: None,
resuming: false,
grapheme_cat_cache: (0, 0, GraphemeCat::GC_Control),
}
}
fn grapheme_category(&mut self, ch: char) -> GraphemeCat {
use crate::tables::grapheme as gr;
use crate::tables::grapheme::GraphemeCat::*;
if ch <= '\u{7e}' {
// Special-case optimization for ascii, except U+007F. This
// improves performance even for many primarily non-ascii texts,
// due to use of punctuation and white space characters from the
// ascii range.
if ch >= '\u{20}' {
GC_Any
} else if ch == '\n' {
GC_LF
} else if ch == '\r' {
GC_CR
} else {
GC_Control
}
} else {
// If this char isn't within the cached range, update the cache to the
// range that includes it.
if (ch as u32) < self.grapheme_cat_cache.0 || (ch as u32) > self.grapheme_cat_cache.1 {
self.grapheme_cat_cache = gr::grapheme_category(ch);
}
self.grapheme_cat_cache.2
}
}
// Not sure I'm gonna keep this, the advantage over new() seems thin.
/// Set the cursor to a new location in the same string.
///
/// ```rust
/// # use unicode_segmentation::GraphemeCursor;
/// let s = "abcd";
/// let mut cursor = GraphemeCursor::new(0, s.len(), false);
/// assert_eq!(cursor.cur_cursor(), 0);
/// cursor.set_cursor(2);
/// assert_eq!(cursor.cur_cursor(), 2);
/// ```
pub fn set_cursor(&mut self, offset: usize) {
if offset != self.offset {
self.offset = offset;
self.state = if offset == 0 || offset == self.len {
GraphemeState::Break
} else {
GraphemeState::Unknown
};
// reset state derived from text around cursor
self.cat_before = None;
self.cat_after = None;
self.ris_count = None;
}
}
#[inline]
/// The current offset of the cursor. Equal to the last value provided to
/// `new()` or `set_cursor()`, or returned from `next_boundary()` or
/// `prev_boundary()`.
///
/// ```rust
/// # use unicode_segmentation::GraphemeCursor;
/// // Two flags (🇷🇸🇮🇴), each flag is two RIS codepoints, each RIS is 4 bytes.
/// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
/// let mut cursor = GraphemeCursor::new(4, flags.len(), false);
/// assert_eq!(cursor.cur_cursor(), 4);
/// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(8)));
/// assert_eq!(cursor.cur_cursor(), 8);
/// ```
pub fn cur_cursor(&self) -> usize {
self.offset
}
/// Provide additional pre-context when it is needed to decide a boundary.
/// The end of the chunk must coincide with the value given in the
/// `GraphemeIncomplete::PreContext` request.
///
/// ```rust
/// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
/// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
/// let mut cursor = GraphemeCursor::new(8, flags.len(), false);
/// // Not enough pre-context to decide if there's a boundary between the two flags.
/// assert_eq!(cursor.is_boundary(&flags[8..], 8), Err(GraphemeIncomplete::PreContext(8)));
/// // Provide one more Regional Indicator Symbol of pre-context
/// cursor.provide_context(&flags[4..8], 4);
/// // Still not enough context to decide.
/// assert_eq!(cursor.is_boundary(&flags[8..], 8), Err(GraphemeIncomplete::PreContext(4)));
/// // Provide additional requested context.
/// cursor.provide_context(&flags[0..4], 0);
/// // That's enough to decide (it always is when context goes to the start of the string)
/// assert_eq!(cursor.is_boundary(&flags[8..], 8), Ok(true));
/// ```
pub fn provide_context(&mut self, chunk: &str, chunk_start: usize) {
use crate::tables::grapheme as gr;
assert!(chunk_start + chunk.len() == self.pre_context_offset.unwrap());
self.pre_context_offset = None;
if self.is_extended && chunk_start + chunk.len() == self.offset {
let ch = chunk.chars().rev().next().unwrap();
if self.grapheme_category(ch) == gr::GC_Prepend {
self.decide(false); // GB9b
return;
}
}
match self.state {
GraphemeState::Regional => self.handle_regional(chunk, chunk_start),
GraphemeState::Emoji => self.handle_emoji(chunk, chunk_start),
_ => {
if self.cat_before.is_none() && self.offset == chunk.len() + chunk_start {
let ch = chunk.chars().rev().next().unwrap();
self.cat_before = Some(self.grapheme_category(ch));
}
}
}
}
#[inline]
fn decide(&mut self, is_break: bool) {
self.state = if is_break {
GraphemeState::Break
} else {
GraphemeState::NotBreak
};
}
#[inline]
fn decision(&mut self, is_break: bool) -> Result<bool, GraphemeIncomplete> {
self.decide(is_break);
Ok(is_break)
}
#[inline]
fn is_boundary_result(&self) -> Result<bool, GraphemeIncomplete> {
if self.state == GraphemeState::Break {
Ok(true)
} else if self.state == GraphemeState::NotBreak {
Ok(false)
} else if let Some(pre_context_offset) = self.pre_context_offset {
Err(GraphemeIncomplete::PreContext(pre_context_offset))
} else {
unreachable!("inconsistent state");
}
}
#[inline]
fn handle_regional(&mut self, chunk: &str, chunk_start: usize) {
use crate::tables::grapheme as gr;
let mut ris_count = self.ris_count.unwrap_or(0);
for ch in chunk.chars().rev() {
if self.grapheme_category(ch) != gr::GC_Regional_Indicator {
self.ris_count = Some(ris_count);
self.decide((ris_count % 2) == 0);
return;
}
ris_count += 1;
}
self.ris_count = Some(ris_count);
if chunk_start == 0 {
self.decide((ris_count % 2) == 0);
return;
}
self.pre_context_offset = Some(chunk_start);
self.state = GraphemeState::Regional;
}
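// Worked example for the parity logic above (GB12/GB13): in the two flags
// "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}" (🇷🇸🇮🇴), offset 8 has two
// Regional_Indicator chars before it (even), so it is a boundary, while
// offsets 4 and 12 have an odd count and are not. This is what makes flag
// sequences segment into pairs of RIS chars.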
#[inline]
fn handle_emoji(&mut self, chunk: &str, chunk_start: usize) {
use crate::tables::grapheme as gr;
let mut iter = chunk.chars().rev();
if let Some(ch) = iter.next() {
if self.grapheme_category(ch) != gr::GC_ZWJ {
self.decide(true);
return;
}
}
for ch in iter {
match self.grapheme_category(ch) {
gr::GC_Extend => (),
gr::GC_Extended_Pictographic => {
self.decide(false);
return;
}
_ => {
self.decide(true);
return;
}
}
}
if chunk_start == 0 {
self.decide(true);
return;
}
self.pre_context_offset = Some(chunk_start);
self.state = GraphemeState::Emoji;
}
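// Worked example for the scan above (GB11): in the family emoji
// "\u{1f468}\u{200d}\u{1f467}\u{200d}\u{1f466}" (man ZWJ girl ZWJ boy),
// every ZWJ is preceded by an Extended_Pictographic (with any Extend chars
// in between), so no position inside the sequence is a boundary and the
// whole sequence forms a single extended grapheme cluster.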
#[inline]
/// Determine whether the current cursor location is a grapheme cluster boundary.
/// Only a part of the string need be supplied. If `chunk_start` is nonzero or
/// the length of `chunk` is not equal to `len` on creation, then this method
/// may return `GraphemeIncomplete::PreContext`. The caller should then
/// call `provide_context` with the requested chunk, then retry calling this
/// method.
///
/// For partial chunks, if the cursor is not at the beginning or end of the
/// string, the chunk should contain at least the codepoint following the cursor.
/// If the string is nonempty, the chunk must be nonempty.
///
/// All calls should have consistent chunk contents (i.e., if a chunk provides
/// content for a given slice, all further chunks covering that slice must have
/// the same content for it).
///
/// ```rust
/// # use unicode_segmentation::GraphemeCursor;
/// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
/// let mut cursor = GraphemeCursor::new(8, flags.len(), false);
/// assert_eq!(cursor.is_boundary(flags, 0), Ok(true));
/// cursor.set_cursor(12);
/// assert_eq!(cursor.is_boundary(flags, 0), Ok(false));
/// ```
pub fn is_boundary(
&mut self,
chunk: &str,
chunk_start: usize,
) -> Result<bool, GraphemeIncomplete> {
use crate::tables::grapheme as gr;
if self.state == GraphemeState::Break {
return Ok(true);
}
if self.state == GraphemeState::NotBreak {
return Ok(false);
}
if self.offset < chunk_start || self.offset >= chunk_start + chunk.len() {
if self.offset > chunk_start + chunk.len() || self.cat_after.is_none() {
return Err(GraphemeIncomplete::InvalidOffset);
}
}
if let Some(pre_context_offset) = self.pre_context_offset {
return Err(GraphemeIncomplete::PreContext(pre_context_offset));
}
let offset_in_chunk = self.offset - chunk_start;
if self.cat_after.is_none() {
let ch = chunk[offset_in_chunk..].chars().next().unwrap();
self.cat_after = Some(self.grapheme_category(ch));
}
if self.offset == chunk_start {
let mut need_pre_context = true;
match self.cat_after.unwrap() {
gr::GC_Regional_Indicator => self.state = GraphemeState::Regional,
gr::GC_Extended_Pictographic => self.state = GraphemeState::Emoji,
_ => need_pre_context = self.cat_before.is_none(),
}
if need_pre_context {
self.pre_context_offset = Some(chunk_start);
return Err(GraphemeIncomplete::PreContext(chunk_start));
}
}
if self.cat_before.is_none() {
let ch = chunk[..offset_in_chunk].chars().rev().next().unwrap();
self.cat_before = Some(self.grapheme_category(ch));
}
match check_pair(self.cat_before.unwrap(), self.cat_after.unwrap()) {
PairResult::NotBreak => return self.decision(false),
PairResult::Break => return self.decision(true),
PairResult::Extended => {
let is_extended = self.is_extended;
return self.decision(!is_extended);
}
PairResult::Regional => {
if let Some(ris_count) = self.ris_count {
return self.decision((ris_count % 2) == 0);
}
self.handle_regional(&chunk[..offset_in_chunk], chunk_start);
self.is_boundary_result()
}
PairResult::Emoji => {
self.handle_emoji(&chunk[..offset_in_chunk], chunk_start);
self.is_boundary_result()
}
}
}
#[inline]
/// Find the next boundary after the current cursor position. Only a part of
/// the string need be supplied. If the chunk is incomplete, then this
/// method might return `GraphemeIncomplete::PreContext` or
/// `GraphemeIncomplete::NextChunk`. In the former case, the caller should
/// call `provide_context` with the requested chunk, then retry. In the
/// latter case, the caller should provide the chunk following the one
/// given, then retry.
///
/// See `is_boundary` for expectations on the provided chunk.
///
/// ```rust
/// # use unicode_segmentation::GraphemeCursor;
/// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
/// let mut cursor = GraphemeCursor::new(4, flags.len(), false);
/// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(8)));
/// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(16)));
/// assert_eq!(cursor.next_boundary(flags, 0), Ok(None));
/// ```
///
/// And an example that uses partial strings:
///
/// ```rust
/// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
/// let s = "abcd";
/// let mut cursor = GraphemeCursor::new(0, s.len(), false);
/// assert_eq!(cursor.next_boundary(&s[..2], 0), Ok(Some(1)));
/// assert_eq!(cursor.next_boundary(&s[..2], 0), Err(GraphemeIncomplete::NextChunk));
/// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(2)));
/// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(3)));
/// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(4)));
/// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(None));
/// ```
pub fn next_boundary(
&mut self,
chunk: &str,
chunk_start: usize,
) -> Result<Option<usize>, GraphemeIncomplete> {
if self.offset == self.len {
return Ok(None);
}
let mut iter = chunk[self.offset - chunk_start..].chars();
let mut ch = iter.next().unwrap();
loop {
if self.resuming {
if self.cat_after.is_none() {
self.cat_after = Some(self.grapheme_category(ch));
}
} else {
self.offset += ch.len_utf8();
self.state = GraphemeState::Unknown;
self.cat_before = self.cat_after.take();
if self.cat_before.is_none() {
self.cat_before = Some(self.grapheme_category(ch));
}
if self.cat_before.unwrap() == GraphemeCat::GC_Regional_Indicator {
self.ris_count = self.ris_count.map(|c| c + 1);
} else {
self.ris_count = Some(0);
}
if let Some(next_ch) = iter.next() {
ch = next_ch;
self.cat_after = Some(self.grapheme_category(ch));
} else if self.offset == self.len {
self.decide(true);
} else {
self.resuming = true;
return Err(GraphemeIncomplete::NextChunk);
}
}
self.resuming = true;
if self.is_boundary(chunk, chunk_start)? {
self.resuming = false;
return Ok(Some(self.offset));
}
self.resuming = false;
}
}
/// Find the previous boundary before the current cursor position. Only a part
/// of the string need be supplied. If the chunk is incomplete, then this
/// method might return `GraphemeIncomplete::PreContext` or
/// `GraphemeIncomplete::PrevChunk`. In the former case, the caller should
/// call `provide_context` with the requested chunk, then retry. In the
/// latter case, the caller should provide the chunk preceding the one
/// given, then retry.
///
/// See `is_boundary` for expectations on the provided chunk.
///
/// ```rust
/// # use unicode_segmentation::GraphemeCursor;
/// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
/// let mut cursor = GraphemeCursor::new(12, flags.len(), false);
/// assert_eq!(cursor.prev_boundary(flags, 0), Ok(Some(8)));
/// assert_eq!(cursor.prev_boundary(flags, 0), Ok(Some(0)));
/// assert_eq!(cursor.prev_boundary(flags, 0), Ok(None));
/// ```
///
/// And an example that uses partial strings (note the exact return is not
/// guaranteed, and may be `PrevChunk` or `PreContext` arbitrarily):
///
/// ```rust
/// # use unicode_segmentation::{GraphemeCursor, GraphemeIncomplete};
/// let s = "abcd";
/// let mut cursor = GraphemeCursor::new(4, s.len(), false);
/// assert_eq!(cursor.prev_boundary(&s[2..4], 2), Ok(Some(3)));
/// assert_eq!(cursor.prev_boundary(&s[2..4], 2), Err(GraphemeIncomplete::PrevChunk));
/// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(2)));
/// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(1)));
/// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(0)));
/// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(None));
/// ```
pub fn prev_boundary(
&mut self,
chunk: &str,
chunk_start: usize,
) -> Result<Option<usize>, GraphemeIncomplete> {
if self.offset == 0 {
return Ok(None);
}
if self.offset == chunk_start {
return Err(GraphemeIncomplete::PrevChunk);
}
let mut iter = chunk[..self.offset - chunk_start].chars().rev();
let mut ch = iter.next().unwrap();
loop {
if self.offset == chunk_start {
self.resuming = true;
return Err(GraphemeIncomplete::PrevChunk);
}
if self.resuming {
self.cat_before = Some(self.grapheme_category(ch));
} else {
self.offset -= ch.len_utf8();
self.cat_after = self.cat_before.take();
self.state = GraphemeState::Unknown;
if let Some(ris_count) = self.ris_count {
self.ris_count = if ris_count > 0 {
Some(ris_count - 1)
} else {
None
};
}
if let Some(prev_ch) = iter.next() {
ch = prev_ch;
self.cat_before = Some(self.grapheme_category(ch));
} else if self.offset == 0 {
self.decide(true);
} else {
self.resuming = true;
self.cat_after = Some(self.grapheme_category(ch));
return Err(GraphemeIncomplete::PrevChunk);
}
}
self.resuming = true;
if self.is_boundary(chunk, chunk_start)? {
self.resuming = false;
return Ok(Some(self.offset));
}
self.resuming = false;
}
}
}
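// A sketch of one way to drive the chunk protocol end to end: feed
// fixed-size chunks, move forward on `NextChunk`, and replay earlier text on
// `PreContext`. The chunk size here is arbitrary, but the slicing below
// assumes chunk boundaries fall on char boundaries.
#[test]
fn test_grapheme_cursor_chunked_driver() {
    let s = "a\r\nb";
    let chunk_len = 2;
    let mut cursor = GraphemeCursor::new(0, s.len(), true);
    let mut boundaries = vec![0];
    let mut start = 0; // start offset of the chunk currently supplied
    loop {
        let end = core::cmp::min(start + chunk_len, s.len());
        match cursor.next_boundary(&s[start..end], start) {
            Ok(Some(boundary)) => boundaries.push(boundary),
            Ok(None) => break,
            Err(GraphemeIncomplete::NextChunk) => start = end,
            Err(GraphemeIncomplete::PreContext(offset)) => {
                // re-supply text ending at the requested offset
                cursor.provide_context(&s[..offset], 0);
            }
            Err(_) => unreachable!(),
        }
    }
    // "\r\n" spans the two chunks but still forms one cluster (GB3)
    assert_eq!(boundaries, [0, 1, 3, 4]);
}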
#[test]
fn test_grapheme_cursor_ris_precontext() {
let s = "\u{1f1fa}\u{1f1f8}\u{1f1fa}\u{1f1f8}\u{1f1fa}\u{1f1f8}";
let mut c = GraphemeCursor::new(8, s.len(), true);
assert_eq!(
c.is_boundary(&s[4..], 4),
Err(GraphemeIncomplete::PreContext(4))
);
c.provide_context(&s[..4], 0);
assert_eq!(c.is_boundary(&s[4..], 4), Ok(true));
}
#[test]
fn test_grapheme_cursor_chunk_start_require_precontext() {
let s = "\r\n";
let mut c = GraphemeCursor::new(1, s.len(), true);
assert_eq!(
c.is_boundary(&s[1..], 1),
Err(GraphemeIncomplete::PreContext(1))
);
c.provide_context(&s[..1], 0);
assert_eq!(c.is_boundary(&s[1..], 1), Ok(false));
}
#[test]
fn test_grapheme_cursor_prev_boundary() {
let s = "abcd";
let mut c = GraphemeCursor::new(3, s.len(), true);
assert_eq!(
c.prev_boundary(&s[2..], 2),
Err(GraphemeIncomplete::PrevChunk)
);
assert_eq!(c.prev_boundary(&s[..2], 0), Ok(Some(2)));
}
#[test]
fn test_grapheme_cursor_prev_boundary_chunk_start() {
let s = "abcd";
let mut c = GraphemeCursor::new(2, s.len(), true);
assert_eq!(
c.prev_boundary(&s[2..], 2),
Err(GraphemeIncomplete::PrevChunk)
);
assert_eq!(c.prev_boundary(&s[..2], 0), Ok(Some(1)));
}

View file

@@ -1,307 +0,0 @@
// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
//! Iterators which split strings on Grapheme Cluster, Word or Sentence boundaries, according
//! to the [Unicode Standard Annex #29](http://www.unicode.org/reports/tr29/) rules.
//!
//! ```rust
//! extern crate unicode_segmentation;
//!
//! use unicode_segmentation::UnicodeSegmentation;
//!
//! fn main() {
//! let s = "a̐éö̲\r\n";
//! let g = UnicodeSegmentation::graphemes(s, true).collect::<Vec<&str>>();
//! let b: &[_] = &["a̐", "é", "ö̲", "\r\n"];
//! assert_eq!(g, b);
//!
//! let s = "The quick (\"brown\") fox can't jump 32.3 feet, right?";
//! let w = s.unicode_words().collect::<Vec<&str>>();
//! let b: &[_] = &["The", "quick", "brown", "fox", "can't", "jump", "32.3", "feet", "right"];
//! assert_eq!(w, b);
//!
//! let s = "The quick (\"brown\") fox";
//! let w = s.split_word_bounds().collect::<Vec<&str>>();
//! let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox"];
//! assert_eq!(w, b);
//! }
//! ```
//!
//! # no_std
//!
//! unicode-segmentation does not depend on libstd, so it can be used in crates
//! with the `#![no_std]` attribute.
//!
//! # crates.io
//!
//! You can use this package in your project by adding the following
//! to your `Cargo.toml`:
//!
//! ```toml
//! [dependencies]
//! unicode-segmentation = "1.9.0"
//! ```
#![deny(missing_docs, unsafe_code)]
#![doc(
html_logo_url = "https://unicode-rs.github.io/unicode-rs_sm.png",
html_favicon_url = "https://unicode-rs.github.io/unicode-rs_sm.png"
)]
#![no_std]
#[cfg(test)]
#[macro_use]
extern crate std;
#[cfg(test)]
#[macro_use]
extern crate quickcheck;
pub use grapheme::{GraphemeCursor, GraphemeIncomplete};
pub use grapheme::{GraphemeIndices, Graphemes};
pub use sentence::{USentenceBoundIndices, USentenceBounds, UnicodeSentences};
pub use tables::UNICODE_VERSION;
pub use word::{UWordBoundIndices, UWordBounds, UnicodeWordIndices, UnicodeWords};
mod grapheme;
#[rustfmt::skip]
mod tables;
mod sentence;
mod word;
#[cfg(test)]
mod test;
#[cfg(test)]
mod testdata;
/// Methods for segmenting strings according to
/// [Unicode Standard Annex #29](http://www.unicode.org/reports/tr29/).
pub trait UnicodeSegmentation {
/// Returns an iterator over the [grapheme clusters][graphemes] of `self`.
///
/// [graphemes]: http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
///
/// If `is_extended` is true, the iterator is over the
/// *extended grapheme clusters*;
/// otherwise, the iterator is over the *legacy grapheme clusters*.
/// [UAX#29](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries)
/// recommends extended grapheme cluster boundaries for general processing.
///
/// # Examples
///
/// ```
/// # use self::unicode_segmentation::UnicodeSegmentation;
/// let gr1 = UnicodeSegmentation::graphemes("a\u{310}e\u{301}o\u{308}\u{332}", true)
/// .collect::<Vec<&str>>();
/// let b: &[_] = &["a\u{310}", "e\u{301}", "o\u{308}\u{332}"];
///
/// assert_eq!(&gr1[..], b);
///
/// let gr2 = UnicodeSegmentation::graphemes("a\r\nb🇷🇺🇸🇹", true).collect::<Vec<&str>>();
/// let b: &[_] = &["a", "\r\n", "b", "🇷🇺", "🇸🇹"];
///
/// assert_eq!(&gr2[..], b);
/// ```
fn graphemes<'a>(&'a self, is_extended: bool) -> Graphemes<'a>;
/// Returns an iterator over the grapheme clusters of `self` and their
/// byte offsets. See `graphemes()` for more information.
///
/// # Examples
///
/// ```
/// # use self::unicode_segmentation::UnicodeSegmentation;
/// let gr_inds = UnicodeSegmentation::grapheme_indices("a̐éö̲\r\n", true)
/// .collect::<Vec<(usize, &str)>>();
/// let b: &[_] = &[(0, "a̐"), (3, "é"), (6, "ö̲"), (11, "\r\n")];
///
/// assert_eq!(&gr_inds[..], b);
/// ```
fn grapheme_indices<'a>(&'a self, is_extended: bool) -> GraphemeIndices<'a>;
/// Returns an iterator over the words of `self`, separated on
/// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
///
/// Here, "words" are just those substrings which, after splitting on
/// UAX#29 word boundaries, contain any alphanumeric characters. That is, the
/// substring must contain at least one character with the
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
/// property, or with
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
///
/// # Example
///
/// ```
/// # use self::unicode_segmentation::UnicodeSegmentation;
/// let uws = "The quick (\"brown\") fox can't jump 32.3 feet, right?";
/// let uw1 = uws.unicode_words().collect::<Vec<&str>>();
/// let b: &[_] = &["The", "quick", "brown", "fox", "can't", "jump", "32.3", "feet", "right"];
///
/// assert_eq!(&uw1[..], b);
/// ```
fn unicode_words<'a>(&'a self) -> UnicodeWords<'a>;
/// Returns an iterator over the words of `self`, separated on
/// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), and their
/// offsets.
///
/// Here, "words" are just those substrings which, after splitting on
/// UAX#29 word boundaries, contain any alphanumeric characters. That is, the
/// substring must contain at least one character with the
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
/// property, or with
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
///
/// # Example
///
/// ```
/// # use self::unicode_segmentation::UnicodeSegmentation;
/// let uwis = "The quick (\"brown\") fox can't jump 32.3 feet, right?";
/// let uwi1 = uwis.unicode_word_indices().collect::<Vec<(usize, &str)>>();
/// let b: &[_] = &[(0, "The"), (4, "quick"), (12, "brown"), (20, "fox"), (24, "can't"),
/// (30, "jump"), (35, "32.3"), (40, "feet"), (46, "right")];
///
/// assert_eq!(&uwi1[..], b);
/// ```
fn unicode_word_indices<'a>(&'a self) -> UnicodeWordIndices<'a>;
/// Returns an iterator over substrings of `self` separated on
/// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
///
/// The concatenation of the substrings returned by this function is just the original string.
///
/// # Example
///
/// ```
/// # use self::unicode_segmentation::UnicodeSegmentation;
/// let swu1 = "The quick (\"brown\") fox".split_word_bounds().collect::<Vec<&str>>();
/// let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox"];
///
/// assert_eq!(&swu1[..], b);
/// ```
fn split_word_bounds<'a>(&'a self) -> UWordBounds<'a>;
/// Returns an iterator over substrings of `self`, split on UAX#29 word boundaries,
/// and their offsets. See `split_word_bounds()` for more information.
///
/// # Example
///
/// ```
/// # use self::unicode_segmentation::UnicodeSegmentation;
/// let swi1 = "Brr, it's 29.3°F!".split_word_bound_indices().collect::<Vec<(usize, &str)>>();
/// let b: &[_] = &[(0, "Brr"), (3, ","), (4, " "), (5, "it's"), (9, " "), (10, "29.3"),
/// (14, "°"), (16, "F"), (17, "!")];
///
/// assert_eq!(&swi1[..], b);
/// ```
fn split_word_bound_indices<'a>(&'a self) -> UWordBoundIndices<'a>;
/// Returns an iterator over substrings of `self` separated on
/// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
///
/// Here, "sentences" are just those substrings which, after splitting on
/// UAX#29 sentence boundaries, contain any alphanumeric characters. That is, the
/// substring must contain at least one character with the
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
/// property, or with
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
///
/// # Example
///
/// ```
/// # use self::unicode_segmentation::UnicodeSegmentation;
/// let uss = "Mr. Fox jumped. [...] The dog was too lazy.";
/// let us1 = uss.unicode_sentences().collect::<Vec<&str>>();
/// let b: &[_] = &["Mr. ", "Fox jumped. ", "The dog was too lazy."];
///
/// assert_eq!(&us1[..], b);
/// ```
fn unicode_sentences<'a>(&'a self) -> UnicodeSentences<'a>;
/// Returns an iterator over substrings of `self` separated on
/// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
///
/// The concatenation of the substrings returned by this function is just the original string.
///
/// # Example
///
/// ```
/// # use self::unicode_segmentation::UnicodeSegmentation;
/// let ssbs = "Mr. Fox jumped. [...] The dog was too lazy.";
/// let ssb1 = ssbs.split_sentence_bounds().collect::<Vec<&str>>();
/// let b: &[_] = &["Mr. ", "Fox jumped. ", "[...] ", "The dog was too lazy."];
///
/// assert_eq!(&ssb1[..], b);
/// ```
fn split_sentence_bounds<'a>(&'a self) -> USentenceBounds<'a>;
/// Returns an iterator over substrings of `self`, split on UAX#29 sentence boundaries,
/// and their offsets. See `split_sentence_bounds()` for more information.
///
/// # Example
///
/// ```
/// # use self::unicode_segmentation::UnicodeSegmentation;
/// let ssis = "Mr. Fox jumped. [...] The dog was too lazy.";
/// let ssi1 = ssis.split_sentence_bound_indices().collect::<Vec<(usize, &str)>>();
/// let b: &[_] = &[(0, "Mr. "), (4, "Fox jumped. "), (16, "[...] "),
/// (22, "The dog was too lazy.")];
///
/// assert_eq!(&ssi1[..], b);
/// ```
fn split_sentence_bound_indices<'a>(&'a self) -> USentenceBoundIndices<'a>;
}
impl UnicodeSegmentation for str {
#[inline]
fn graphemes(&self, is_extended: bool) -> Graphemes {
grapheme::new_graphemes(self, is_extended)
}
#[inline]
fn grapheme_indices(&self, is_extended: bool) -> GraphemeIndices {
grapheme::new_grapheme_indices(self, is_extended)
}
#[inline]
fn unicode_words(&self) -> UnicodeWords {
word::new_unicode_words(self)
}
#[inline]
fn unicode_word_indices(&self) -> UnicodeWordIndices {
word::new_unicode_word_indices(self)
}
#[inline]
fn split_word_bounds(&self) -> UWordBounds {
word::new_word_bounds(self)
}
#[inline]
fn split_word_bound_indices(&self) -> UWordBoundIndices {
word::new_word_bound_indices(self)
}
#[inline]
fn unicode_sentences(&self) -> UnicodeSentences {
sentence::new_unicode_sentences(self)
}
#[inline]
fn split_sentence_bounds(&self) -> USentenceBounds {
sentence::new_sentence_bounds(self)
}
#[inline]
fn split_sentence_bound_indices(&self) -> USentenceBoundIndices {
sentence::new_sentence_bound_indices(self)
}
}

View file

@@ -1,415 +0,0 @@
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use core::iter::Filter;
// All of the logic for forward iteration over sentences
mod fwd {
use crate::tables::sentence::SentenceCat;
use core::cmp;
// Describes one parsed part of the source string, per this table:
// https://unicode.org/reports/tr29/#Default_Sentence_Boundaries
#[derive(Clone, Copy, PartialEq, Eq)]
enum StatePart {
Sot,
Eot,
Other,
CR,
LF,
Sep,
ATerm,
UpperLower,
ClosePlus,
SpPlus,
STerm,
}
#[derive(Clone, PartialEq, Eq)]
struct SentenceBreaksState(pub [StatePart; 4]);
const INITIAL_STATE: SentenceBreaksState = SentenceBreaksState([
StatePart::Sot,
StatePart::Sot,
StatePart::Sot,
StatePart::Sot,
]);
#[derive(Clone)]
pub struct SentenceBreaks<'a> {
pub string: &'a str,
pos: usize,
state: SentenceBreaksState,
}
impl SentenceBreaksState {
// Attempt to advance the internal state by one part
// Whitespace and some punctuation will be collapsed
fn next(&self, cat: SentenceCat) -> SentenceBreaksState {
let &SentenceBreaksState(parts) = self;
let parts = match (parts[3], cat) {
(StatePart::ClosePlus, SentenceCat::SC_Close) => parts,
(StatePart::SpPlus, SentenceCat::SC_Sp) => parts,
_ => [
parts[1],
parts[2],
parts[3],
match cat {
SentenceCat::SC_CR => StatePart::CR,
SentenceCat::SC_LF => StatePart::LF,
SentenceCat::SC_Sep => StatePart::Sep,
SentenceCat::SC_ATerm => StatePart::ATerm,
SentenceCat::SC_Upper | SentenceCat::SC_Lower => StatePart::UpperLower,
SentenceCat::SC_Close => StatePart::ClosePlus,
SentenceCat::SC_Sp => StatePart::SpPlus,
SentenceCat::SC_STerm => StatePart::STerm,
_ => StatePart::Other,
},
],
};
SentenceBreaksState(parts)
}
fn end(&self) -> SentenceBreaksState {
let &SentenceBreaksState(parts) = self;
SentenceBreaksState([parts[1], parts[2], parts[3], StatePart::Eot])
}
// Helper function to check if state head matches a single `StatePart`
fn match1(&self, part: StatePart) -> bool {
let &SentenceBreaksState(parts) = self;
part == parts[3]
}
// Helper function to check if first two `StateParts` in state match
// the given two
fn match2(&self, part1: StatePart, part2: StatePart) -> bool {
let &SentenceBreaksState(parts) = self;
part1 == parts[2] && part2 == parts[3]
}
}
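// For example, feeding "c.) " one category at a time:
//   [Sot, Sot, Sot, Sot]
//     'c' -> [Sot, Sot, Sot, UpperLower]
//     '.' -> [Sot, Sot, UpperLower, ATerm]
//     ')' -> [Sot, UpperLower, ATerm, ClosePlus]
//     ' ' -> [UpperLower, ATerm, ClosePlus, SpPlus]
// Further Close or Sp characters collapse into the existing slot, so
// patterns such as SB8a's "SATerm Close* Sp*" always fit in the fixed
// four-slot window.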
// https://unicode.org/reports/tr29/#SB8
// TODO cache this, it is currently quadratic
fn match_sb8(state: &SentenceBreaksState, ahead: &str) -> bool {
let &SentenceBreaksState(parts) = state;
let mut idx = if parts[3] == StatePart::SpPlus { 2 } else { 3 };
if parts[idx] == StatePart::ClosePlus {
idx -= 1
}
if parts[idx] == StatePart::ATerm {
use crate::tables::sentence as se;
for next_char in ahead.chars() {
// ( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )* Lower
match se::sentence_category(next_char).2 {
se::SC_Lower => return true,
se::SC_OLetter
| se::SC_Upper
| se::SC_Sep
| se::SC_CR
| se::SC_LF
| se::SC_STerm
| se::SC_ATerm => return false,
_ => continue,
}
}
}
false
}
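// For example, in "etc. the" the lowercase 't' reached by this lookahead
// suppresses the break after the ATerm, while in "etc. The" the uppercase
// 'T' returns false here and SB11 then ends the sentence.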
// https://unicode.org/reports/tr29/#SB8a
fn match_sb8a(state: &SentenceBreaksState) -> bool {
// SATerm Close* Sp*
let &SentenceBreaksState(parts) = state;
let mut idx = if parts[3] == StatePart::SpPlus { 2 } else { 3 };
if parts[idx] == StatePart::ClosePlus {
idx -= 1
}
parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm
}
// https://unicode.org/reports/tr29/#SB9
fn match_sb9(state: &SentenceBreaksState) -> bool {
// SATerm Close*
let &SentenceBreaksState(parts) = state;
let idx = if parts[3] == StatePart::ClosePlus {
2
} else {
3
};
parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm
}
// https://unicode.org/reports/tr29/#SB11
fn match_sb11(state: &SentenceBreaksState) -> bool {
// SATerm Close* Sp* ParaSep?
let &SentenceBreaksState(parts) = state;
let mut idx = match parts[3] {
StatePart::Sep | StatePart::CR | StatePart::LF => 2,
_ => 3,
};
if parts[idx] == StatePart::SpPlus {
idx -= 1
}
if parts[idx] == StatePart::ClosePlus {
idx -= 1
}
parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm
}
impl<'a> Iterator for SentenceBreaks<'a> {
// Returns the index of the character which follows a break
type Item = usize;
#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
let slen = self.string.len();
// A sentence could be one character
(cmp::min(slen, 2), Some(slen + 1))
}
#[inline]
fn next(&mut self) -> Option<usize> {
use crate::tables::sentence as se;
for next_char in self.string[self.pos..].chars() {
let position_before = self.pos;
let state_before = self.state.clone();
let next_cat = se::sentence_category(next_char).2;
self.pos += next_char.len_utf8();
self.state = self.state.next(next_cat);
match next_cat {
// SB1 https://unicode.org/reports/tr29/#SB1
_ if state_before.match1(StatePart::Sot) => return Some(position_before),
// SB2 is handled when inner iterator (chars) is finished
// SB3 https://unicode.org/reports/tr29/#SB3
SentenceCat::SC_LF if state_before.match1(StatePart::CR) => continue,
// SB4 https://unicode.org/reports/tr29/#SB4
_ if state_before.match1(StatePart::Sep)
|| state_before.match1(StatePart::CR)
|| state_before.match1(StatePart::LF) =>
{
return Some(position_before)
}
// SB5 https://unicode.org/reports/tr29/#SB5
SentenceCat::SC_Extend | SentenceCat::SC_Format => self.state = state_before,
// SB6 https://unicode.org/reports/tr29/#SB6
SentenceCat::SC_Numeric if state_before.match1(StatePart::ATerm) => continue,
// SB7 https://unicode.org/reports/tr29/#SB7
SentenceCat::SC_Upper
if state_before.match2(StatePart::UpperLower, StatePart::ATerm) =>
{
continue
}
// SB8 https://unicode.org/reports/tr29/#SB8
_ if match_sb8(&state_before, &self.string[position_before..]) => continue,
// SB8a https://unicode.org/reports/tr29/#SB8a
SentenceCat::SC_SContinue | SentenceCat::SC_STerm | SentenceCat::SC_ATerm
if match_sb8a(&state_before) =>
{
continue
}
// SB9 https://unicode.org/reports/tr29/#SB9
SentenceCat::SC_Close
| SentenceCat::SC_Sp
| SentenceCat::SC_Sep
| SentenceCat::SC_CR
| SentenceCat::SC_LF
if match_sb9(&state_before) =>
{
continue
}
// SB10 https://unicode.org/reports/tr29/#SB10
SentenceCat::SC_Sp
| SentenceCat::SC_Sep
| SentenceCat::SC_CR
| SentenceCat::SC_LF
if match_sb8a(&state_before) =>
{
continue
}
// SB11 https://unicode.org/reports/tr29/#SB11
_ if match_sb11(&state_before) => return Some(position_before),
// SB998 https://unicode.org/reports/tr29/#SB998
_ => continue,
}
}
// SB2 https://unicode.org/reports/tr29/#SB2
if self.state.match1(StatePart::Sot) {
None
} else if self.state.match1(StatePart::Eot) {
None
} else {
self.state = self.state.end();
Some(self.pos)
}
}
}
pub fn new_sentence_breaks<'a>(source: &'a str) -> SentenceBreaks<'a> {
SentenceBreaks {
string: source,
pos: 0,
state: INITIAL_STATE,
}
}
}
/// An iterator over the substrings of a string which, after splitting the string on
/// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries),
/// contain any characters with the
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
/// property, or with
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
///
/// This struct is created by the [`unicode_sentences`] method on the [`UnicodeSegmentation`]
/// trait. See its documentation for more.
///
/// [`unicode_sentences`]: trait.UnicodeSegmentation.html#tymethod.unicode_sentences
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Clone)]
pub struct UnicodeSentences<'a> {
inner: Filter<USentenceBounds<'a>, fn(&&str) -> bool>,
}
/// External iterator for a string's
/// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
///
/// This struct is created by the [`split_sentence_bounds`] method on the [`UnicodeSegmentation`]
/// trait. See its documentation for more.
///
/// [`split_sentence_bounds`]: trait.UnicodeSegmentation.html#tymethod.split_sentence_bounds
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Clone)]
pub struct USentenceBounds<'a> {
iter: fwd::SentenceBreaks<'a>,
sentence_start: Option<usize>,
}
/// External iterator for sentence boundaries and byte offsets.
///
/// This struct is created by the [`split_sentence_bound_indices`] method on the
/// [`UnicodeSegmentation`] trait. See its documentation for more.
///
/// [`split_sentence_bound_indices`]: trait.UnicodeSegmentation.html#tymethod.split_sentence_bound_indices
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Clone)]
pub struct USentenceBoundIndices<'a> {
start_offset: usize,
iter: USentenceBounds<'a>,
}
#[inline]
pub fn new_sentence_bounds<'a>(source: &'a str) -> USentenceBounds<'a> {
USentenceBounds {
iter: fwd::new_sentence_breaks(source),
sentence_start: None,
}
}
#[inline]
pub fn new_sentence_bound_indices<'a>(source: &'a str) -> USentenceBoundIndices<'a> {
USentenceBoundIndices {
start_offset: source.as_ptr() as usize,
iter: new_sentence_bounds(source),
}
}
#[inline]
pub fn new_unicode_sentences<'b>(s: &'b str) -> UnicodeSentences<'b> {
use super::UnicodeSegmentation;
use crate::tables::util::is_alphanumeric;
fn has_alphanumeric(s: &&str) -> bool {
s.chars().any(|c| is_alphanumeric(c))
}
let has_alphanumeric: fn(&&str) -> bool = has_alphanumeric; // coerce to fn pointer
UnicodeSentences {
inner: s.split_sentence_bounds().filter(has_alphanumeric),
}
}
impl<'a> Iterator for UnicodeSentences<'a> {
type Item = &'a str;
#[inline]
fn next(&mut self) -> Option<&'a str> {
self.inner.next()
}
}
impl<'a> Iterator for USentenceBounds<'a> {
type Item = &'a str;
#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
let (lower, upper) = self.iter.size_hint();
// the inner hints may be 0, so saturate to avoid usize underflow
(lower.saturating_sub(1), upper.map(|u| u.saturating_sub(1)))
}
#[inline]
fn next(&mut self) -> Option<&'a str> {
if self.sentence_start.is_none() {
if let Some(start_pos) = self.iter.next() {
self.sentence_start = Some(start_pos)
} else {
return None;
}
}
if let Some(break_pos) = self.iter.next() {
let start_pos = self.sentence_start.unwrap();
let sentence = &self.iter.string[start_pos..break_pos];
self.sentence_start = Some(break_pos);
Some(sentence)
} else {
None
}
}
}
impl<'a> Iterator for USentenceBoundIndices<'a> {
type Item = (usize, &'a str);
#[inline]
fn next(&mut self) -> Option<(usize, &'a str)> {
self.iter
.next()
.map(|s| (s.as_ptr() as usize - self.start_offset, s))
}
#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
self.iter.size_hint()
}
}

Diff not shown because of its large size.

View file

@@ -1,247 +0,0 @@
// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use super::UnicodeSegmentation;
use std::prelude::v1::*;
#[test]
fn test_graphemes() {
use crate::testdata::{TEST_DIFF, TEST_SAME};
pub const EXTRA_DIFF: &'static [(
&'static str,
&'static [&'static str],
&'static [&'static str],
)] = &[
// Official test suite doesn't include two Prepend chars between two other chars.
(
"\u{20}\u{600}\u{600}\u{20}",
&["\u{20}", "\u{600}\u{600}\u{20}"],
&["\u{20}", "\u{600}", "\u{600}", "\u{20}"],
),
// Test for Prepend followed by two Any chars
(
"\u{600}\u{20}\u{20}",
&["\u{600}\u{20}", "\u{20}"],
&["\u{600}", "\u{20}", "\u{20}"],
),
];
pub const EXTRA_SAME: &'static [(&'static str, &'static [&'static str])] = &[
// family emoji (more than two emoji joined by ZWJ)
(
"\u{1f468}\u{200d}\u{1f467}\u{200d}\u{1f466}",
&["\u{1f468}\u{200d}\u{1f467}\u{200d}\u{1f466}"],
),
// cartwheel emoji followed by two fitzpatrick skin tone modifiers
// (test case from issue #19)
(
"\u{1F938}\u{1F3FE}\u{1F3FE}",
&["\u{1F938}\u{1F3FE}\u{1F3FE}"],
),
];
for &(s, g) in TEST_SAME.iter().chain(EXTRA_SAME) {
// test forward iterator
assert!(UnicodeSegmentation::graphemes(s, true).eq(g.iter().cloned()));
assert!(UnicodeSegmentation::graphemes(s, false).eq(g.iter().cloned()));
// test reverse iterator
assert!(UnicodeSegmentation::graphemes(s, true)
.rev()
.eq(g.iter().rev().cloned()));
assert!(UnicodeSegmentation::graphemes(s, false)
.rev()
.eq(g.iter().rev().cloned()));
}
for &(s, gt, gf) in TEST_DIFF.iter().chain(EXTRA_DIFF) {
// test forward iterator
assert!(UnicodeSegmentation::graphemes(s, true).eq(gt.iter().cloned()));
assert!(UnicodeSegmentation::graphemes(s, false).eq(gf.iter().cloned()));
// test reverse iterator
assert!(UnicodeSegmentation::graphemes(s, true)
.rev()
.eq(gt.iter().rev().cloned()));
assert!(UnicodeSegmentation::graphemes(s, false)
.rev()
.eq(gf.iter().rev().cloned()));
}
// test the indices iterators
let s = "a̐éö̲\r\n";
let gr_inds = UnicodeSegmentation::grapheme_indices(s, true).collect::<Vec<(usize, &str)>>();
let b: &[_] = &[(0, ""), (3, ""), (6, "ö̲"), (11, "\r\n")];
assert_eq!(gr_inds, b);
let gr_inds = UnicodeSegmentation::grapheme_indices(s, true)
.rev()
.collect::<Vec<(usize, &str)>>();
let b: &[_] = &[(11, "\r\n"), (6, "ö̲"), (3, ""), (0, "")];
assert_eq!(gr_inds, b);
let mut gr_inds_iter = UnicodeSegmentation::grapheme_indices(s, true);
{
let gr_inds = gr_inds_iter.by_ref();
let e1 = gr_inds.size_hint();
assert_eq!(e1, (1, Some(13)));
let c = gr_inds.count();
assert_eq!(c, 4);
}
let e2 = gr_inds_iter.size_hint();
assert_eq!(e2, (0, Some(0)));
// make sure the reverse iterator does the right thing with "\n" at beginning of string
let s = "\n\r\n\r";
let gr = UnicodeSegmentation::graphemes(s, true)
.rev()
.collect::<Vec<&str>>();
let b: &[_] = &["\r", "\r\n", "\n"];
assert_eq!(gr, b);
}
#[test]
fn test_words() {
use crate::testdata::TEST_WORD;
// Unicode's official tests don't really test longer chains of flag emoji
// TODO This could be improved with more tests like flag emoji with interspersed Extend chars and ZWJ
const EXTRA_TESTS: &'static [(&'static str, &'static [&'static str])] = &[
(
"🇦🇫🇦🇽🇦🇱🇩🇿🇦🇸🇦🇩🇦🇴",
&["🇦🇫", "🇦🇽", "🇦🇱", "🇩🇿", "🇦🇸", "🇦🇩", "🇦🇴"],
),
("🇦🇫🇦🇽🇦🇱🇩🇿🇦🇸🇦🇩🇦", &["🇦🇫", "🇦🇽", "🇦🇱", "🇩🇿", "🇦🇸", "🇦🇩", "🇦"]),
(
"🇦a🇫🇦🇽a🇦🇱🇩🇿🇦🇸🇦🇩🇦",
&["🇦", "a", "🇫🇦", "🇽", "a", "🇦🇱", "🇩🇿", "🇦🇸", "🇦🇩", "🇦"],
),
(
"\u{1f468}\u{200d}\u{1f468}\u{200d}\u{1f466}",
&["\u{1f468}\u{200d}\u{1f468}\u{200d}\u{1f466}"],
),
("😌👎🏼", &["😌", "👎🏼"]),
// perhaps wrong, spaces should not be included?
("hello world", &["hello", " ", "world"]),
("🇨🇦🇨🇭🇿🇲🇿 hi", &["🇨🇦", "🇨🇭", "🇿🇲", "🇿", " ", "hi"]),
];
for &(s, w) in TEST_WORD.iter().chain(EXTRA_TESTS.iter()) {
macro_rules! assert_ {
($test:expr, $exp:expr, $name:expr) => {
// collect into vector for better diagnostics in failure case
let testing = $test.collect::<Vec<_>>();
let expected = $exp.collect::<Vec<_>>();
assert_eq!(
testing, expected,
"{} test for testcase ({:?}, {:?}) failed.",
$name, s, w
)
};
}
// test forward iterator
assert_!(
s.split_word_bounds(),
w.iter().cloned(),
"Forward word boundaries"
);
// test reverse iterator
assert_!(
s.split_word_bounds().rev(),
w.iter().rev().cloned(),
"Reverse word boundaries"
);
// generate offsets from word string lengths
let mut indices = vec![0];
for i in w.iter().cloned().map(|s| s.len()).scan(0, |t, n| {
*t += n;
Some(*t)
}) {
indices.push(i);
}
indices.pop();
let indices = indices;
// test forward indices iterator
assert_!(
s.split_word_bound_indices().map(|(l, _)| l),
indices.iter().cloned(),
"Forward word indices"
);
// test backward indices iterator
assert_!(
s.split_word_bound_indices().rev().map(|(l, _)| l),
indices.iter().rev().cloned(),
"Reverse word indices"
);
}
}
#[test]
fn test_sentences() {
use crate::testdata::TEST_SENTENCE;
for &(s, w) in TEST_SENTENCE.iter() {
macro_rules! assert_ {
($test:expr, $exp:expr, $name:expr) => {
// collect into vector for better diagnostics in failure case
let testing = $test.collect::<Vec<_>>();
let expected = $exp.collect::<Vec<_>>();
assert_eq!(
testing, expected,
"{} test for testcase ({:?}, {:?}) failed.",
$name, s, w
)
};
}
assert_!(
s.split_sentence_bounds(),
w.iter().cloned(),
"Forward sentence boundaries"
);
}
}
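// An illustrative check of the SB8 lookahead (not from the official test
// data): a lowercase continuation keeps an abbreviation-style period from
// ending the sentence, while an uppercase start allows the break.
#[test]
fn test_sentences_sb8_lowercase_continuation() {
    let s = "He owns dogs, cats, etc. and more. The end.";
    let sentences = s.split_sentence_bounds().collect::<Vec<&str>>();
    let b: &[_] = &["He owns dogs, cats, etc. and more. ", "The end."];
    assert_eq!(sentences, b);
}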
quickcheck! {
fn quickcheck_forward_reverse_graphemes_extended(s: String) -> bool {
let a = s.graphemes(true).collect::<Vec<_>>();
let mut b = s.graphemes(true).rev().collect::<Vec<_>>();
b.reverse();
a == b
}
fn quickcheck_forward_reverse_graphemes_legacy(s: String) -> bool {
let a = s.graphemes(false).collect::<Vec<_>>();
let mut b = s.graphemes(false).rev().collect::<Vec<_>>();
b.reverse();
a == b
}
fn quickcheck_join_graphemes(s: String) -> bool {
let a = s.graphemes(true).collect::<String>();
let b = s.graphemes(false).collect::<String>();
a == s && b == s
}
fn quickcheck_forward_reverse_words(s: String) -> bool {
let a = s.split_word_bounds().collect::<Vec<_>>();
let mut b = s.split_word_bounds().rev().collect::<Vec<_>>();
b.reverse();
a == b
}
fn quickcheck_join_words(s: String) -> bool {
let a = s.split_word_bounds().collect::<String>();
a == s
}
}

Diff not shown because of its large size.

View file

@@ -1,754 +0,0 @@
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use core::cmp;
use core::iter::Filter;
use crate::tables::word::WordCat;
/// An iterator over the substrings of a string which, after splitting the string on
/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries),
/// contain any characters with the
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
/// property, or with
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
///
/// This struct is created by the [`unicode_words`] method on the [`UnicodeSegmentation`] trait. See
/// its documentation for more.
///
/// [`unicode_words`]: trait.UnicodeSegmentation.html#tymethod.unicode_words
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
pub struct UnicodeWords<'a> {
inner: Filter<UWordBounds<'a>, fn(&&str) -> bool>,
}
impl<'a> Iterator for UnicodeWords<'a> {
type Item = &'a str;
#[inline]
fn next(&mut self) -> Option<&'a str> {
self.inner.next()
}
}
impl<'a> DoubleEndedIterator for UnicodeWords<'a> {
#[inline]
fn next_back(&mut self) -> Option<&'a str> {
self.inner.next_back()
}
}
/// An iterator over the substrings of a string which, after splitting the string on
/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries),
/// contain any characters with the
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
/// property, or with
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
/// This iterator also provides the byte offsets for each substring.
///
/// This struct is created by the [`unicode_word_indices`] method on the [`UnicodeSegmentation`] trait. See
/// its documentation for more.
///
/// [`unicode_word_indices`]: trait.UnicodeSegmentation.html#tymethod.unicode_word_indices
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
pub struct UnicodeWordIndices<'a> {
inner: Filter<UWordBoundIndices<'a>, fn(&(usize, &str)) -> bool>,
}
impl<'a> Iterator for UnicodeWordIndices<'a> {
type Item = (usize, &'a str);
#[inline]
fn next(&mut self) -> Option<(usize, &'a str)> {
self.inner.next()
}
}
impl<'a> DoubleEndedIterator for UnicodeWordIndices<'a> {
#[inline]
fn next_back(&mut self) -> Option<(usize, &'a str)> {
self.inner.next_back()
}
}
/// External iterator for a string's
/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
///
/// This struct is created by the [`split_word_bounds`] method on the [`UnicodeSegmentation`]
/// trait. See its documentation for more.
///
/// [`split_word_bounds`]: trait.UnicodeSegmentation.html#tymethod.split_word_bounds
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Clone)]
pub struct UWordBounds<'a> {
string: &'a str,
cat: Option<WordCat>,
catb: Option<WordCat>,
}
/// External iterator for word boundaries and byte offsets.
///
/// This struct is created by the [`split_word_bound_indices`] method on the
/// [`UnicodeSegmentation`] trait. See its documentation for more.
///
/// [`split_word_bound_indices`]: trait.UnicodeSegmentation.html#tymethod.split_word_bound_indices
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Clone)]
pub struct UWordBoundIndices<'a> {
start_offset: usize,
iter: UWordBounds<'a>,
}
impl<'a> UWordBoundIndices<'a> {
#[inline]
/// View the underlying data (the part yet to be iterated) as a slice of the original string.
///
/// ```rust
/// # use unicode_segmentation::UnicodeSegmentation;
/// let mut iter = "Hello world".split_word_bound_indices();
/// assert_eq!(iter.as_str(), "Hello world");
/// iter.next();
/// assert_eq!(iter.as_str(), " world");
/// iter.next();
/// assert_eq!(iter.as_str(), "world");
/// ```
pub fn as_str(&self) -> &'a str {
self.iter.as_str()
}
}
impl<'a> Iterator for UWordBoundIndices<'a> {
type Item = (usize, &'a str);
#[inline]
fn next(&mut self) -> Option<(usize, &'a str)> {
self.iter
.next()
.map(|s| (s.as_ptr() as usize - self.start_offset, s))
}
#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
self.iter.size_hint()
}
}
impl<'a> DoubleEndedIterator for UWordBoundIndices<'a> {
#[inline]
fn next_back(&mut self) -> Option<(usize, &'a str)> {
self.iter
.next_back()
.map(|s| (s.as_ptr() as usize - self.start_offset, s))
}
}
// state machine for word boundary rules
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum UWordBoundsState {
Start,
Letter,
HLetter,
Numeric,
Katakana,
ExtendNumLet,
Regional(RegionalState),
FormatExtend(FormatExtendType),
Zwj,
Emoji,
WSegSpace,
}
// subtypes for FormatExtend state in UWordBoundsState
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum FormatExtendType {
AcceptAny,
AcceptNone,
RequireLetter,
RequireHLetter,
AcceptQLetter,
RequireNumeric,
}
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum RegionalState {
Half,
Full,
Unknown,
}
fn is_emoji(ch: char) -> bool {
use crate::tables::emoji;
emoji::emoji_category(ch).2 == emoji::EmojiCat::EC_Extended_Pictographic
}
impl<'a> Iterator for UWordBounds<'a> {
type Item = &'a str;
#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
let slen = self.string.len();
(cmp::min(slen, 1), Some(slen))
}
#[inline]
fn next(&mut self) -> Option<&'a str> {
use self::FormatExtendType::*;
use self::UWordBoundsState::*;
use crate::tables::word as wd;
if self.string.len() == 0 {
return None;
}
let mut take_curr = true;
let mut take_cat = true;
let mut idx = 0;
let mut saveidx = 0;
let mut state = Start;
let mut cat = wd::WC_Any;
let mut savecat = wd::WC_Any;
// If extend/format/zwj were skipped. Handles precedence of WB3d over WB4
let mut skipped_format_extend = false;
for (curr, ch) in self.string.char_indices() {
idx = curr;
// Whether or not the previous category was ZWJ
// ZWJs get collapsed, so this handles precedence of WB3c over WB4
let prev_zwj = cat == wd::WC_ZWJ;
// if there's a category cached, grab it
cat = match self.cat {
None => wd::word_category(ch).2,
_ => self.cat.take().unwrap(),
};
take_cat = true;
// handle rule WB4
// just skip all format, extend, and zwj chars
// note that Start is a special case: if there's a bunch of Format | Extend
// characters at the beginning of a block of text, dump them out as one unit.
//
// (This is not obvious from the wording of UAX#29, but if you look at the
// test cases http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt
// then the "correct" interpretation of WB4 becomes apparent.)
if state != Start {
match cat {
wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => {
skipped_format_extend = true;
continue;
}
_ => {}
}
}
// rule WB3c
// WB4 makes all ZWJs collapse into the previous state
// but you can still be in a Zwj state if you started with Zwj
//
// This means that an EP + ZWJ would collapse into EP, which is wrong,
// since EP+EP is a boundary but EP+ZWJ+EP is not
//
// Thus, we separately keep track of whether or not the last character
// was a ZWJ. This is an additional bit of state tracked outside of the
// state enum; the state enum represents the last non-zwj state encountered.
// When prev_zwj is true, for the purposes of WB3c, we are in the Zwj state,
// however we are in the previous state for the purposes of all other rules.
if prev_zwj {
if is_emoji(ch) {
state = Emoji;
continue;
}
}
// Don't use `continue` in this match without updating `cat`
state = match state {
Start if cat == wd::WC_CR => {
idx += match self.get_next_cat(idx) {
Some(ncat) if ncat == wd::WC_LF => 1, // rule WB3
_ => 0,
};
break; // rule WB3a
}
Start => match cat {
wd::WC_ALetter => Letter, // rule WB5, WB6, WB9, WB13a
wd::WC_Hebrew_Letter => HLetter, // rule WB5, WB6, WB7a, WB7b, WB9, WB13a
wd::WC_Numeric => Numeric, // rule WB8, WB10, WB12, WB13a
wd::WC_Katakana => Katakana, // rule WB13, WB13a
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a, WB13b
wd::WC_Regional_Indicator => Regional(RegionalState::Half), // rule WB13c
wd::WC_LF | wd::WC_Newline => break, // rule WB3a
wd::WC_ZWJ => Zwj, // rule WB3c
wd::WC_WSegSpace => WSegSpace, // rule WB3d
_ => {
if let Some(ncat) = self.get_next_cat(idx) {
// rule WB4
if ncat == wd::WC_Format || ncat == wd::WC_Extend || ncat == wd::WC_ZWJ
{
state = FormatExtend(AcceptNone);
self.cat = Some(ncat);
continue;
}
}
break; // rule WB999
}
},
WSegSpace => match cat {
wd::WC_WSegSpace if !skipped_format_extend => WSegSpace,
_ => {
take_curr = false;
break;
}
},
Zwj => {
// We already handle WB3c above.
take_curr = false;
break;
}
Letter | HLetter => match cat {
wd::WC_ALetter => Letter, // rule WB5
wd::WC_Hebrew_Letter => HLetter, // rule WB5
wd::WC_Numeric => Numeric, // rule WB9
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
wd::WC_Double_Quote if state == HLetter => {
savecat = cat;
saveidx = idx;
FormatExtend(RequireHLetter) // rule WB7b
}
wd::WC_Single_Quote if state == HLetter => {
FormatExtend(AcceptQLetter) // rule WB7a
}
wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => {
savecat = cat;
saveidx = idx;
FormatExtend(RequireLetter) // rule WB6
}
_ => {
take_curr = false;
break;
}
},
Numeric => match cat {
wd::WC_Numeric => Numeric, // rule WB8
wd::WC_ALetter => Letter, // rule WB10
wd::WC_Hebrew_Letter => HLetter, // rule WB10
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => {
savecat = cat;
saveidx = idx;
FormatExtend(RequireNumeric) // rule WB12
}
_ => {
take_curr = false;
break;
}
},
Katakana => match cat {
wd::WC_Katakana => Katakana, // rule WB13
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
_ => {
take_curr = false;
break;
}
},
ExtendNumLet => match cat {
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
wd::WC_ALetter => Letter, // rule WB13b
wd::WC_Hebrew_Letter => HLetter, // rule WB13b
wd::WC_Numeric => Numeric, // rule WB13b
wd::WC_Katakana => Katakana, // rule WB13b
_ => {
take_curr = false;
break;
}
},
Regional(RegionalState::Full) => {
// if it reaches here we've gone too far:
// a full flag can only compose with ZWJ/Extend/Format
// following it.
take_curr = false;
break;
}
Regional(RegionalState::Half) => match cat {
wd::WC_Regional_Indicator => Regional(RegionalState::Full), // rule WB13c
_ => {
take_curr = false;
break;
}
},
Regional(_) => {
unreachable!("RegionalState::Unknown should not occur on forward iteration")
}
Emoji => {
// We already handle WB3c above. If you've reached this point, the emoji sequence is over.
take_curr = false;
break;
}
FormatExtend(t) => match t {
// handle FormatExtends depending on what type
RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB11
RequireLetter | AcceptQLetter if cat == wd::WC_ALetter => Letter, // rule WB7
RequireLetter | AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // WB7a
RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7b
AcceptNone | AcceptQLetter => {
take_curr = false; // emit all the Format|Extend characters
take_cat = false;
break;
}
_ => break, // rewind (in if statement below)
},
}
}
if let FormatExtend(t) = state {
// we were looking for something and didn't find it; we have to back up
if t == RequireLetter || t == RequireHLetter || t == RequireNumeric {
idx = saveidx;
cat = savecat;
take_curr = false;
}
}
self.cat = if take_curr {
idx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
None
} else if take_cat {
Some(cat)
} else {
None
};
let retstr = &self.string[..idx];
self.string = &self.string[idx..];
Some(retstr)
}
}
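// For example (mirroring the tests): WB13c pairs regional indicators into
// flags and WB3c keeps ZWJ emoji sequences together, so "🇨🇦🇨🇭🇿🇲🇿 hi"
// splits into ["🇨🇦", "🇨🇭", "🇿🇲", "🇿", " ", "hi"] and
// "\u{1f468}\u{200d}\u{1f468}\u{200d}\u{1f466}" stays a single segment.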
impl<'a> DoubleEndedIterator for UWordBounds<'a> {
#[inline]
fn next_back(&mut self) -> Option<&'a str> {
use self::FormatExtendType::*;
use self::UWordBoundsState::*;
use crate::tables::word as wd;
if self.string.len() == 0 {
return None;
}
let mut take_curr = true;
let mut take_cat = true;
let mut idx = self.string.len();
idx -= self.string.chars().next_back().unwrap().len_utf8();
let mut previdx = idx;
let mut saveidx = idx;
let mut state = Start;
let mut savestate = Start;
let mut cat = wd::WC_Any;
let mut skipped_format_extend = false;
for (curr, ch) in self.string.char_indices().rev() {
previdx = idx;
idx = curr;
// if there's a category cached, grab it
cat = match self.catb {
None => wd::word_category(ch).2,
_ => self.catb.take().unwrap(),
};
take_cat = true;
// backward iterator over word boundaries. Mostly the same as the forward
// iterator, with two weirdnesses:
// (1) If we encounter a single quote in the Start state, we have to check for a
// Hebrew Letter immediately before it.
// (2) Format and Extend char handling takes some gymnastics.
if cat == wd::WC_Extend || cat == wd::WC_Format || (cat == wd::WC_ZWJ && state != Zwj) {
// WB3c has more priority so we should not
// fold in that case
if match state {
FormatExtend(_) | Start => false,
_ => true,
} {
saveidx = previdx;
savestate = state;
state = FormatExtend(AcceptNone);
}
if state != Start {
continue;
}
} else if state == FormatExtend(AcceptNone) {
// finished a scan of some Format|Extend chars, restore previous state
state = savestate;
previdx = saveidx;
take_cat = false;
skipped_format_extend = true;
}
// Don't use `continue` in this match without updating `catb`
state = match state {
Start | FormatExtend(AcceptAny) => match cat {
_ if is_emoji(ch) => Zwj,
wd::WC_ALetter => Letter, // rule WB5, WB7, WB10, WB13b
wd::WC_Hebrew_Letter => HLetter, // rule WB5, WB7, WB7c, WB10, WB13b
wd::WC_Numeric => Numeric, // rule WB8, WB9, WB11, WB13b
wd::WC_Katakana => Katakana, // rule WB13, WB13b
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
wd::WC_Regional_Indicator => Regional(RegionalState::Unknown), // rule WB13c
// rule WB4:
wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => FormatExtend(AcceptAny),
wd::WC_Single_Quote => {
saveidx = idx;
FormatExtend(AcceptQLetter) // rule WB7a
}
wd::WC_WSegSpace => WSegSpace,
wd::WC_CR | wd::WC_LF | wd::WC_Newline => {
if state == Start {
if cat == wd::WC_LF {
idx -= match self.get_prev_cat(idx) {
Some(pcat) if pcat == wd::WC_CR => 1, // rule WB3
_ => 0,
};
}
} else {
take_curr = false;
}
break; // rule WB3a
}
_ => break, // rule WB999
},
Zwj => match cat {
// rule WB3c
wd::WC_ZWJ => FormatExtend(AcceptAny),
_ => {
take_curr = false;
break;
}
},
WSegSpace => match cat {
// rule WB3d
wd::WC_WSegSpace if !skipped_format_extend => WSegSpace,
_ => {
take_curr = false;
break;
}
},
Letter | HLetter => match cat {
wd::WC_ALetter => Letter, // rule WB5
wd::WC_Hebrew_Letter => HLetter, // rule WB5
wd::WC_Numeric => Numeric, // rule WB10
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
wd::WC_Double_Quote if state == HLetter => {
saveidx = previdx;
FormatExtend(RequireHLetter) // rule WB7c
}
wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => {
saveidx = previdx;
FormatExtend(RequireLetter) // rule WB7
}
_ => {
take_curr = false;
break;
}
},
Numeric => match cat {
wd::WC_Numeric => Numeric, // rule WB8
wd::WC_ALetter => Letter, // rule WB9
wd::WC_Hebrew_Letter => HLetter, // rule WB9
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => {
saveidx = previdx;
FormatExtend(RequireNumeric) // rule WB11
}
_ => {
take_curr = false;
break;
}
},
Katakana => match cat {
wd::WC_Katakana => Katakana, // rule WB13
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
_ => {
take_curr = false;
break;
}
},
ExtendNumLet => match cat {
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
wd::WC_ALetter => Letter, // rule WB13a
wd::WC_Hebrew_Letter => HLetter, // rule WB13a
wd::WC_Numeric => Numeric, // rule WB13a
wd::WC_Katakana => Katakana, // rule WB13a
_ => {
take_curr = false;
break;
}
},
Regional(mut regional_state) => match cat {
// rule WB13c
wd::WC_Regional_Indicator => {
if regional_state == RegionalState::Unknown {
let count = self.string[..previdx]
.chars()
.rev()
.map(|c| wd::word_category(c).2)
.filter(|&c| {
!(c == wd::WC_ZWJ || c == wd::WC_Extend || c == wd::WC_Format)
})
.take_while(|&c| c == wd::WC_Regional_Indicator)
.count();
regional_state = if count % 2 == 0 {
RegionalState::Full
} else {
RegionalState::Half
};
}
if regional_state == RegionalState::Full {
take_curr = false;
break;
} else {
Regional(RegionalState::Full)
}
}
_ => {
take_curr = false;
break;
}
},
Emoji => {
if is_emoji(ch) {
// rule WB3c
Zwj
} else {
take_curr = false;
break;
}
}
FormatExtend(t) => match t {
RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB12
RequireLetter if cat == wd::WC_ALetter => Letter, // rule WB6
RequireLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB6
AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7a
RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7b
_ => break, // backtrack happens in the if-let below
},
}
}
if let FormatExtend(t) = state {
// if we required something but didn't find it, backtrack
if t == RequireLetter
|| t == RequireHLetter
|| t == RequireNumeric
|| t == AcceptNone
|| t == AcceptQLetter
{
previdx = saveidx;
take_cat = false;
take_curr = false;
}
}
self.catb = if take_curr {
None
} else {
idx = previdx;
if take_cat {
Some(cat)
} else {
None
}
};
let retstr = &self.string[idx..];
self.string = &self.string[..idx];
Some(retstr)
}
}
impl<'a> UWordBounds<'a> {
#[inline]
/// View the underlying data (the part yet to be iterated) as a slice of the original string.
///
/// ```rust
/// # use unicode_segmentation::UnicodeSegmentation;
/// let mut iter = "Hello world".split_word_bounds();
/// assert_eq!(iter.as_str(), "Hello world");
/// iter.next();
/// assert_eq!(iter.as_str(), " world");
/// iter.next();
/// assert_eq!(iter.as_str(), "world");
/// ```
pub fn as_str(&self) -> &'a str {
self.string
}
#[inline]
fn get_next_cat(&self, idx: usize) -> Option<WordCat> {
use crate::tables::word as wd;
let nidx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
if nidx < self.string.len() {
let nch = self.string[nidx..].chars().next().unwrap();
Some(wd::word_category(nch).2)
} else {
None
}
}
#[inline]
fn get_prev_cat(&self, idx: usize) -> Option<WordCat> {
use crate::tables::word as wd;
if idx > 0 {
let nch = self.string[..idx].chars().next_back().unwrap();
Some(wd::word_category(nch).2)
} else {
None
}
}
}
#[inline]
pub fn new_word_bounds<'b>(s: &'b str) -> UWordBounds<'b> {
UWordBounds {
string: s,
cat: None,
catb: None,
}
}
#[inline]
pub fn new_word_bound_indices<'b>(s: &'b str) -> UWordBoundIndices<'b> {
UWordBoundIndices {
start_offset: s.as_ptr() as usize,
iter: new_word_bounds(s),
}
}
#[inline]
fn has_alphanumeric(s: &&str) -> bool {
use crate::tables::util::is_alphanumeric;
s.chars().any(|c| is_alphanumeric(c))
}
#[inline]
pub fn new_unicode_words<'b>(s: &'b str) -> UnicodeWords<'b> {
use super::UnicodeSegmentation;
UnicodeWords {
inner: s.split_word_bounds().filter(has_alphanumeric),
}
}
#[inline]
pub fn new_unicode_word_indices<'b>(s: &'b str) -> UnicodeWordIndices<'b> {
use super::UnicodeSegmentation;
UnicodeWordIndices {
inner: s
.split_word_bound_indices()
.filter(|(_, c)| has_alphanumeric(c)),
}
}