Mirror of https://github.com/mozilla/gecko-dev.git

Bug 1716518 - Upgrade regex to v1.5.4 and regex-syntax to v0.6.25. r=emilio

This removes thread_local.

Differential Revision: https://phabricator.services.mozilla.com/D117843

Parent: 3a91162145
Commit: 8eadacb968
@@ -4178,21 +4178,20 @@ dependencies = [
 
 [[package]]
 name = "regex"
-version = "1.3.3"
+version = "1.5.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b5508c1941e4e7cb19965abef075d35a9a8b5cdf0846f30b4050e9b55dc55e87"
+checksum = "d07a8629359eb56f1e2fb1652bb04212c072a87ba68546a04065d525673ac461"
 dependencies = [
 "aho-corasick",
 "memchr",
 "regex-syntax",
-"thread_local",
 ]
 
 [[package]]
 name = "regex-syntax"
-version = "0.6.12"
+version = "0.6.25"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "11a7e20d1cce64ef2fed88b66d347f88bd9babb82845b2b858f3edbf59a4f716"
+checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b"
 
 [[package]]
 name = "remote"
@@ -5089,15 +5088,6 @@ dependencies = [
 "syn",
 ]
 
-[[package]]
-name = "thread_local"
-version = "1.0.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d40c6d1b69745a6ec6fb1ca717914848da4b44ae29d9b3080cbee91d72a69b14"
-dependencies = [
-"lazy_static",
-]
-
 [[package]]
 name = "threadbound"
 version = "0.1.0"
@@ -1 +1 @@
{"files":{"Cargo.toml":"ba410e4d856743cb87fa471d2ba2e3b14cd35aa816a04a213463fc9c6b9a2111","LICENSE-APACHE":"a60eea817514531668d7e00765731449fe14d059d3249e0bc93b36de45f759f2","LICENSE-MIT":"6485b8ed310d3f0340bf1ad1f47645069ce4069dcc6bb46c7d5c6faf41de1fdb","README.md":"7e9a134c72b09540e6f81f02133c5ae7d35067ea6fec44a94a1ebd20af47b151","benches/bench.rs":"f04160a876ee69bc9938bf51227513d6dbf3608643bc8ae422200f7ffc5ca85f","src/ast/mod.rs":"6eb24ba078c25ed59ceefa2d57e6c1b4c621d87d327335ea9a03049f0d4b5d44","src/ast/parse.rs":"a109a7d3ef793fb57277b24bd7791357879836231e322405a52348487433927b","src/ast/print.rs":"b075392a6d5b48713e09aa510199bb7380aca2cf09fa4bc0efb5c49782630dda","src/ast/visitor.rs":"fb1489ed5ce019091dde244acd8b027e391be442aef3a9033c785c81a4c251fb","src/either.rs":"1758e3edd056884eccadd995708d1e374ba9aa65846bd0e13b1aae852607c560","src/error.rs":"a6904d4081379b96853570e32e0aa82d84fc49d5059c48cf801566ff483bedb1","src/hir/interval.rs":"fcd0babe3bddbe411e04adff7f4d8855db1d6aaa7d8e2180bba819abad576736","src/hir/literal/mod.rs":"b0a01a3d7e524277ada88d5b58efcced498a53addfac69355fcc368c4c4dfb0f","src/hir/mod.rs":"73c4cbb48dead01bd03e2da54e1685b620be3fe4b062a9f5b76a93a3bb52236c","src/hir/print.rs":"1bd12a70e5876d85eb2188d83c4b71c9533dc6fff0c1ab5c2b0e4701de7e7a90","src/hir/translate.rs":"bdc82b7aa6e71b9a12092e5e136f4cb2f4965c13183001e7994724bf39eafa8f","src/hir/visitor.rs":"203dbe93e4a8cde395c6ff5a0eb98c9c3737bc5ea11fe7163f5e7bf1babc1f69","src/lib.rs":"db6fd6a65ea30a5b3b1b45c68c17f521302d3408bfe4cec77115913dd25ae072","src/parser.rs":"10cc145d79c275c7e19b8cc9078754f23fc1da9a2c3a2e56041a8616d5f85dea","src/unicode.rs":"873e817b3a8bff11b69eb5053b55f6d1a5f9357758d9e2aedfdc7d833b817c80","src/unicode_tables/LICENSE-UNICODE":"74db5baf44a41b1000312c673544b3374e4198af5605c7f9080a402cec42cfa3","src/unicode_tables/age.rs":"752194f2cb98c483cd98affcbface39431b8039645cc59e8f584a8dde34b34ff","src/unicode_tables/case_folding_simple.rs":"5f4fa71e8abdd01a711247b2c00b46cb4b12e0139b1abcee4be557d127e705fb","src/unicode_tables/general_category.rs":"59423c66260e21c505a901507d6bdd4288f1a1d76362bfae7d7478b943894fe5","src/unicode_tables/grapheme_cluster_break.rs":"d40127918f6015c46b6060c387a5fc2ee083f8d4c2e5ece5bff57ea1d6d031ef","src/unicode_tables/mod.rs":"26c837099cd934c8062e24bc9a0aaecf15fe1de03f9c6da3f3e1e5ac3ca24bee","src/unicode_tables/perl_decimal.rs":"f3ea734f43b123996f8a2c66d54c2b70ded9d333e2e8338bf895ef0f9ec7578e","src/unicode_tables/perl_space.rs":"3304ab6555e950198f9b1714c9a293c7ad80659c2389edb6b56df174a7d317e5","src/unicode_tables/perl_word.rs":"9b493901ebd3d80ed7b26389e9f2a244108ab7eb8a219418e19d5dc040ff52b1","src/unicode_tables/property_bool.rs":"2ae8df389456d0267cc7198420fdd16446f0a5bccda3a73a755a3763975695e2","src/unicode_tables/property_names.rs":"849ff2209af572ef3edeb8a84653098bd38c2497a06758a92ef798b7ffbfb4c8","src/unicode_tables/property_values.rs":"2af9239fbb3ec2458b17a7ed16f3a27a11ae574ee6c9366d3b6768e0560ba134","src/unicode_tables/script.rs":"3cb7442ee2460dce4ab7790801429408e55d8c3b19eac8b32560d693710a7533","src/unicode_tables/script_extension.rs":"71e9dd03f311945225540b2d984a17d224edf051069e31bf834a07382135bf7d","src/unicode_tables/sentence_break.rs":"314401cbbb1afb77b1b2ebcc0e44cb0e4cb7571469d288336d43812c4eeb3d90","src/unicode_tables/word_break.rs":"d03974a4b557670c9d0ac7a3e46175f036bcd258cd188e94af5798dea48cf082","src/utf8.rs":"68353f4303364d058426311893c786ea4b89076978abd11448e5bb4b8cc04a29","test":"9d0bc68616c249f317e783e5083102d2645a6ade3de735e8d8a414e97eaa76d0"},"package":"11a7e20d1cce64ef2
fed88b66d347f88bd9babb82845b2b858f3edbf59a4f716"}
{"files":{"Cargo.toml":"f15a235fff5192b488e6259ed785c77cdab87f77ce17de1c91c997c622379722","LICENSE-APACHE":"a60eea817514531668d7e00765731449fe14d059d3249e0bc93b36de45f759f2","LICENSE-MIT":"6485b8ed310d3f0340bf1ad1f47645069ce4069dcc6bb46c7d5c6faf41de1fdb","README.md":"7e9a134c72b09540e6f81f02133c5ae7d35067ea6fec44a94a1ebd20af47b151","benches/bench.rs":"d2b6ae5b939abd6093064f144b981b7739d7f474ec0698a1268052fc92406635","src/ast/mod.rs":"b0fe9af7ae15d9448246c204977634e5827bbae247bd59ab2e61411996fc68d6","src/ast/parse.rs":"ed3badf248937c81d280f3f3e7573264e3d3587300bcf959579c42c47518d929","src/ast/print.rs":"521d7abeec518f85fe47803347348ebf08364308ebfa614b5eb471c84af43670","src/ast/visitor.rs":"8ffcad13eb2c2a2f745f7bc8d823bd2f0bb728bd150f439455be5a245731f1d2","src/either.rs":"1758e3edd056884eccadd995708d1e374ba9aa65846bd0e13b1aae852607c560","src/error.rs":"cc99a11392b52f7665ff5ee8ea350f7386ed7c6c6bedd46e216b2f396785317f","src/hir/interval.rs":"2ffab258f204fe47bc5fe9ca84376fcd9ecb4929649f683a9412f2e382e908dc","src/hir/literal/mod.rs":"79aa42009de070058a6388e587bfaa98706f8dd61ee1dca70f23d440f5d8bb70","src/hir/mod.rs":"325dc1e42eb8fb9daeb7a8a5e7f967fdee745a7a7c5e26c20dec0b6c66109ad7","src/hir/print.rs":"ab45ccdb61e32561e246cb564414cd9d0477900bd07b0fba13ef02db8973d8b3","src/hir/translate.rs":"4c595d2faee09aecfdafe5871e7b5b698159d846e3262cf694e6e0a59e8e6a5f","src/hir/visitor.rs":"e5bf7f8c09f6155e59c9d676fe25437f7e3700f9bf5d91101d7e246a64c11d5a","src/lib.rs":"0fc94332a971691862ca17531881302b10ef6fa4aba65c123f0b69ffb14b989a","src/parser.rs":"e45755fcdcc8e5c40c4ecfab34962652fe46ad4f23d445f90885c3c36969c8f7","src/unicode.rs":"3b486b36e2ffcae306cb6d7387a82069163c7269597ff2b50589a05462464c36","src/unicode_tables/LICENSE-UNICODE":"74db5baf44a41b1000312c673544b3374e4198af5605c7f9080a402cec42cfa3","src/unicode_tables/age.rs":"b0932a020d3386478dd2f4839c59e30c525e8591735052b9e791e1ce3a2e2b72","src/unicode_tables/case_folding_simple.rs":"6d1f3d095132639228faf4806d05308c70ce2baa68cce69dca01ea159c4eaa15","src/unicode_tables/general_category.rs":"d21877600d387b8a0c5fbb0942458d0330c69aad6565e28134b8a1a371d2f4f4","src/unicode_tables/grapheme_cluster_break.rs":"f03a8be4a00265b568ca2a41d99f66a5d0f5fb423cb4113494153423a1123cda","src/unicode_tables/mod.rs":"26c837099cd934c8062e24bc9a0aaecf15fe1de03f9c6da3f3e1e5ac3ca24bee","src/unicode_tables/perl_decimal.rs":"e39a5934b504eb3282ccb26bbf50ecd764e720120eb7cf6c43662a2321665ab5","src/unicode_tables/perl_space.rs":"014e5d92b66730557e408c2d5c9b2f46d3d288aa85400ab9193c218c7b98ad21","src/unicode_tables/perl_word.rs":"ddf126f39171776ef83151d7a0dbc41da8dd09186723211fb966c4b304247a5e","src/unicode_tables/property_bool.rs":"21f72bd9f3955e3443549ef6609418817ae6df3c81fb5be90a0ceee9d7d3002d","src/unicode_tables/property_names.rs":"504ea44604cd15a7e827a89066bb81a847dd5c57cef360d9f4a914cf22afcf36","src/unicode_tables/property_values.rs":"4d793ad1b664c1913db146897c8eb4fa29d181b821f096de90dc889b738edb88","src/unicode_tables/script.rs":"5a7d2a958b93056081b8b2eb87c3a5609579ad791ad5b0c42959362ce6ea5b31","src/unicode_tables/script_extension.rs":"1d5f1985f7dcae833e78c3858231666b535bf60e032cfacc09d014c22bda6690","src/unicode_tables/sentence_break.rs":"cd5f0eb7ab6b0ec1c1fb4d78496dfecd691d0d0b76495538b9f376645a753deb","src/unicode_tables/word_break.rs":"eabeacfde7558cfe7b1556b0221f09c65f049de0b08c7cd464c1669040610a6b","src/utf8.rs":"f145b2cb0324e6a39260db685fdf2d88675dead54c5b808fb1b7f73a4b530d66","test":"8a9bd1bd9fb389e08288f951319a9bbb0d4c5284a2ba63cbdab7f6afa2c2f76e"},"package":"f497285884f3fcff4
24ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b"}
@@ -11,8 +11,9 @@
 # will likely look very different (and much more reasonable)
 
 [package]
+edition = "2018"
 name = "regex-syntax"
-version = "0.6.12"
+version = "0.6.25"
 authors = ["The Rust Project Developers"]
 description = "A regular expression parser."
 homepage = "https://github.com/rust-lang/regex"
@@ -1,6 +1,5 @@
 #![feature(test)]
 
-extern crate regex_syntax;
 extern crate test;
 
 use regex_syntax::Parser;
@@ -6,7 +6,7 @@ use std::cmp::Ordering;
 use std::error;
 use std::fmt;
 
-pub use ast::visitor::{visit, Visitor};
+pub use crate::ast::visitor::{visit, Visitor};
 
 pub mod parse;
 pub mod print;
@@ -156,6 +156,9 @@ pub enum ErrorKind {
 /// `(?i)*`. It is, however, possible to create a repetition operating on
 /// an empty sub-expression. For example, `()*` is still considered valid.
 RepetitionMissing,
+/// The Unicode class is not valid. This typically occurs when a `\p` is
+/// followed by something other than a `{`.
+UnicodeClassInvalid,
 /// When octal support is disabled, this error is produced when an octal
 /// escape is used. The octal escape is assumed to be an invocation of
 /// a backreference, which is the common case.
@@ -176,6 +179,8 @@ pub enum ErrorKind {
 }
 
 impl error::Error for Error {
+// TODO: Remove this method entirely on the next breaking semver release.
+#[allow(deprecated)]
 fn description(&self) -> &str {
 use self::ErrorKind::*;
 match self.kind {
@@ -206,6 +211,7 @@ impl error::Error for Error {
 RepetitionCountInvalid => "invalid repetition count range",
 RepetitionCountUnclosed => "unclosed counted repetition",
 RepetitionMissing => "repetition operator missing expression",
+UnicodeClassInvalid => "invalid Unicode character class",
 UnsupportedBackreference => "backreferences are not supported",
 UnsupportedLookAround => "look-around is not supported",
 _ => unreachable!(),
@@ -214,13 +220,13 @@ impl error::Error for Error {
 }
 
 impl fmt::Display for Error {
-fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-::error::Formatter::from(self).fmt(f)
+fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+crate::error::Formatter::from(self).fmt(f)
 }
 }
 
 impl fmt::Display for ErrorKind {
-fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
 use self::ErrorKind::*;
 match *self {
 CaptureLimitExceeded => write!(
@@ -293,6 +299,9 @@ impl fmt::Display for ErrorKind {
 RepetitionMissing => {
 write!(f, "repetition operator missing expression")
 }
+UnicodeClassInvalid => {
+write!(f, "invalid Unicode character class")
+}
 UnsupportedBackreference => {
 write!(f, "backreferences are not supported")
 }
@@ -319,7 +328,7 @@ pub struct Span {
 }
 
 impl fmt::Debug for Span {
-fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
 write!(f, "Span({:?}, {:?})", self.start, self.end)
 }
 }
@@ -352,7 +361,7 @@ pub struct Position {
 }
 
 impl fmt::Debug for Position {
-fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
 write!(
 f,
 "Position(o: {:?}, l: {:?}, c: {:?})",
@@ -533,8 +542,8 @@ impl Ast {
 /// This implementation uses constant stack space and heap space proportional
 /// to the size of the `Ast`.
 impl fmt::Display for Ast {
-fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-use ast::print::Printer;
+fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+use crate::ast::print::Printer;
 Printer::new().print(self, f)
 }
 }
@@ -7,10 +7,10 @@ use std::cell::{Cell, RefCell};
 use std::mem;
 use std::result;
 
-use ast::{self, Ast, Position, Span};
-use either::Either;
+use crate::ast::{self, Ast, Position, Span};
+use crate::either::Either;
 
-use is_meta_character;
+use crate::is_meta_character;
 
 type Result<T> = result::Result<T, ast::Error>;
 
@@ -58,10 +58,10 @@ impl Primitive {
 /// then return an error.
 fn into_class_set_item<P: Borrow<Parser>>(
 self,
-p: &ParserI<P>,
+p: &ParserI<'_, P>,
 ) -> Result<ast::ClassSetItem> {
 use self::Primitive::*;
-use ast::ClassSetItem;
+use crate::ast::ClassSetItem;
 
 match self {
 Literal(lit) => Ok(ClassSetItem::Literal(lit)),
@@ -79,7 +79,7 @@ impl Primitive {
 /// dot), then return an error.
 fn into_class_literal<P: Borrow<Parser>>(
 self,
-p: &ParserI<P>,
+p: &ParserI<'_, P>,
 ) -> Result<ast::Literal> {
 use self::Primitive::*;
 
@@ -98,12 +98,13 @@ fn is_hex(c: char) -> bool {
 /// Returns true if the given character is a valid in a capture group name.
 ///
 /// If `first` is true, then `c` is treated as the first character in the
-/// group name (which is not allowed to be a digit).
+/// group name (which must be alphabetic or underscore).
 fn is_capture_char(c: char, first: bool) -> bool {
 c == '_'
-|| (!first && c >= '0' && c <= '9')
-|| (c >= 'a' && c <= 'z')
-|| (c >= 'A' && c <= 'Z')
+|| (!first
+&& (('0' <= c && c <= '9') || c == '.' || c == '[' || c == ']'))
+|| ('A' <= c && c <= 'Z')
+|| ('a' <= c && c <= 'z')
 }
 
 /// A builder for a regular expression parser.
@@ -2095,6 +2096,12 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
 } else {
 let start = self.pos();
 let c = self.char();
+if c == '\\' {
+return Err(self.error(
+self.span_char(),
+ast::ErrorKind::UnicodeClassInvalid,
+));
+}
 self.bump_and_bump_space();
 let kind = ast::ClassUnicodeKind::OneLetter(c);
 (start, kind)
@@ -2130,7 +2137,7 @@ impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
 /// A type that traverses a fully parsed Ast and checks whether its depth
 /// exceeds the specified nesting limit. If it does, then an error is returned.
 #[derive(Debug)]
-struct NestLimiter<'p, 's: 'p, P: 'p + 's> {
+struct NestLimiter<'p, 's, P> {
 /// The parser that is checking the nest limit.
 p: &'p ParserI<'s, P>,
 /// The current depth while walking an Ast.
@@ -2305,7 +2312,7 @@ mod tests {
 use std::ops::Range;
 
 use super::{Parser, ParserBuilder, ParserI, Primitive};
-use ast::{self, Ast, Position, Span};
+use crate::ast::{self, Ast, Position, Span};
 
 // Our own assert_eq, which has slightly better formatting (but honestly
 // still kind of crappy).
@@ -2350,21 +2357,24 @@ mod tests {
 str.to_string()
 }
 
-fn parser(pattern: &str) -> ParserI<Parser> {
+fn parser(pattern: &str) -> ParserI<'_, Parser> {
 ParserI::new(Parser::new(), pattern)
 }
 
-fn parser_octal(pattern: &str) -> ParserI<Parser> {
+fn parser_octal(pattern: &str) -> ParserI<'_, Parser> {
 let parser = ParserBuilder::new().octal(true).build();
 ParserI::new(parser, pattern)
 }
 
-fn parser_nest_limit(pattern: &str, nest_limit: u32) -> ParserI<Parser> {
+fn parser_nest_limit(
+pattern: &str,
+nest_limit: u32,
+) -> ParserI<'_, Parser> {
 let p = ParserBuilder::new().nest_limit(nest_limit).build();
 ParserI::new(p, pattern)
 }
 
-fn parser_ignore_whitespace(pattern: &str) -> ParserI<Parser> {
+fn parser_ignore_whitespace(pattern: &str) -> ParserI<'_, Parser> {
 let p = ParserBuilder::new().ignore_whitespace(true).build();
 ParserI::new(p, pattern)
 }
@@ -3845,6 +3855,45 @@ bar
 }))
 );
 
+assert_eq!(
+parser("(?P<a_1>z)").parse(),
+Ok(Ast::Group(ast::Group {
+span: span(0..10),
+kind: ast::GroupKind::CaptureName(ast::CaptureName {
+span: span(4..7),
+name: s("a_1"),
+index: 1,
+}),
+ast: Box::new(lit('z', 8)),
+}))
+);
+
+assert_eq!(
+parser("(?P<a.1>z)").parse(),
+Ok(Ast::Group(ast::Group {
+span: span(0..10),
+kind: ast::GroupKind::CaptureName(ast::CaptureName {
+span: span(4..7),
+name: s("a.1"),
+index: 1,
+}),
+ast: Box::new(lit('z', 8)),
+}))
+);
+
+assert_eq!(
+parser("(?P<a[1]>z)").parse(),
+Ok(Ast::Group(ast::Group {
+span: span(0..11),
+kind: ast::GroupKind::CaptureName(ast::CaptureName {
+span: span(4..8),
+name: s("a[1]"),
+index: 1,
+}),
+ast: Box::new(lit('z', 9)),
+}))
+);
+
 assert_eq!(
 parser("(?P<").parse().unwrap_err(),
 TestError {
@@ -5713,6 +5762,20 @@ bar
 ],
 }))
 );
+assert_eq!(
+parser(r"\p\{").parse().unwrap_err(),
+TestError {
+span: span(2..3),
+kind: ast::ErrorKind::UnicodeClassInvalid,
+}
+);
+assert_eq!(
+parser(r"\P\{").parse().unwrap_err(),
+TestError {
+span: span(2..3),
+kind: ast::ErrorKind::UnicodeClassInvalid,
+}
+);
 }
 
 #[test]
@ -4,8 +4,8 @@ This module provides a regular expression printer for `Ast`.
|
|||
|
||||
use std::fmt;
|
||||
|
||||
use ast::visitor::{self, Visitor};
|
||||
use ast::{self, Ast};
|
||||
use crate::ast::visitor::{self, Visitor};
|
||||
use crate::ast::{self, Ast};
|
||||
|
||||
/// A builder for constructing a printer.
|
||||
///
|
||||
|
@ -86,7 +86,7 @@ impl<'p, W: fmt::Write> Visitor for Writer<'p, W> {
|
|||
}
|
||||
|
||||
fn visit_post(&mut self, ast: &Ast) -> fmt::Result {
|
||||
use ast::Class;
|
||||
use crate::ast::Class;
|
||||
|
||||
match *ast {
|
||||
Ast::Empty(_) => Ok(()),
|
||||
|
@ -126,7 +126,7 @@ impl<'p, W: fmt::Write> Visitor for Writer<'p, W> {
|
|||
&mut self,
|
||||
ast: &ast::ClassSetItem,
|
||||
) -> Result<(), Self::Err> {
|
||||
use ast::ClassSetItem::*;
|
||||
use crate::ast::ClassSetItem::*;
|
||||
|
||||
match *ast {
|
||||
Empty(_) => Ok(()),
|
||||
|
@ -155,7 +155,7 @@ impl<'p, W: fmt::Write> Visitor for Writer<'p, W> {
|
|||
|
||||
impl<'p, W: fmt::Write> Writer<'p, W> {
|
||||
fn fmt_group_pre(&mut self, ast: &ast::Group) -> fmt::Result {
|
||||
use ast::GroupKind::*;
|
||||
use crate::ast::GroupKind::*;
|
||||
match ast.kind {
|
||||
CaptureIndex(_) => self.wtr.write_str("("),
|
||||
CaptureName(ref x) => {
|
||||
|
@ -178,7 +178,7 @@ impl<'p, W: fmt::Write> Writer<'p, W> {
|
|||
}
|
||||
|
||||
fn fmt_repetition(&mut self, ast: &ast::Repetition) -> fmt::Result {
|
||||
use ast::RepetitionKind::*;
|
||||
use crate::ast::RepetitionKind::*;
|
||||
match ast.op.kind {
|
||||
ZeroOrOne if ast.greedy => self.wtr.write_str("?"),
|
||||
ZeroOrOne => self.wtr.write_str("??"),
|
||||
|
@ -200,7 +200,7 @@ impl<'p, W: fmt::Write> Writer<'p, W> {
|
|||
&mut self,
|
||||
ast: &ast::RepetitionRange,
|
||||
) -> fmt::Result {
|
||||
use ast::RepetitionRange::*;
|
||||
use crate::ast::RepetitionRange::*;
|
||||
match *ast {
|
||||
Exactly(x) => write!(self.wtr, "{{{}}}", x),
|
||||
AtLeast(x) => write!(self.wtr, "{{{},}}", x),
|
||||
|
@ -209,7 +209,7 @@ impl<'p, W: fmt::Write> Writer<'p, W> {
|
|||
}
|
||||
|
||||
fn fmt_literal(&mut self, ast: &ast::Literal) -> fmt::Result {
|
||||
use ast::LiteralKind::*;
|
||||
use crate::ast::LiteralKind::*;
|
||||
|
||||
match ast.kind {
|
||||
Verbatim => self.wtr.write_char(ast.c),
|
||||
|
@ -256,7 +256,7 @@ impl<'p, W: fmt::Write> Writer<'p, W> {
|
|||
}
|
||||
|
||||
fn fmt_assertion(&mut self, ast: &ast::Assertion) -> fmt::Result {
|
||||
use ast::AssertionKind::*;
|
||||
use crate::ast::AssertionKind::*;
|
||||
match ast.kind {
|
||||
StartLine => self.wtr.write_str("^"),
|
||||
EndLine => self.wtr.write_str("$"),
|
||||
|
@ -275,7 +275,7 @@ impl<'p, W: fmt::Write> Writer<'p, W> {
|
|||
}
|
||||
|
||||
fn fmt_flags(&mut self, ast: &ast::Flags) -> fmt::Result {
|
||||
use ast::{Flag, FlagsItemKind};
|
||||
use crate::ast::{Flag, FlagsItemKind};
|
||||
|
||||
for item in &ast.items {
|
||||
match item.kind {
|
||||
|
@ -315,7 +315,7 @@ impl<'p, W: fmt::Write> Writer<'p, W> {
|
|||
&mut self,
|
||||
ast: &ast::ClassSetBinaryOpKind,
|
||||
) -> fmt::Result {
|
||||
use ast::ClassSetBinaryOpKind::*;
|
||||
use crate::ast::ClassSetBinaryOpKind::*;
|
||||
match *ast {
|
||||
Intersection => self.wtr.write_str("&&"),
|
||||
Difference => self.wtr.write_str("--"),
|
||||
|
@ -324,7 +324,7 @@ impl<'p, W: fmt::Write> Writer<'p, W> {
|
|||
}
|
||||
|
||||
fn fmt_class_perl(&mut self, ast: &ast::ClassPerl) -> fmt::Result {
|
||||
use ast::ClassPerlKind::*;
|
||||
use crate::ast::ClassPerlKind::*;
|
||||
match ast.kind {
|
||||
Digit if ast.negated => self.wtr.write_str(r"\D"),
|
||||
Digit => self.wtr.write_str(r"\d"),
|
||||
|
@ -336,7 +336,7 @@ impl<'p, W: fmt::Write> Writer<'p, W> {
|
|||
}
|
||||
|
||||
fn fmt_class_ascii(&mut self, ast: &ast::ClassAscii) -> fmt::Result {
|
||||
use ast::ClassAsciiKind::*;
|
||||
use crate::ast::ClassAsciiKind::*;
|
||||
match ast.kind {
|
||||
Alnum if ast.negated => self.wtr.write_str("[:^alnum:]"),
|
||||
Alnum => self.wtr.write_str("[:alnum:]"),
|
||||
|
@ -370,8 +370,8 @@ impl<'p, W: fmt::Write> Writer<'p, W> {
|
|||
}
|
||||
|
||||
fn fmt_class_unicode(&mut self, ast: &ast::ClassUnicode) -> fmt::Result {
|
||||
use ast::ClassUnicodeKind::*;
|
||||
use ast::ClassUnicodeOpKind::*;
|
||||
use crate::ast::ClassUnicodeKind::*;
|
||||
use crate::ast::ClassUnicodeOpKind::*;
|
||||
|
||||
if ast.negated {
|
||||
self.wtr.write_str(r"\P")?;
|
||||
|
@ -397,7 +397,7 @@ impl<'p, W: fmt::Write> Writer<'p, W> {
|
|||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::Printer;
|
||||
use ast::parse::ParserBuilder;
|
||||
use crate::ast::parse::ParserBuilder;
|
||||
|
||||
fn roundtrip(given: &str) {
|
||||
roundtrip_with(|b| b, given);
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
use std::fmt;
|
||||
|
||||
use ast::{self, Ast};
|
||||
use crate::ast::{self, Ast};
|
||||
|
||||
/// A trait for visiting an abstract syntax tree (AST) in depth first order.
|
||||
///
|
||||
|
@ -478,7 +478,7 @@ impl<'a> ClassInduct<'a> {
|
|||
}
|
||||
|
||||
impl<'a> fmt::Debug for ClassFrame<'a> {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
let x = match *self {
|
||||
ClassFrame::Union { .. } => "Union",
|
||||
ClassFrame::Binary { .. } => "Binary",
|
||||
|
@ -490,7 +490,7 @@ impl<'a> fmt::Debug for ClassFrame<'a> {
|
|||
}
|
||||
|
||||
impl<'a> fmt::Debug for ClassInduct<'a> {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
let x = match *self {
|
||||
ClassInduct::Item(it) => match *it {
|
||||
ast::ClassSetItem::Empty(_) => "Item(Empty)",
|
||||
|
|
|
@ -3,8 +3,8 @@ use std::error;
|
|||
use std::fmt;
|
||||
use std::result;
|
||||
|
||||
use ast;
|
||||
use hir;
|
||||
use crate::ast;
|
||||
use crate::hir;
|
||||
|
||||
/// A type alias for dealing with errors returned by this crate.
|
||||
pub type Result<T> = result::Result<T, Error>;
|
||||
|
@ -40,6 +40,8 @@ impl From<hir::Error> for Error {
|
|||
}
|
||||
|
||||
impl error::Error for Error {
|
||||
// TODO: Remove this method entirely on the next breaking semver release.
|
||||
#[allow(deprecated)]
|
||||
fn description(&self) -> &str {
|
||||
match *self {
|
||||
Error::Parse(ref x) => x.description(),
|
||||
|
@ -50,7 +52,7 @@ impl error::Error for Error {
|
|||
}
|
||||
|
||||
impl fmt::Display for Error {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match *self {
|
||||
Error::Parse(ref x) => x.fmt(f),
|
||||
Error::Translate(ref x) => x.fmt(f),
|
||||
|
@ -65,7 +67,7 @@ impl fmt::Display for Error {
|
|||
/// readable format. Most of its complexity is from interspersing notational
|
||||
/// markers pointing out the position where an error occurred.
|
||||
#[derive(Debug)]
|
||||
pub struct Formatter<'e, E: 'e> {
|
||||
pub struct Formatter<'e, E> {
|
||||
/// The original regex pattern in which the error occurred.
|
||||
pattern: &'e str,
|
||||
/// The error kind. It must impl fmt::Display.
|
||||
|
@ -100,7 +102,7 @@ impl<'e> From<&'e hir::Error> for Formatter<'e, hir::ErrorKind> {
|
|||
}
|
||||
|
||||
impl<'e, E: fmt::Display> fmt::Display for Formatter<'e, E> {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
let spans = Spans::from_formatter(self);
|
||||
if self.pattern.contains('\n') {
|
||||
let divider = repeat_char('~', 79);
|
||||
|
@ -284,7 +286,7 @@ fn repeat_char(c: char, count: usize) -> String {
|
|||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use ast::parse::Parser;
|
||||
use crate::ast::parse::Parser;
|
||||
|
||||
fn assert_panic_message(pattern: &str, expected_msg: &str) -> () {
|
||||
let result = Parser::new().parse(pattern);
|
||||
|
|
|
@ -4,7 +4,7 @@ use std::fmt::Debug;
|
|||
use std::slice;
|
||||
use std::u8;
|
||||
|
||||
use unicode;
|
||||
use crate::unicode;
|
||||
|
||||
// This module contains an *internal* implementation of interval sets.
|
||||
//
|
||||
|
@ -60,7 +60,7 @@ impl<I: Interval> IntervalSet<I> {
|
|||
/// Return an iterator over all intervals in this set.
|
||||
///
|
||||
/// The iterator yields intervals in ascending order.
|
||||
pub fn iter(&self) -> IntervalSetIter<I> {
|
||||
pub fn iter(&self) -> IntervalSetIter<'_, I> {
|
||||
IntervalSetIter(self.ranges.iter())
|
||||
}
|
||||
|
||||
|
@ -322,7 +322,7 @@ impl<I: Interval> IntervalSet<I> {
|
|||
|
||||
/// An iterator over intervals.
|
||||
#[derive(Debug)]
|
||||
pub struct IntervalSetIter<'a, I: 'a>(slice::Iter<'a, I>);
|
||||
pub struct IntervalSetIter<'a, I>(slice::Iter<'a, I>);
|
||||
|
||||
impl<'a, I> Iterator for IntervalSetIter<'a, I> {
|
||||
type Item = &'a I;
|
||||
|
|
|
@ -8,7 +8,7 @@ use std::iter;
|
|||
use std::mem;
|
||||
use std::ops;
|
||||
|
||||
use hir::{self, Hir, HirKind};
|
||||
use crate::hir::{self, Hir, HirKind};
|
||||
|
||||
/// A set of literal byte strings extracted from a regular expression.
|
||||
///
|
||||
|
@ -838,7 +838,7 @@ fn alternate_literals<F: FnMut(&Hir, &mut Literals)>(
|
|||
}
|
||||
|
||||
impl fmt::Debug for Literals {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
f.debug_struct("Literals")
|
||||
.field("lits", &self.lits)
|
||||
.field("limit_size", &self.limit_size)
|
||||
|
@ -882,7 +882,7 @@ impl PartialOrd for Literal {
|
|||
}
|
||||
|
||||
impl fmt::Debug for Literal {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
if self.is_cut() {
|
||||
write!(f, "Cut({})", escape_unicode(&self.v))
|
||||
} else {
|
||||
|
@ -977,8 +977,8 @@ mod tests {
|
|||
use std::fmt;
|
||||
|
||||
use super::{escape_bytes, Literal, Literals};
|
||||
use hir::Hir;
|
||||
use ParserBuilder;
|
||||
use crate::hir::Hir;
|
||||
use crate::ParserBuilder;
|
||||
|
||||
// To make test failures easier to read.
|
||||
#[derive(Debug, Eq, PartialEq)]
|
||||
|
@ -1017,7 +1017,7 @@ mod tests {
|
|||
}
|
||||
|
||||
impl fmt::Debug for ULiteral {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
if self.is_cut() {
|
||||
write!(f, "Cut({})", self.v)
|
||||
} else {
|
||||
|
|
|
@ -8,12 +8,12 @@ use std::fmt;
|
|||
use std::result;
|
||||
use std::u8;
|
||||
|
||||
use ast::Span;
|
||||
use hir::interval::{Interval, IntervalSet, IntervalSetIter};
|
||||
use unicode;
|
||||
use crate::ast::Span;
|
||||
use crate::hir::interval::{Interval, IntervalSet, IntervalSetIter};
|
||||
use crate::unicode;
|
||||
|
||||
pub use hir::visitor::{visit, Visitor};
|
||||
pub use unicode::CaseFoldError;
|
||||
pub use crate::hir::visitor::{visit, Visitor};
|
||||
pub use crate::unicode::CaseFoldError;
|
||||
|
||||
mod interval;
|
||||
pub mod literal;
|
||||
|
@ -91,6 +91,8 @@ pub enum ErrorKind {
|
|||
}
|
||||
|
||||
impl ErrorKind {
|
||||
// TODO: Remove this method entirely on the next breaking semver release.
|
||||
#[allow(deprecated)]
|
||||
fn description(&self) -> &str {
|
||||
use self::ErrorKind::*;
|
||||
match *self {
|
||||
|
@ -113,19 +115,23 @@ impl ErrorKind {
|
|||
}
|
||||
|
||||
impl error::Error for Error {
|
||||
// TODO: Remove this method entirely on the next breaking semver release.
|
||||
#[allow(deprecated)]
|
||||
fn description(&self) -> &str {
|
||||
self.kind.description()
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for Error {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
::error::Formatter::from(self).fmt(f)
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
crate::error::Formatter::from(self).fmt(f)
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for ErrorKind {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
// TODO: Remove this on the next breaking semver release.
|
||||
#[allow(deprecated)]
|
||||
f.write_str(self.description())
|
||||
}
|
||||
}
|
||||
|
@ -235,8 +241,8 @@ impl Hir {
|
|||
info.set_any_anchored_start(false);
|
||||
info.set_any_anchored_end(false);
|
||||
info.set_match_empty(true);
|
||||
info.set_literal(true);
|
||||
info.set_alternation_literal(true);
|
||||
info.set_literal(false);
|
||||
info.set_alternation_literal(false);
|
||||
Hir { kind: HirKind::Empty, info: info }
|
||||
}
|
||||
|
||||
|
@ -665,8 +671,8 @@ impl Hir {
|
|||
/// true when this HIR expression is either itself a `Literal` or a
|
||||
/// concatenation of only `Literal`s.
|
||||
///
|
||||
/// For example, `f` and `foo` are literals, but `f+`, `(foo)`, `foo()`
|
||||
/// are not (even though that contain sub-expressions that are literals).
|
||||
/// For example, `f` and `foo` are literals, but `f+`, `(foo)`, `foo()`,
|
||||
/// `` are not (even though that contain sub-expressions that are literals).
|
||||
pub fn is_literal(&self) -> bool {
|
||||
self.info.is_literal()
|
||||
}
|
||||
|
@ -676,8 +682,8 @@ impl Hir {
|
|||
/// true when this HIR expression is either itself a `Literal` or a
|
||||
/// concatenation of only `Literal`s or an alternation of only `Literal`s.
|
||||
///
|
||||
/// For example, `f`, `foo`, `a|b|c`, and `foo|bar|baz` are alternaiton
|
||||
/// literals, but `f+`, `(foo)`, `foo()`
|
||||
/// For example, `f`, `foo`, `a|b|c`, and `foo|bar|baz` are alternation
|
||||
/// literals, but `f+`, `(foo)`, `foo()`, ``
|
||||
/// are not (even though that contain sub-expressions that are literals).
|
||||
pub fn is_alternation_literal(&self) -> bool {
|
||||
self.info.is_alternation_literal()
|
||||
|
@ -721,8 +727,8 @@ impl HirKind {
|
|||
/// This implementation uses constant stack space and heap space proportional
|
||||
/// to the size of the `Hir`.
|
||||
impl fmt::Display for Hir {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
use hir::print::Printer;
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
use crate::hir::print::Printer;
|
||||
Printer::new().print(self, f)
|
||||
}
|
||||
}
|
||||
|
@ -853,7 +859,7 @@ impl ClassUnicode {
|
|||
/// Return an iterator over all ranges in this class.
|
||||
///
|
||||
/// The iterator yields ranges in ascending order.
|
||||
pub fn iter(&self) -> ClassUnicodeIter {
|
||||
pub fn iter(&self) -> ClassUnicodeIter<'_> {
|
||||
ClassUnicodeIter(self.set.iter())
|
||||
}
|
||||
|
||||
|
@ -886,14 +892,11 @@ impl ClassUnicode {
|
|||
/// this class consists of the range `a-z`, then applying case folding will
|
||||
/// result in the class containing both the ranges `a-z` and `A-Z`.
|
||||
///
|
||||
/// # Panics
|
||||
/// # Error
|
||||
///
|
||||
/// This routine panics when the case mapping data necessary for this
|
||||
/// routine to complete is unavailable. This occurs when the `unicode-case`
|
||||
/// feature is not enabled.
|
||||
///
|
||||
/// Callers should prefer using `try_case_fold_simple` instead, which will
|
||||
/// return an error instead of panicking.
|
||||
/// This routine returns an error when the case mapping data necessary
|
||||
/// for this routine to complete is unavailable. This occurs when the
|
||||
/// `unicode-case` feature is not enabled.
|
||||
pub fn try_case_fold_simple(
|
||||
&mut self,
|
||||
) -> result::Result<(), CaseFoldError> {
|
||||
|
@ -935,6 +938,13 @@ impl ClassUnicode {
|
|||
pub fn symmetric_difference(&mut self, other: &ClassUnicode) {
|
||||
self.set.symmetric_difference(&other.set);
|
||||
}
|
||||
|
||||
/// Returns true if and only if this character class will either match
|
||||
/// nothing or only ASCII bytes. Stated differently, this returns false
|
||||
/// if and only if this class contains a non-ASCII codepoint.
|
||||
pub fn is_all_ascii(&self) -> bool {
|
||||
self.set.intervals().last().map_or(true, |r| r.end <= '\x7F')
|
||||
}
|
||||
}
|
||||
|
||||
/// An iterator over all ranges in a Unicode character class.
|
||||
|
@ -962,7 +972,7 @@ pub struct ClassUnicodeRange {
|
|||
}
|
||||
|
||||
impl fmt::Debug for ClassUnicodeRange {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
let start = if !self.start.is_whitespace() && !self.start.is_control()
|
||||
{
|
||||
self.start.to_string()
|
||||
|
@ -1092,7 +1102,7 @@ impl ClassBytes {
|
|||
/// Return an iterator over all ranges in this class.
|
||||
///
|
||||
/// The iterator yields ranges in ascending order.
|
||||
pub fn iter(&self) -> ClassBytesIter {
|
||||
pub fn iter(&self) -> ClassBytesIter<'_> {
|
||||
ClassBytesIter(self.set.iter())
|
||||
}
|
||||
|
||||
|
@ -1248,7 +1258,7 @@ impl ClassBytesRange {
|
|||
}
|
||||
|
||||
impl fmt::Debug for ClassBytesRange {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
let mut debug = f.debug_struct("ClassBytesRange");
|
||||
if self.start <= 0x7F {
|
||||
debug.field("start", &(self.start as char));
|
||||
|
@ -1486,7 +1496,7 @@ macro_rules! define_bool {
|
|||
self.bools &= !(1 << $bit);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
impl HirInfo {
|
||||
|
|
|
@ -4,9 +4,9 @@ This module provides a regular expression printer for `Hir`.
|
|||
|
||||
use std::fmt;
|
||||
|
||||
use hir::visitor::{self, Visitor};
|
||||
use hir::{self, Hir, HirKind};
|
||||
use is_meta_character;
|
||||
use crate::hir::visitor::{self, Visitor};
|
||||
use crate::hir::{self, Hir, HirKind};
|
||||
use crate::is_meta_character;
|
||||
|
||||
/// A builder for constructing a printer.
|
||||
///
|
||||
|
@ -239,7 +239,7 @@ impl<'p, W: fmt::Write> Writer<'p, W> {
|
|||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::Printer;
|
||||
use ParserBuilder;
|
||||
use crate::ParserBuilder;
|
||||
|
||||
fn roundtrip(given: &str, expected: &str) {
|
||||
roundtrip_with(|b| b, given, expected);
|
||||
|
|
|
@ -5,9 +5,9 @@ Defines a translator that converts an `Ast` to an `Hir`.
|
|||
use std::cell::{Cell, RefCell};
|
||||
use std::result;
|
||||
|
||||
use ast::{self, Ast, Span, Visitor};
|
||||
use hir::{self, Error, ErrorKind, Hir};
|
||||
use unicode::{self, ClassQuery};
|
||||
use crate::ast::{self, Ast, Span, Visitor};
|
||||
use crate::hir::{self, Error, ErrorKind, Hir};
|
||||
use crate::unicode::{self, ClassQuery};
|
||||
|
||||
type Result<T> = result::Result<T, Error>;
|
||||
|
||||
|
@ -159,18 +159,19 @@ enum HirFrame {
|
|||
/// indicated by parentheses (including non-capturing groups). It is popped
|
||||
/// upon leaving a group.
|
||||
Group {
|
||||
/// The old active flags, if any, when this group was opened.
|
||||
/// The old active flags when this group was opened.
|
||||
///
|
||||
/// If this group sets flags, then the new active flags are set to the
|
||||
/// result of merging the old flags with the flags introduced by this
|
||||
/// group.
|
||||
/// group. If the group doesn't set any flags, then this is simply
|
||||
/// equivalent to whatever flags were set when the group was opened.
|
||||
///
|
||||
/// When this group is popped, the active flags should be restored to
|
||||
/// the flags set here.
|
||||
///
|
||||
/// The "active" flags correspond to whatever flags are set in the
|
||||
/// Translator.
|
||||
old_flags: Option<Flags>,
|
||||
old_flags: Flags,
|
||||
},
|
||||
/// This is pushed whenever a concatenation is observed. After visiting
|
||||
/// every sub-expression in the concatenation, the translator's stack is
|
||||
|
@ -219,8 +220,8 @@ impl HirFrame {
|
|||
|
||||
/// Assert that the current stack frame is a group indicator and return
|
||||
/// its corresponding flags (the flags that were active at the time the
|
||||
/// group was entered) if they exist.
|
||||
fn unwrap_group(self) -> Option<Flags> {
|
||||
/// group was entered).
|
||||
fn unwrap_group(self) -> Flags {
|
||||
match self {
|
||||
HirFrame::Group { old_flags } => old_flags,
|
||||
_ => {
|
||||
|
@ -252,8 +253,11 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
|
|||
}
|
||||
}
|
||||
Ast::Group(ref x) => {
|
||||
let old_flags = x.flags().map(|ast| self.set_flags(ast));
|
||||
self.push(HirFrame::Group { old_flags: old_flags });
|
||||
let old_flags = x
|
||||
.flags()
|
||||
.map(|ast| self.set_flags(ast))
|
||||
.unwrap_or_else(|| self.flags());
|
||||
self.push(HirFrame::Group { old_flags });
|
||||
}
|
||||
Ast::Concat(ref x) if x.asts.is_empty() => {}
|
||||
Ast::Concat(_) => {
|
||||
|
@ -318,7 +322,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
|
|||
ast.negated,
|
||||
&mut cls,
|
||||
)?;
|
||||
if cls.iter().next().is_none() {
|
||||
if cls.ranges().is_empty() {
|
||||
return Err(self.error(
|
||||
ast.span,
|
||||
ErrorKind::EmptyClassNotAllowed,
|
||||
|
@ -333,7 +337,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
|
|||
ast.negated,
|
||||
&mut cls,
|
||||
)?;
|
||||
if cls.iter().next().is_none() {
|
||||
if cls.ranges().is_empty() {
|
||||
return Err(self.error(
|
||||
ast.span,
|
||||
ErrorKind::EmptyClassNotAllowed,
|
||||
|
@ -350,9 +354,8 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
|
|||
}
|
||||
Ast::Group(ref x) => {
|
||||
let expr = self.pop().unwrap().unwrap_expr();
|
||||
if let Some(flags) = self.pop().unwrap().unwrap_group() {
|
||||
self.trans().flags.set(flags);
|
||||
}
|
||||
let old_flags = self.pop().unwrap().unwrap_group();
|
||||
self.trans().flags.set(old_flags);
|
||||
self.push(HirFrame::Expr(self.hir_group(x, expr)));
|
||||
}
|
||||
Ast::Concat(_) => {
|
||||
|
@ -530,7 +533,7 @@ impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
|
|||
&mut self,
|
||||
op: &ast::ClassSetBinaryOp,
|
||||
) -> Result<()> {
|
||||
use ast::ClassSetBinaryOpKind::*;
|
||||
use crate::ast::ClassSetBinaryOpKind::*;
|
||||
|
||||
if self.flags().unicode() {
|
||||
let mut rhs = self.pop().unwrap().unwrap_class_unicode();
|
||||
|
@ -816,7 +819,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
|
|||
&self,
|
||||
ast_class: &ast::ClassUnicode,
|
||||
) -> Result<hir::ClassUnicode> {
|
||||
use ast::ClassUnicodeKind::*;
|
||||
use crate::ast::ClassUnicodeKind::*;
|
||||
|
||||
if !self.flags().unicode() {
|
||||
return Err(
|
||||
|
@ -841,6 +844,11 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
|
|||
ast_class.negated,
|
||||
class,
|
||||
)?;
|
||||
if class.ranges().is_empty() {
|
||||
let err = self
|
||||
.error(ast_class.span, ErrorKind::EmptyClassNotAllowed);
|
||||
return Err(err);
|
||||
}
|
||||
}
|
||||
result
|
||||
}
|
||||
|
@ -849,7 +857,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
|
|||
&self,
|
||||
ast_class: &ast::ClassPerl,
|
||||
) -> Result<hir::ClassUnicode> {
|
||||
use ast::ClassPerlKind::*;
|
||||
use crate::ast::ClassPerlKind::*;
|
||||
|
||||
assert!(self.flags().unicode());
|
||||
let result = match ast_class.kind {
|
||||
|
@ -871,7 +879,7 @@ impl<'t, 'p> TranslatorI<'t, 'p> {
|
|||
&self,
|
||||
ast_class: &ast::ClassPerl,
|
||||
) -> hir::ClassBytes {
|
||||
use ast::ClassPerlKind::*;
|
||||
use crate::ast::ClassPerlKind::*;
|
||||
|
||||
assert!(!self.flags().unicode());
|
||||
let mut class = match ast_class.kind {
|
||||
|
@ -1069,7 +1077,7 @@ fn hir_ascii_class_bytes(kind: &ast::ClassAsciiKind) -> hir::ClassBytes {
|
|||
}
|
||||
|
||||
fn ascii_class(kind: &ast::ClassAsciiKind) -> &'static [(char, char)] {
|
||||
use ast::ClassAsciiKind::*;
|
||||
use crate::ast::ClassAsciiKind::*;
|
||||
match *kind {
|
||||
Alnum => &[('0', '9'), ('A', 'Z'), ('a', 'z')],
|
||||
Alpha => &[('A', 'Z'), ('a', 'z')],
|
||||
|
@ -1097,10 +1105,10 @@ fn ascii_class(kind: &ast::ClassAsciiKind) -> &'static [(char, char)] {
|
|||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use ast::parse::ParserBuilder;
|
||||
use ast::{self, Ast, Position, Span};
|
||||
use hir::{self, Hir, HirKind};
|
||||
use unicode::{self, ClassQuery};
|
||||
use crate::ast::parse::ParserBuilder;
|
||||
use crate::ast::{self, Ast, Position, Span};
|
||||
use crate::hir::{self, Hir, HirKind};
|
||||
use crate::unicode::{self, ClassQuery};
|
||||
|
||||
use super::{ascii_class, TranslatorBuilder};
|
||||
|
||||
|
@ -1248,7 +1256,7 @@ mod tests {
|
|||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
fn hir_uclass_query(query: ClassQuery) -> Hir {
|
||||
fn hir_uclass_query(query: ClassQuery<'_>) -> Hir {
|
||||
Hir::class(hir::Class::Unicode(unicode::class(query).unwrap()))
|
||||
}
|
||||
|
||||
|
@ -1307,7 +1315,7 @@ mod tests {
|
|||
|
||||
#[allow(dead_code)]
|
||||
fn hir_union(expr1: Hir, expr2: Hir) -> Hir {
|
||||
use hir::Class::{Bytes, Unicode};
|
||||
use crate::hir::Class::{Bytes, Unicode};
|
||||
|
||||
match (expr1.into_kind(), expr2.into_kind()) {
|
||||
(HirKind::Class(Unicode(mut c1)), HirKind::Class(Unicode(c2))) => {
|
||||
|
@ -1324,7 +1332,7 @@ mod tests {
|
|||
|
||||
#[allow(dead_code)]
|
||||
fn hir_difference(expr1: Hir, expr2: Hir) -> Hir {
|
||||
use hir::Class::{Bytes, Unicode};
|
||||
use crate::hir::Class::{Bytes, Unicode};
|
||||
|
||||
match (expr1.into_kind(), expr2.into_kind()) {
|
||||
(HirKind::Class(Unicode(mut c1)), HirKind::Class(Unicode(c2))) => {
|
||||
|
@ -1641,6 +1649,20 @@ mod tests {
|
|||
hir_lit("β"),
|
||||
])
|
||||
);
|
||||
assert_eq!(
|
||||
t("(?:(?i-u)a)b"),
|
||||
hir_cat(vec![
|
||||
hir_group_nocap(hir_bclass(&[(b'A', b'A'), (b'a', b'a')])),
|
||||
hir_lit("b"),
|
||||
])
|
||||
);
|
||||
assert_eq!(
|
||||
t("((?i-u)a)b"),
|
||||
hir_cat(vec![
|
||||
hir_group(1, hir_bclass(&[(b'A', b'A'), (b'a', b'a')])),
|
||||
hir_lit("b"),
|
||||
])
|
||||
);
|
||||
#[cfg(feature = "unicode-case")]
|
||||
assert_eq!(
|
||||
t("(?i)(?-i:a)a"),
|
||||
|
@ -2300,6 +2322,21 @@ mod tests {
|
|||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[cfg(feature = "unicode-gencat")]
|
||||
fn class_unicode_any_empty() {
|
||||
assert_eq!(
|
||||
t_err(r"\P{any}"),
|
||||
TestError {
|
||||
kind: hir::ErrorKind::EmptyClassNotAllowed,
|
||||
span: Span::new(
|
||||
Position::new(0, 1, 1),
|
||||
Position::new(7, 1, 8)
|
||||
),
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[cfg(not(feature = "unicode-age"))]
|
||||
fn class_unicode_age_disabled() {
|
||||
|
@ -3088,13 +3125,13 @@ mod tests {
|
|||
#[test]
|
||||
fn analysis_is_literal() {
|
||||
// Positive examples.
|
||||
assert!(t(r"").is_literal());
|
||||
assert!(t(r"a").is_literal());
|
||||
assert!(t(r"ab").is_literal());
|
||||
assert!(t(r"abc").is_literal());
|
||||
assert!(t(r"(?m)abc").is_literal());
|
||||
|
||||
// Negative examples.
|
||||
assert!(!t(r"").is_literal());
|
||||
assert!(!t(r"^").is_literal());
|
||||
assert!(!t(r"a|b").is_literal());
|
||||
assert!(!t(r"(a)").is_literal());
|
||||
|
@ -3107,7 +3144,6 @@ mod tests {
|
|||
#[test]
|
||||
fn analysis_is_alternation_literal() {
|
||||
// Positive examples.
|
||||
assert!(t(r"").is_alternation_literal());
|
||||
assert!(t(r"a").is_alternation_literal());
|
||||
assert!(t(r"ab").is_alternation_literal());
|
||||
assert!(t(r"abc").is_alternation_literal());
|
||||
|
@ -3118,6 +3154,7 @@ mod tests {
|
|||
assert!(t(r"foo|bar|baz").is_alternation_literal());
|
||||
|
||||
// Negative examples.
|
||||
assert!(!t(r"").is_alternation_literal());
|
||||
assert!(!t(r"^").is_alternation_literal());
|
||||
assert!(!t(r"(a)").is_alternation_literal());
|
||||
assert!(!t(r"a+").is_alternation_literal());
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
use hir::{self, Hir, HirKind};
|
||||
use crate::hir::{self, Hir, HirKind};
|
||||
|
||||
/// A trait for visiting the high-level IR (HIR) in depth first order.
|
||||
///
|
||||
|
|
|
@ -155,11 +155,12 @@ The following features are available:
|
|||
*/
|
||||
|
||||
#![deny(missing_docs)]
|
||||
#![warn(missing_debug_implementations)]
|
||||
#![forbid(unsafe_code)]
|
||||
|
||||
pub use error::{Error, Result};
|
||||
pub use parser::{Parser, ParserBuilder};
|
||||
pub use unicode::UnicodeWordError;
|
||||
pub use crate::error::{Error, Result};
|
||||
pub use crate::parser::{Parser, ParserBuilder};
|
||||
pub use crate::unicode::UnicodeWordError;
|
||||
|
||||
pub mod ast;
|
||||
mod either;
|
||||
|
@ -175,7 +176,7 @@ pub mod utf8;
|
|||
/// The string returned may be safely used as a literal in a regular
|
||||
/// expression.
|
||||
pub fn escape(text: &str) -> String {
|
||||
let mut quoted = String::with_capacity(text.len());
|
||||
let mut quoted = String::new();
|
||||
escape_into(text, &mut quoted);
|
||||
quoted
|
||||
}
|
||||
|
@ -185,6 +186,7 @@ pub fn escape(text: &str) -> String {
|
|||
/// This will append escape characters into the given buffer. The characters
|
||||
/// that are appended are safe to use as a literal in a regular expression.
|
||||
pub fn escape_into(text: &str, buf: &mut String) {
|
||||
buf.reserve(text.len());
|
||||
for c in text.chars() {
|
||||
if is_meta_character(c) {
|
||||
buf.push('\\');
|
||||
|
@ -197,7 +199,7 @@ pub fn escape_into(text: &str, buf: &mut String) {
|
|||
///
|
||||
/// These are the only characters that are allowed to be escaped, with one
|
||||
/// exception: an ASCII space character may be escaped when extended mode (with
|
||||
/// the `x` flag) is enabld. In particular, `is_meta_character(' ')` returns
|
||||
/// the `x` flag) is enabled. In particular, `is_meta_character(' ')` returns
|
||||
/// `false`.
|
||||
///
|
||||
/// Note that the set of characters for which this function returns `true` or
|
||||
|
@ -214,7 +216,7 @@ pub fn is_meta_character(c: char) -> bool {
|
|||
/// character.
|
||||
///
|
||||
/// A Unicode word character is defined by
|
||||
/// [UTS#18 Annex C](http://unicode.org/reports/tr18/#Compatibility_Properties).
|
||||
/// [UTS#18 Annex C](https://unicode.org/reports/tr18/#Compatibility_Properties).
|
||||
/// In particular, a character
|
||||
/// is considered a word character if it is in either of the `Alphabetic` or
|
||||
/// `Join_Control` properties, or is in one of the `Decimal_Number`, `Mark`
|
||||
|
@ -234,7 +236,7 @@ pub fn is_word_character(c: char) -> bool {
|
|||
/// character.
|
||||
///
|
||||
/// A Unicode word character is defined by
|
||||
/// [UTS#18 Annex C](http://unicode.org/reports/tr18/#Compatibility_Properties).
|
||||
/// [UTS#18 Annex C](https://unicode.org/reports/tr18/#Compatibility_Properties).
|
||||
/// In particular, a character
|
||||
/// is considered a word character if it is in either of the `Alphabetic` or
|
||||
/// `Join_Control` properties, or is in one of the `Decimal_Number`, `Mark`
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
use ast;
|
||||
use hir;
|
||||
use crate::ast;
|
||||
use crate::hir;
|
||||
|
||||
use Result;
|
||||
use crate::Result;
|
||||
|
||||
/// A builder for a regular expression parser.
|
||||
///
|
||||
|
|
|
@ -2,7 +2,7 @@ use std::error;
|
|||
use std::fmt;
|
||||
use std::result;
|
||||
|
||||
use hir;
|
||||
use crate::hir;
|
||||
|
||||
/// A type alias for errors specific to Unicode handling of classes.
|
||||
pub type Result<T> = result::Result<T, Error>;
|
||||
|
@ -38,7 +38,7 @@ pub struct CaseFoldError(());
|
|||
impl error::Error for CaseFoldError {}
|
||||
|
||||
impl fmt::Display for CaseFoldError {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"Unicode-aware case folding is not available \
|
||||
|
@ -58,7 +58,7 @@ pub struct UnicodeWordError(());
|
|||
impl error::Error for UnicodeWordError {}
|
||||
|
||||
impl fmt::Display for UnicodeWordError {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"Unicode-aware \\w class is not available \
|
||||
|
@ -95,7 +95,7 @@ pub fn simple_fold(
|
|||
c: char,
|
||||
) -> FoldResult<result::Result<impl Iterator<Item = char>, Option<char>>>
|
||||
{
|
||||
use unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE;
|
||||
use crate::unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE;
|
||||
|
||||
Ok(CASE_FOLDING_SIMPLE
|
||||
.binary_search_by_key(&c, |&(c1, _)| c1)
|
||||
|
@ -130,8 +130,8 @@ pub fn contains_simple_case_mapping(
|
|||
|
||||
#[cfg(feature = "unicode-case")]
|
||||
fn imp(start: char, end: char) -> FoldResult<bool> {
|
||||
use crate::unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE;
|
||||
use std::cmp::Ordering;
|
||||
use unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE;
|
||||
|
||||
assert!(start <= end);
|
||||
Ok(CASE_FOLDING_SIMPLE
|
||||
|
@ -237,8 +237,16 @@ impl<'a> ClassQuery<'a> {
|
|||
fn canonical_binary(&self, name: &str) -> Result<CanonicalClassQuery> {
|
||||
let norm = symbolic_name_normalize(name);
|
||||
|
||||
if let Some(canon) = canonical_prop(&norm)? {
|
||||
return Ok(CanonicalClassQuery::Binary(canon));
|
||||
// This is a special case where 'cf' refers to the 'Format' general
|
||||
// category, but where the 'cf' abbreviation is also an abbreviation
|
||||
// for the 'Case_Folding' property. But we want to treat it as
|
||||
// a general category. (Currently, we don't even support the
|
||||
// 'Case_Folding' property. But if we do in the future, users will be
|
||||
// required to spell it out.)
|
||||
if norm != "cf" {
|
||||
if let Some(canon) = canonical_prop(&norm)? {
|
||||
return Ok(CanonicalClassQuery::Binary(canon));
|
||||
}
|
||||
}
|
||||
if let Some(canon) = canonical_gencat(&norm)? {
|
||||
return Ok(CanonicalClassQuery::GeneralCategory(canon));
|
||||
|
@ -277,7 +285,7 @@ enum CanonicalClassQuery {
|
|||
|
||||
/// Looks up a Unicode class given a query. If one doesn't exist, then
|
||||
/// `None` is returned.
|
||||
pub fn class<'a>(query: ClassQuery<'a>) -> Result<hir::ClassUnicode> {
|
||||
pub fn class(query: ClassQuery<'_>) -> Result<hir::ClassUnicode> {
|
||||
use self::CanonicalClassQuery::*;
|
||||
|
||||
match query.canonicalize()? {
|
||||
|
@ -322,7 +330,7 @@ pub fn perl_word() -> Result<hir::ClassUnicode> {
|
|||
|
||||
#[cfg(feature = "unicode-perl")]
|
||||
fn imp() -> Result<hir::ClassUnicode> {
|
||||
use unicode_tables::perl_word::PERL_WORD;
|
||||
use crate::unicode_tables::perl_word::PERL_WORD;
|
||||
Ok(hir_class(PERL_WORD))
|
||||
}
|
||||
|
||||
|
@ -340,13 +348,13 @@ pub fn perl_space() -> Result<hir::ClassUnicode> {
|
|||
|
||||
#[cfg(all(feature = "unicode-perl", not(feature = "unicode-bool")))]
|
||||
fn imp() -> Result<hir::ClassUnicode> {
|
||||
use unicode_tables::perl_space::WHITE_SPACE;
|
||||
use crate::unicode_tables::perl_space::WHITE_SPACE;
Ok(hir_class(WHITE_SPACE))
}

#[cfg(feature = "unicode-bool")]
fn imp() -> Result<hir::ClassUnicode> {
use unicode_tables::property_bool::WHITE_SPACE;
use crate::unicode_tables::property_bool::WHITE_SPACE;
Ok(hir_class(WHITE_SPACE))
}

@ -364,13 +372,13 @@ pub fn perl_digit() -> Result<hir::ClassUnicode> {

#[cfg(all(feature = "unicode-perl", not(feature = "unicode-gencat")))]
fn imp() -> Result<hir::ClassUnicode> {
use unicode_tables::perl_decimal::DECIMAL_NUMBER;
use crate::unicode_tables::perl_decimal::DECIMAL_NUMBER;
Ok(hir_class(DECIMAL_NUMBER))
}

#[cfg(feature = "unicode-gencat")]
fn imp() -> Result<hir::ClassUnicode> {
use unicode_tables::general_category::DECIMAL_NUMBER;
use crate::unicode_tables::general_category::DECIMAL_NUMBER;
Ok(hir_class(DECIMAL_NUMBER))
}

@ -397,9 +405,9 @@ pub fn is_word_character(c: char) -> result::Result<bool, UnicodeWordError> {

#[cfg(feature = "unicode-perl")]
fn imp(c: char) -> result::Result<bool, UnicodeWordError> {
use is_word_byte;
use crate::is_word_byte;
use crate::unicode_tables::perl_word::PERL_WORD;
use std::cmp::Ordering;
use unicode_tables::perl_word::PERL_WORD;

if c <= 0x7F as char && is_word_byte(c as u8) {
return Ok(true);

@ -474,7 +482,7 @@ fn canonical_prop(normalized_name: &str) -> Result<Option<&'static str>> {
feature = "unicode-segment",
))]
fn imp(name: &str) -> Result<Option<&'static str>> {
use unicode_tables::property_names::PROPERTY_NAMES;
use crate::unicode_tables::property_names::PROPERTY_NAMES;

Ok(PROPERTY_NAMES
.binary_search_by_key(&name, |&(n, _)| n)

@ -531,7 +539,7 @@ fn property_values(
feature = "unicode-segment",
))]
fn imp(name: &'static str) -> Result<Option<PropertyValues>> {
use unicode_tables::property_values::PROPERTY_VALUES;
use crate::unicode_tables::property_values::PROPERTY_VALUES;

Ok(PROPERTY_VALUES
.binary_search_by_key(&name, |&(n, _)| n)

@ -570,7 +578,7 @@ fn ages(canonical_age: &str) -> Result<impl Iterator<Item = Range>> {

#[cfg(feature = "unicode-age")]
fn imp(canonical_age: &str) -> Result<impl Iterator<Item = Range>> {
use unicode_tables::age;
use crate::unicode_tables::age;

const AGES: &'static [(&'static str, Range)] = &[
("V1_1", age::V1_1),

@ -595,6 +603,7 @@ fn ages(canonical_age: &str) -> Result<impl Iterator<Item = Range>> {
("V11_0", age::V11_0),
("V12_0", age::V12_0),
("V12_1", age::V12_1),
("V13_0", age::V13_0),
];
assert_eq!(AGES.len(), age::BY_NAME.len(), "ages are out of sync");

@ -622,7 +631,7 @@ fn gencat(canonical_name: &'static str) -> Result<hir::ClassUnicode> {

#[cfg(feature = "unicode-gencat")]
fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
use unicode_tables::general_category::BY_NAME;
use crate::unicode_tables::general_category::BY_NAME;
match name {
"ASCII" => Ok(hir_class(&[('\0', '\x7F')])),
"Any" => Ok(hir_class(&[('\0', '\u{10FFFF}')])),

@ -657,7 +666,7 @@ fn script(canonical_name: &'static str) -> Result<hir::ClassUnicode> {

#[cfg(feature = "unicode-script")]
fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
use unicode_tables::script::BY_NAME;
use crate::unicode_tables::script::BY_NAME;
property_set(BY_NAME, name)
.map(hir_class)
.ok_or(Error::PropertyValueNotFound)

@ -682,7 +691,7 @@ fn script_extension(

#[cfg(feature = "unicode-script")]
fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
use unicode_tables::script_extension::BY_NAME;
use crate::unicode_tables::script_extension::BY_NAME;
property_set(BY_NAME, name)
.map(hir_class)
.ok_or(Error::PropertyValueNotFound)

@ -706,7 +715,7 @@ fn bool_property(canonical_name: &'static str) -> Result<hir::ClassUnicode> {

#[cfg(feature = "unicode-bool")]
fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
use unicode_tables::property_bool::BY_NAME;
use crate::unicode_tables::property_bool::BY_NAME;
property_set(BY_NAME, name)
.map(hir_class)
.ok_or(Error::PropertyNotFound)

@ -734,7 +743,7 @@ fn gcb(canonical_name: &'static str) -> Result<hir::ClassUnicode> {

#[cfg(feature = "unicode-segment")]
fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
use unicode_tables::grapheme_cluster_break::BY_NAME;
use crate::unicode_tables::grapheme_cluster_break::BY_NAME;
property_set(BY_NAME, name)
.map(hir_class)
.ok_or(Error::PropertyValueNotFound)

@ -758,7 +767,7 @@ fn wb(canonical_name: &'static str) -> Result<hir::ClassUnicode> {

#[cfg(feature = "unicode-segment")]
fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
use unicode_tables::word_break::BY_NAME;
use crate::unicode_tables::word_break::BY_NAME;
property_set(BY_NAME, name)
.map(hir_class)
.ok_or(Error::PropertyValueNotFound)

@ -782,7 +791,7 @@ fn sb(canonical_name: &'static str) -> Result<hir::ClassUnicode> {

#[cfg(feature = "unicode-segment")]
fn imp(name: &'static str) -> Result<hir::ClassUnicode> {
use unicode_tables::sentence_break::BY_NAME;
use crate::unicode_tables::sentence_break::BY_NAME;
property_set(BY_NAME, name)
.map(hir_class)
.ok_or(Error::PropertyValueNotFound)

@ -814,7 +823,7 @@ fn symbolic_name_normalize(x: &str) -> String {
/// The slice returned is guaranteed to be valid UTF-8 for all possible values
/// of `slice`.
///
/// See: http://unicode.org/reports/tr44/#UAX44-LM3
/// See: https://unicode.org/reports/tr44/#UAX44-LM3
fn symbolic_name_normalize_bytes(slice: &mut [u8]) -> &mut [u8] {
// I couldn't find a place in the standard that specified that property
// names/aliases had a particular structure (unlike character names), but
@ -1,14 +1,17 @@
// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
//
// ucd-generate age /tmp/ucd/12.1.0/ --chars
// ucd-generate age ucd-13.0.0 --chars
//
// ucd-generate is available on crates.io.
// Unicode version: 13.0.0.
//
// ucd-generate 0.2.8 is available on crates.io.

pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[
|
||||
("V10_0", V10_0),
|
||||
("V11_0", V11_0),
|
||||
("V12_0", V12_0),
|
||||
("V12_1", V12_1),
|
||||
("V13_0", V13_0),
|
||||
("V1_1", V1_1),
|
||||
("V2_0", V2_0),
|
||||
("V2_1", V2_1),
|
||||
|
@ -135,69 +138,131 @@ pub const V11_0: &'static [(char, char)] = &[
|
|||
];
|
||||
|
||||
pub const V12_0: &'static [(char, char)] = &[
|
||||
('\u{c77}', '\u{c77}'),
|
||||
('\u{e86}', '\u{e86}'),
|
||||
('\u{e89}', '\u{e89}'),
|
||||
('\u{e8c}', '\u{e8c}'),
|
||||
('\u{e8e}', '\u{e93}'),
|
||||
('\u{e98}', '\u{e98}'),
|
||||
('\u{ea0}', '\u{ea0}'),
|
||||
('\u{ea8}', '\u{ea9}'),
|
||||
('\u{eac}', '\u{eac}'),
|
||||
('౷', '౷'),
|
||||
('ຆ', 'ຆ'),
|
||||
('ຉ', 'ຉ'),
|
||||
('ຌ', 'ຌ'),
|
||||
('ຎ', 'ຓ'),
|
||||
('ຘ', 'ຘ'),
|
||||
('ຠ', 'ຠ'),
|
||||
('ຨ', 'ຩ'),
|
||||
('ຬ', 'ຬ'),
|
||||
('\u{eba}', '\u{eba}'),
|
||||
('\u{1cfa}', '\u{1cfa}'),
|
||||
('\u{2bc9}', '\u{2bc9}'),
|
||||
('\u{2bff}', '\u{2bff}'),
|
||||
('\u{2e4f}', '\u{2e4f}'),
|
||||
('\u{a7ba}', '\u{a7bf}'),
|
||||
('\u{a7c2}', '\u{a7c6}'),
|
||||
('\u{ab66}', '\u{ab67}'),
|
||||
('\u{10fe0}', '\u{10ff6}'),
|
||||
('\u{1145f}', '\u{1145f}'),
|
||||
('\u{116b8}', '\u{116b8}'),
|
||||
('\u{119a0}', '\u{119a7}'),
|
||||
('\u{119aa}', '\u{119d7}'),
|
||||
('\u{119da}', '\u{119e4}'),
|
||||
('\u{11a84}', '\u{11a85}'),
|
||||
('\u{11fc0}', '\u{11ff1}'),
|
||||
('\u{11fff}', '\u{11fff}'),
|
||||
('ᳺ', 'ᳺ'),
|
||||
('⯉', '⯉'),
|
||||
('⯿', '⯿'),
|
||||
('⹏', '⹏'),
|
||||
('Ꞻ', 'ꞿ'),
|
||||
('Ꟃ', 'Ᶎ'),
|
||||
('ꭦ', 'ꭧ'),
|
||||
('𐿠', '𐿶'),
|
||||
('𑑟', '𑑟'),
|
||||
('𑚸', '𑚸'),
|
||||
('𑦠', '𑦧'),
|
||||
('𑦪', '\u{119d7}'),
|
||||
('\u{119da}', '𑧤'),
|
||||
('𑪄', '𑪅'),
|
||||
('𑿀', '𑿱'),
|
||||
('𑿿', '𑿿'),
|
||||
('\u{13430}', '\u{13438}'),
|
||||
('\u{16f45}', '\u{16f4a}'),
|
||||
('𖽅', '𖽊'),
|
||||
('\u{16f4f}', '\u{16f4f}'),
|
||||
('\u{16f7f}', '\u{16f87}'),
|
||||
('\u{16fe2}', '\u{16fe3}'),
|
||||
('\u{187f2}', '\u{187f7}'),
|
||||
('\u{1b150}', '\u{1b152}'),
|
||||
('\u{1b164}', '\u{1b167}'),
|
||||
('\u{1e100}', '\u{1e12c}'),
|
||||
('\u{1e130}', '\u{1e13d}'),
|
||||
('\u{1e140}', '\u{1e149}'),
|
||||
('\u{1e14e}', '\u{1e14f}'),
|
||||
('\u{1e2c0}', '\u{1e2f9}'),
|
||||
('\u{1e2ff}', '\u{1e2ff}'),
|
||||
('\u{1e94b}', '\u{1e94b}'),
|
||||
('\u{1ed01}', '\u{1ed3d}'),
|
||||
('\u{1f16c}', '\u{1f16c}'),
|
||||
('\u{1f6d5}', '\u{1f6d5}'),
|
||||
('\u{1f6fa}', '\u{1f6fa}'),
|
||||
('\u{1f7e0}', '\u{1f7eb}'),
|
||||
('\u{1f90d}', '\u{1f90f}'),
|
||||
('\u{1f93f}', '\u{1f93f}'),
|
||||
('\u{1f971}', '\u{1f971}'),
|
||||
('\u{1f97b}', '\u{1f97b}'),
|
||||
('\u{1f9a5}', '\u{1f9aa}'),
|
||||
('\u{1f9ae}', '\u{1f9af}'),
|
||||
('\u{1f9ba}', '\u{1f9bf}'),
|
||||
('\u{1f9c3}', '\u{1f9ca}'),
|
||||
('\u{1f9cd}', '\u{1f9cf}'),
|
||||
('\u{1fa00}', '\u{1fa53}'),
|
||||
('\u{1fa70}', '\u{1fa73}'),
|
||||
('\u{1fa78}', '\u{1fa7a}'),
|
||||
('\u{1fa80}', '\u{1fa82}'),
|
||||
('\u{1fa90}', '\u{1fa95}'),
|
||||
('𖽿', '𖾇'),
|
||||
('𖿢', '𖿣'),
|
||||
('𘟲', '𘟷'),
|
||||
('𛅐', '𛅒'),
|
||||
('𛅤', '𛅧'),
|
||||
('𞄀', '𞄬'),
|
||||
('\u{1e130}', '𞄽'),
|
||||
('𞅀', '𞅉'),
|
||||
('𞅎', '𞅏'),
|
||||
('𞋀', '𞋹'),
|
||||
('𞋿', '𞋿'),
|
||||
('𞥋', '𞥋'),
|
||||
('𞴁', '𞴽'),
|
||||
('🅬', '🅬'),
|
||||
('🛕', '🛕'),
|
||||
('🛺', '🛺'),
|
||||
('🟠', '🟫'),
|
||||
('🤍', '🤏'),
|
||||
('🤿', '🤿'),
|
||||
('🥱', '🥱'),
|
||||
('🥻', '🥻'),
|
||||
('🦥', '🦪'),
|
||||
('🦮', '🦯'),
|
||||
('🦺', '🦿'),
|
||||
('🧃', '🧊'),
|
||||
('🧍', '🧏'),
|
||||
('🨀', '🩓'),
|
||||
('🩰', '🩳'),
|
||||
('🩸', '🩺'),
|
||||
('🪀', '🪂'),
|
||||
('🪐', '🪕'),
|
||||
];
|
||||
|
||||
pub const V12_1: &'static [(char, char)] = &[('\u{32ff}', '\u{32ff}')];
|
||||
pub const V12_1: &'static [(char, char)] = &[('㋿', '㋿')];
|
||||
|
||||
pub const V13_0: &'static [(char, char)] = &[
|
||||
('\u{8be}', '\u{8c7}'),
|
||||
('\u{b55}', '\u{b55}'),
|
||||
('\u{d04}', '\u{d04}'),
|
||||
('\u{d81}', '\u{d81}'),
|
||||
('\u{1abf}', '\u{1ac0}'),
|
||||
('\u{2b97}', '\u{2b97}'),
|
||||
('\u{2e50}', '\u{2e52}'),
|
||||
('\u{31bb}', '\u{31bf}'),
|
||||
('\u{4db6}', '\u{4dbf}'),
|
||||
('\u{9ff0}', '\u{9ffc}'),
|
||||
('\u{a7c7}', '\u{a7ca}'),
|
||||
('\u{a7f5}', '\u{a7f6}'),
|
||||
('\u{a82c}', '\u{a82c}'),
|
||||
('\u{ab68}', '\u{ab6b}'),
|
||||
('\u{1019c}', '\u{1019c}'),
|
||||
('\u{10e80}', '\u{10ea9}'),
|
||||
('\u{10eab}', '\u{10ead}'),
|
||||
('\u{10eb0}', '\u{10eb1}'),
|
||||
('\u{10fb0}', '\u{10fcb}'),
|
||||
('\u{11147}', '\u{11147}'),
|
||||
('\u{111ce}', '\u{111cf}'),
|
||||
('\u{1145a}', '\u{1145a}'),
|
||||
('\u{11460}', '\u{11461}'),
|
||||
('\u{11900}', '\u{11906}'),
|
||||
('\u{11909}', '\u{11909}'),
|
||||
('\u{1190c}', '\u{11913}'),
|
||||
('\u{11915}', '\u{11916}'),
|
||||
('\u{11918}', '\u{11935}'),
|
||||
('\u{11937}', '\u{11938}'),
|
||||
('\u{1193b}', '\u{11946}'),
|
||||
('\u{11950}', '\u{11959}'),
|
||||
('\u{11fb0}', '\u{11fb0}'),
|
||||
('\u{16fe4}', '\u{16fe4}'),
|
||||
('\u{16ff0}', '\u{16ff1}'),
|
||||
('\u{18af3}', '\u{18cd5}'),
|
||||
('\u{18d00}', '\u{18d08}'),
|
||||
('\u{1f10d}', '\u{1f10f}'),
|
||||
('\u{1f16d}', '\u{1f16f}'),
|
||||
('\u{1f1ad}', '\u{1f1ad}'),
|
||||
('\u{1f6d6}', '\u{1f6d7}'),
|
||||
('\u{1f6fb}', '\u{1f6fc}'),
|
||||
('\u{1f8b0}', '\u{1f8b1}'),
|
||||
('\u{1f90c}', '\u{1f90c}'),
|
||||
('\u{1f972}', '\u{1f972}'),
|
||||
('\u{1f977}', '\u{1f978}'),
|
||||
('\u{1f9a3}', '\u{1f9a4}'),
|
||||
('\u{1f9ab}', '\u{1f9ad}'),
|
||||
('\u{1f9cb}', '\u{1f9cb}'),
|
||||
('\u{1fa74}', '\u{1fa74}'),
|
||||
('\u{1fa83}', '\u{1fa86}'),
|
||||
('\u{1fa96}', '\u{1faa8}'),
|
||||
('\u{1fab0}', '\u{1fab6}'),
|
||||
('\u{1fac0}', '\u{1fac2}'),
|
||||
('\u{1fad0}', '\u{1fad6}'),
|
||||
('\u{1fb00}', '\u{1fb92}'),
|
||||
('\u{1fb94}', '\u{1fbca}'),
|
||||
('\u{1fbf0}', '\u{1fbf9}'),
|
||||
('\u{2a6d7}', '\u{2a6dd}'),
|
||||
('\u{30000}', '\u{3134a}'),
|
||||
];
|
||||
|
||||
pub const V1_1: &'static [(char, char)] = &[
|
||||
('\u{0}', 'ǵ'),
|
||||
@ -1,8 +1,10 @@
// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
//
// ucd-generate case-folding-simple /tmp/ucd/12.1.0/ --chars --all-pairs
// ucd-generate case-folding-simple ucd-13.0.0 --chars --all-pairs
//
// ucd-generate is available on crates.io.
// Unicode version: 13.0.0.
//
// ucd-generate 0.2.8 is available on crates.io.

pub const CASE_FOLDING_SIMPLE: &'static [(char, &'static [char])] = &[
|
||||
('A', &['a']),
|
||||
|
@ -459,7 +461,7 @@ pub const CASE_FOLDING_SIMPLE: &'static [(char, &'static [char])] = &[
|
|||
('ɵ', &['Ɵ']),
|
||||
('ɽ', &['Ɽ']),
|
||||
('ʀ', &['Ʀ']),
|
||||
('ʂ', &['\u{a7c5}']),
|
||||
('ʂ', &['Ʂ']),
|
||||
('ʃ', &['Ʃ']),
|
||||
('ʇ', &['Ʇ']),
|
||||
('ʈ', &['Ʈ']),
|
||||
|
@ -1199,7 +1201,7 @@ pub const CASE_FOLDING_SIMPLE: &'static [(char, &'static [char])] = &[
|
|||
('Ჿ', &['ჿ']),
|
||||
('ᵹ', &['Ᵹ']),
|
||||
('ᵽ', &['Ᵽ']),
|
||||
('ᶎ', &['\u{a7c6}']),
|
||||
('ᶎ', &['Ᶎ']),
|
||||
('Ḁ', &['ḁ']),
|
||||
('ḁ', &['Ḁ']),
|
||||
('Ḃ', &['ḃ']),
|
||||
|
@ -2167,7 +2169,7 @@ pub const CASE_FOLDING_SIMPLE: &'static [(char, &'static [char])] = &[
|
|||
('ꞑ', &['Ꞑ']),
|
||||
('Ꞓ', &['ꞓ']),
|
||||
('ꞓ', &['Ꞓ']),
|
||||
('ꞔ', &['\u{a7c4}']),
|
||||
('ꞔ', &['Ꞔ']),
|
||||
('Ꞗ', &['ꞗ']),
|
||||
('ꞗ', &['Ꞗ']),
|
||||
('Ꞙ', &['ꞙ']),
|
||||
|
@ -2203,17 +2205,23 @@ pub const CASE_FOLDING_SIMPLE: &'static [(char, &'static [char])] = &[
|
|||
('ꞷ', &['Ꞷ']),
|
||||
('Ꞹ', &['ꞹ']),
|
||||
('ꞹ', &['Ꞹ']),
|
||||
('\u{a7ba}', &['\u{a7bb}']),
|
||||
('\u{a7bb}', &['\u{a7ba}']),
|
||||
('\u{a7bc}', &['\u{a7bd}']),
|
||||
('\u{a7bd}', &['\u{a7bc}']),
|
||||
('\u{a7be}', &['\u{a7bf}']),
|
||||
('\u{a7bf}', &['\u{a7be}']),
|
||||
('\u{a7c2}', &['\u{a7c3}']),
|
||||
('\u{a7c3}', &['\u{a7c2}']),
|
||||
('\u{a7c4}', &['ꞔ']),
|
||||
('\u{a7c5}', &['ʂ']),
|
||||
('\u{a7c6}', &['ᶎ']),
|
||||
('Ꞻ', &['ꞻ']),
|
||||
('ꞻ', &['Ꞻ']),
|
||||
('Ꞽ', &['ꞽ']),
|
||||
('ꞽ', &['Ꞽ']),
|
||||
('Ꞿ', &['ꞿ']),
|
||||
('ꞿ', &['Ꞿ']),
|
||||
('Ꟃ', &['ꟃ']),
|
||||
('ꟃ', &['Ꟃ']),
|
||||
('Ꞔ', &['ꞔ']),
|
||||
('Ʂ', &['ʂ']),
|
||||
('Ᶎ', &['ᶎ']),
|
||||
('\u{a7c7}', &['\u{a7c8}']),
|
||||
('\u{a7c8}', &['\u{a7c7}']),
|
||||
('\u{a7c9}', &['\u{a7ca}']),
|
||||
('\u{a7ca}', &['\u{a7c9}']),
|
||||
('\u{a7f5}', &['\u{a7f6}']),
|
||||
('\u{a7f6}', &['\u{a7f5}']),
|
||||
('ꭓ', &['Ꭓ']),
|
||||
('ꭰ', &['Ꭰ']),
|
||||
('ꭱ', &['Ꭱ']),
|
||||
Diff not shown because the file is too large.
@ -1,8 +1,10 @@
// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
//
// ucd-generate grapheme-cluster-break /tmp/ucd/12.1.0/ --chars
// ucd-generate grapheme-cluster-break ucd-13.0.0 --chars
//
// ucd-generate is available on crates.io.
// Unicode version: 13.0.0.
//
// ucd-generate 0.2.8 is available on crates.io.

pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[
|
||||
("CR", CR),
|
||||
|
@ -105,7 +107,7 @@ pub const EXTEND: &'static [(char, char)] = &[
|
|||
('\u{b3e}', '\u{b3f}'),
|
||||
('\u{b41}', '\u{b44}'),
|
||||
('\u{b4d}', '\u{b4d}'),
|
||||
('\u{b56}', '\u{b57}'),
|
||||
('\u{b55}', '\u{b57}'),
|
||||
('\u{b62}', '\u{b63}'),
|
||||
('\u{b82}', '\u{b82}'),
|
||||
('\u{bbe}', '\u{bbe}'),
|
||||
|
@ -134,6 +136,7 @@ pub const EXTEND: &'static [(char, char)] = &[
|
|||
('\u{d4d}', '\u{d4d}'),
|
||||
('\u{d57}', '\u{d57}'),
|
||||
('\u{d62}', '\u{d63}'),
|
||||
('\u{d81}', '\u{d81}'),
|
||||
('\u{dca}', '\u{dca}'),
|
||||
('\u{dcf}', '\u{dcf}'),
|
||||
('\u{dd2}', '\u{dd4}'),
|
||||
|
@ -192,7 +195,7 @@ pub const EXTEND: &'static [(char, char)] = &[
|
|||
('\u{1a65}', '\u{1a6c}'),
|
||||
('\u{1a73}', '\u{1a7c}'),
|
||||
('\u{1a7f}', '\u{1a7f}'),
|
||||
('\u{1ab0}', '\u{1abe}'),
|
||||
('\u{1ab0}', '\u{1ac0}'),
|
||||
('\u{1b00}', '\u{1b03}'),
|
||||
('\u{1b34}', '\u{1b3a}'),
|
||||
('\u{1b3c}', '\u{1b3c}'),
|
||||
|
@ -231,6 +234,7 @@ pub const EXTEND: &'static [(char, char)] = &[
|
|||
('\u{a806}', '\u{a806}'),
|
||||
('\u{a80b}', '\u{a80b}'),
|
||||
('\u{a825}', '\u{a826}'),
|
||||
('\u{a82c}', '\u{a82c}'),
|
||||
('\u{a8c4}', '\u{a8c5}'),
|
||||
('\u{a8e0}', '\u{a8f1}'),
|
||||
('\u{a8ff}', '\u{a8ff}'),
|
||||
|
@ -239,7 +243,7 @@ pub const EXTEND: &'static [(char, char)] = &[
|
|||
('\u{a980}', '\u{a982}'),
|
||||
('\u{a9b3}', '\u{a9b3}'),
|
||||
('\u{a9b6}', '\u{a9b9}'),
|
||||
('\u{a9bc}', 'ꦽ'),
|
||||
('\u{a9bc}', '\u{a9bd}'),
|
||||
('\u{a9e5}', '\u{a9e5}'),
|
||||
('\u{aa29}', '\u{aa2e}'),
|
||||
('\u{aa31}', '\u{aa32}'),
|
||||
|
@ -271,6 +275,7 @@ pub const EXTEND: &'static [(char, char)] = &[
|
|||
('\u{10a3f}', '\u{10a3f}'),
|
||||
('\u{10ae5}', '\u{10ae6}'),
|
||||
('\u{10d24}', '\u{10d27}'),
|
||||
('\u{10eab}', '\u{10eac}'),
|
||||
('\u{10f46}', '\u{10f50}'),
|
||||
('\u{11001}', '\u{11001}'),
|
||||
('\u{11038}', '\u{11046}'),
|
||||
|
@ -284,6 +289,7 @@ pub const EXTEND: &'static [(char, char)] = &[
|
|||
('\u{11180}', '\u{11181}'),
|
||||
('\u{111b6}', '\u{111be}'),
|
||||
('\u{111c9}', '\u{111cc}'),
|
||||
('\u{111cf}', '\u{111cf}'),
|
||||
('\u{1122f}', '\u{11231}'),
|
||||
('\u{11234}', '\u{11234}'),
|
||||
('\u{11236}', '\u{11237}'),
|
||||
|
@ -324,6 +330,10 @@ pub const EXTEND: &'static [(char, char)] = &[
|
|||
('\u{11727}', '\u{1172b}'),
|
||||
('\u{1182f}', '\u{11837}'),
|
||||
('\u{11839}', '\u{1183a}'),
|
||||
('\u{11930}', '\u{11930}'),
|
||||
('\u{1193b}', '\u{1193c}'),
|
||||
('\u{1193e}', '\u{1193e}'),
|
||||
('\u{11943}', '\u{11943}'),
|
||||
('\u{119d4}', '\u{119d7}'),
|
||||
('\u{119da}', '\u{119db}'),
|
||||
('\u{119e0}', '\u{119e0}'),
|
||||
|
@ -355,6 +365,7 @@ pub const EXTEND: &'static [(char, char)] = &[
|
|||
('\u{16b30}', '\u{16b36}'),
|
||||
('\u{16f4f}', '\u{16f4f}'),
|
||||
('\u{16f8f}', '\u{16f92}'),
|
||||
('\u{16fe4}', '\u{16fe4}'),
|
||||
('\u{1bc9d}', '\u{1bc9e}'),
|
||||
('\u{1d165}', '\u{1d165}'),
|
||||
('\u{1d167}', '\u{1d169}'),
|
||||
|
@ -1200,8 +1211,10 @@ pub const PREPEND: &'static [(char, char)] = &[
|
|||
('\u{110bd}', '\u{110bd}'),
|
||||
('\u{110cd}', '\u{110cd}'),
|
||||
('𑇂', '𑇃'),
|
||||
('\u{1193f}', '\u{1193f}'),
|
||||
('\u{11941}', '\u{11941}'),
|
||||
('𑨺', '𑨺'),
|
||||
('\u{11a84}', '𑪉'),
|
||||
('𑪄', '𑪉'),
|
||||
('𑵆', '𑵆'),
|
||||
];
|
||||
|
||||
|
@ -1311,6 +1324,7 @@ pub const SPACINGMARK: &'static [(char, char)] = &[
|
|||
('𑆂', '𑆂'),
|
||||
('𑆳', '𑆵'),
|
||||
('𑆿', '𑇀'),
|
||||
('\u{111ce}', '\u{111ce}'),
|
||||
('𑈬', '𑈮'),
|
||||
('𑈲', '𑈳'),
|
||||
('𑈵', '𑈵'),
|
||||
|
@ -1342,9 +1356,14 @@ pub const SPACINGMARK: &'static [(char, char)] = &[
|
|||
('𑜦', '𑜦'),
|
||||
('𑠬', '𑠮'),
|
||||
('𑠸', '𑠸'),
|
||||
('\u{119d1}', '\u{119d3}'),
|
||||
('\u{119dc}', '\u{119df}'),
|
||||
('\u{119e4}', '\u{119e4}'),
|
||||
('\u{11931}', '\u{11935}'),
|
||||
('\u{11937}', '\u{11938}'),
|
||||
('\u{1193d}', '\u{1193d}'),
|
||||
('\u{11940}', '\u{11940}'),
|
||||
('\u{11942}', '\u{11942}'),
|
||||
('𑧑', '𑧓'),
|
||||
('𑧜', '𑧟'),
|
||||
('𑧤', '𑧤'),
|
||||
('𑨹', '𑨹'),
|
||||
('𑩗', '𑩘'),
|
||||
('𑪗', '𑪗'),
|
||||
|
@ -1357,7 +1376,8 @@ pub const SPACINGMARK: &'static [(char, char)] = &[
|
|||
('𑶓', '𑶔'),
|
||||
('𑶖', '𑶖'),
|
||||
('𑻵', '𑻶'),
|
||||
('𖽑', '\u{16f87}'),
|
||||
('𖽑', '𖾇'),
|
||||
('\u{16ff0}', '\u{16ff1}'),
|
||||
('𝅦', '𝅦'),
|
||||
('𝅭', '𝅭'),
|
||||
];
|
||||
@ -1,8 +1,10 @@
// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
//
// ucd-generate general-category /tmp/ucd/12.1.0/ --chars --include decimalnumber
// ucd-generate general-category ucd-13.0.0 --chars --include decimalnumber
//
// ucd-generate is available on crates.io.
// Unicode version: 13.0.0.
//
// ucd-generate 0.2.8 is available on crates.io.

pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] =
|
||||
&[("Decimal_Number", DECIMAL_NUMBER)];
|
||||
|
@ -58,13 +60,15 @@ pub const DECIMAL_NUMBER: &'static [(char, char)] = &[
|
|||
('𑛀', '𑛉'),
|
||||
('𑜰', '𑜹'),
|
||||
('𑣠', '𑣩'),
|
||||
('\u{11950}', '\u{11959}'),
|
||||
('𑱐', '𑱙'),
|
||||
('𑵐', '𑵙'),
|
||||
('𑶠', '𑶩'),
|
||||
('𖩠', '𖩩'),
|
||||
('𖭐', '𖭙'),
|
||||
('𝟎', '𝟿'),
|
||||
('\u{1e140}', '\u{1e149}'),
|
||||
('\u{1e2f0}', '\u{1e2f9}'),
|
||||
('𞅀', '𞅉'),
|
||||
('𞋰', '𞋹'),
|
||||
('𞥐', '𞥙'),
|
||||
('\u{1fbf0}', '\u{1fbf9}'),
|
||||
];
|
||||
@ -1,8 +1,10 @@
// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
//
// ucd-generate property-bool /tmp/ucd/12.1.0/ --chars --include whitespace
// ucd-generate property-bool ucd-13.0.0 --chars --include whitespace
//
// ucd-generate is available on crates.io.
// Unicode version: 13.0.0.
//
// ucd-generate 0.2.8 is available on crates.io.

pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] =
|
||||
&[("White_Space", WHITE_SPACE)];
|
||||
@ -1,8 +1,10 @@
// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
//
// ucd-generate perl-word /tmp/ucd/12.1.0/ --chars
// ucd-generate perl-word ucd-13.0.0 --chars
//
// ucd-generate is available on crates.io.
// Unicode version: 13.0.0.
//
// ucd-generate 0.2.8 is available on crates.io.

pub const PERL_WORD: &'static [(char, char)] = &[
|
||||
('0', '9'),
|
||||
|
@ -56,7 +58,7 @@ pub const PERL_WORD: &'static [(char, char)] = &[
|
|||
('ࡀ', '\u{85b}'),
|
||||
('ࡠ', 'ࡪ'),
|
||||
('ࢠ', 'ࢴ'),
|
||||
('ࢶ', 'ࢽ'),
|
||||
('ࢶ', '\u{8c7}'),
|
||||
('\u{8d3}', '\u{8e1}'),
|
||||
('\u{8e3}', '\u{963}'),
|
||||
('०', '९'),
|
||||
|
@ -116,7 +118,7 @@ pub const PERL_WORD: &'static [(char, char)] = &[
|
|||
('\u{b3c}', '\u{b44}'),
|
||||
('େ', 'ୈ'),
|
||||
('ୋ', '\u{b4d}'),
|
||||
('\u{b56}', '\u{b57}'),
|
||||
('\u{b55}', '\u{b57}'),
|
||||
('ଡ଼', 'ଢ଼'),
|
||||
('ୟ', '\u{b63}'),
|
||||
('୦', '୯'),
|
||||
|
@ -162,8 +164,7 @@ pub const PERL_WORD: &'static [(char, char)] = &[
|
|||
('ೠ', '\u{ce3}'),
|
||||
('೦', '೯'),
|
||||
('ೱ', 'ೲ'),
|
||||
('\u{d00}', 'ഃ'),
|
||||
('അ', 'ഌ'),
|
||||
('\u{d00}', 'ഌ'),
|
||||
('എ', 'ഐ'),
|
||||
('ഒ', '\u{d44}'),
|
||||
('െ', 'ൈ'),
|
||||
|
@ -172,7 +173,7 @@ pub const PERL_WORD: &'static [(char, char)] = &[
|
|||
('ൟ', '\u{d63}'),
|
||||
('൦', '൯'),
|
||||
('ൺ', 'ൿ'),
|
||||
('ං', 'ඃ'),
|
||||
('\u{d81}', 'ඃ'),
|
||||
('අ', 'ඖ'),
|
||||
('ක', 'න'),
|
||||
('ඳ', 'ර'),
|
||||
|
@ -189,8 +190,8 @@ pub const PERL_WORD: &'static [(char, char)] = &[
|
|||
('๐', '๙'),
|
||||
('ກ', 'ຂ'),
|
||||
('ຄ', 'ຄ'),
|
||||
('\u{e86}', 'ຊ'),
|
||||
('\u{e8c}', 'ຣ'),
|
||||
('ຆ', 'ຊ'),
|
||||
('ຌ', 'ຣ'),
|
||||
('ລ', 'ລ'),
|
||||
('ວ', 'ຽ'),
|
||||
('ເ', 'ໄ'),
|
||||
|
@ -271,7 +272,7 @@ pub const PERL_WORD: &'static [(char, char)] = &[
|
|||
('\u{1a7f}', '᪉'),
|
||||
('᪐', '᪙'),
|
||||
('ᪧ', 'ᪧ'),
|
||||
('\u{1ab0}', '\u{1abe}'),
|
||||
('\u{1ab0}', '\u{1ac0}'),
|
||||
('\u{1b00}', 'ᭋ'),
|
||||
('᭐', '᭙'),
|
||||
('\u{1b6b}', '\u{1b73}'),
|
||||
|
@ -283,7 +284,7 @@ pub const PERL_WORD: &'static [(char, char)] = &[
|
|||
('Ა', 'Ჺ'),
|
||||
('Ჽ', 'Ჿ'),
|
||||
('\u{1cd0}', '\u{1cd2}'),
|
||||
('\u{1cd4}', '\u{1cfa}'),
|
||||
('\u{1cd4}', 'ᳺ'),
|
||||
('ᴀ', '\u{1df9}'),
|
||||
('\u{1dfb}', 'ἕ'),
|
||||
('Ἐ', 'Ἕ'),
|
||||
|
@ -357,10 +358,10 @@ pub const PERL_WORD: &'static [(char, char)] = &[
|
|||
('ー', 'ヿ'),
|
||||
('ㄅ', 'ㄯ'),
|
||||
('ㄱ', 'ㆎ'),
|
||||
('ㆠ', 'ㆺ'),
|
||||
('ㆠ', '\u{31bf}'),
|
||||
('ㇰ', 'ㇿ'),
|
||||
('㐀', '䶵'),
|
||||
('一', '鿯'),
|
||||
('㐀', '\u{4dbf}'),
|
||||
('一', '\u{9ffc}'),
|
||||
('ꀀ', 'ꒌ'),
|
||||
('ꓐ', 'ꓽ'),
|
||||
('ꔀ', 'ꘌ'),
|
||||
|
@ -370,9 +371,10 @@ pub const PERL_WORD: &'static [(char, char)] = &[
|
|||
('ꙿ', '\u{a6f1}'),
|
||||
('ꜗ', 'ꜟ'),
|
||||
('Ꜣ', 'ꞈ'),
|
||||
('Ꞌ', '\u{a7bf}'),
|
||||
('\u{a7c2}', '\u{a7c6}'),
|
||||
('ꟷ', 'ꠧ'),
|
||||
('Ꞌ', 'ꞿ'),
|
||||
('Ꟃ', '\u{a7ca}'),
|
||||
('\u{a7f5}', 'ꠧ'),
|
||||
('\u{a82c}', '\u{a82c}'),
|
||||
('ꡀ', 'ꡳ'),
|
||||
('ꢀ', '\u{a8c5}'),
|
||||
('꣐', '꣙'),
|
||||
|
@ -398,7 +400,7 @@ pub const PERL_WORD: &'static [(char, char)] = &[
|
|||
('ꬠ', 'ꬦ'),
|
||||
('ꬨ', 'ꬮ'),
|
||||
('ꬰ', 'ꭚ'),
|
||||
('ꭜ', '\u{ab67}'),
|
||||
('ꭜ', '\u{ab69}'),
|
||||
('ꭰ', 'ꯪ'),
|
||||
('꯬', '\u{abed}'),
|
||||
('꯰', '꯹'),
|
||||
|
@ -497,10 +499,14 @@ pub const PERL_WORD: &'static [(char, char)] = &[
|
|||
('𐳀', '𐳲'),
|
||||
('𐴀', '\u{10d27}'),
|
||||
('𐴰', '𐴹'),
|
||||
('\u{10e80}', '\u{10ea9}'),
|
||||
('\u{10eab}', '\u{10eac}'),
|
||||
('\u{10eb0}', '\u{10eb1}'),
|
||||
('𐼀', '𐼜'),
|
||||
('𐼧', '𐼧'),
|
||||
('𐼰', '\u{10f50}'),
|
||||
('\u{10fe0}', '\u{10ff6}'),
|
||||
('\u{10fb0}', '\u{10fc4}'),
|
||||
('𐿠', '𐿶'),
|
||||
('𑀀', '\u{11046}'),
|
||||
('𑁦', '𑁯'),
|
||||
('\u{1107f}', '\u{110ba}'),
|
||||
|
@ -508,12 +514,12 @@ pub const PERL_WORD: &'static [(char, char)] = &[
|
|||
('𑃰', '𑃹'),
|
||||
('\u{11100}', '\u{11134}'),
|
||||
('𑄶', '𑄿'),
|
||||
('𑅄', '𑅆'),
|
||||
('𑅄', '\u{11147}'),
|
||||
('𑅐', '\u{11173}'),
|
||||
('𑅶', '𑅶'),
|
||||
('\u{11180}', '𑇄'),
|
||||
('\u{111c9}', '\u{111cc}'),
|
||||
('𑇐', '𑇚'),
|
||||
('\u{111ce}', '𑇚'),
|
||||
('𑇜', '𑇜'),
|
||||
('𑈀', '𑈑'),
|
||||
('𑈓', '\u{11237}'),
|
||||
|
@ -542,7 +548,7 @@ pub const PERL_WORD: &'static [(char, char)] = &[
|
|||
('\u{11370}', '\u{11374}'),
|
||||
('𑐀', '𑑊'),
|
||||
('𑑐', '𑑙'),
|
||||
('\u{1145e}', '\u{1145f}'),
|
||||
('\u{1145e}', '\u{11461}'),
|
||||
('𑒀', '𑓅'),
|
||||
('𑓇', '𑓇'),
|
||||
('𑓐', '𑓙'),
|
||||
|
@ -552,18 +558,25 @@ pub const PERL_WORD: &'static [(char, char)] = &[
|
|||
('𑘀', '\u{11640}'),
|
||||
('𑙄', '𑙄'),
|
||||
('𑙐', '𑙙'),
|
||||
('𑚀', '\u{116b8}'),
|
||||
('𑚀', '𑚸'),
|
||||
('𑛀', '𑛉'),
|
||||
('𑜀', '𑜚'),
|
||||
('\u{1171d}', '\u{1172b}'),
|
||||
('𑜰', '𑜹'),
|
||||
('𑠀', '\u{1183a}'),
|
||||
('𑢠', '𑣩'),
|
||||
('𑣿', '𑣿'),
|
||||
('\u{119a0}', '\u{119a7}'),
|
||||
('\u{119aa}', '\u{119d7}'),
|
||||
('\u{119da}', '\u{119e1}'),
|
||||
('\u{119e3}', '\u{119e4}'),
|
||||
('𑣿', '\u{11906}'),
|
||||
('\u{11909}', '\u{11909}'),
|
||||
('\u{1190c}', '\u{11913}'),
|
||||
('\u{11915}', '\u{11916}'),
|
||||
('\u{11918}', '\u{11935}'),
|
||||
('\u{11937}', '\u{11938}'),
|
||||
('\u{1193b}', '\u{11943}'),
|
||||
('\u{11950}', '\u{11959}'),
|
||||
('𑦠', '𑦧'),
|
||||
('𑦪', '\u{119d7}'),
|
||||
('\u{119da}', '𑧡'),
|
||||
('𑧣', '𑧤'),
|
||||
('𑨀', '\u{11a3e}'),
|
||||
('\u{11a47}', '\u{11a47}'),
|
||||
('𑩐', '\u{11a99}'),
|
||||
|
@ -590,6 +603,7 @@ pub const PERL_WORD: &'static [(char, char)] = &[
|
|||
('𑶓', '𑶘'),
|
||||
('𑶠', '𑶩'),
|
||||
('𑻠', '𑻶'),
|
||||
('\u{11fb0}', '\u{11fb0}'),
|
||||
('𒀀', '𒎙'),
|
||||
('𒐀', '𒑮'),
|
||||
('𒒀', '𒕃'),
|
||||
|
@ -606,16 +620,18 @@ pub const PERL_WORD: &'static [(char, char)] = &[
|
|||
('𖭣', '𖭷'),
|
||||
('𖭽', '𖮏'),
|
||||
('𖹀', '𖹿'),
|
||||
('𖼀', '\u{16f4a}'),
|
||||
('\u{16f4f}', '\u{16f87}'),
|
||||
('𖼀', '𖽊'),
|
||||
('\u{16f4f}', '𖾇'),
|
||||
('\u{16f8f}', '𖾟'),
|
||||
('𖿠', '𖿡'),
|
||||
('\u{16fe3}', '\u{16fe3}'),
|
||||
('𗀀', '\u{187f7}'),
|
||||
('𘠀', '𘫲'),
|
||||
('𖿣', '\u{16fe4}'),
|
||||
('\u{16ff0}', '\u{16ff1}'),
|
||||
('𗀀', '𘟷'),
|
||||
('𘠀', '\u{18cd5}'),
|
||||
('\u{18d00}', '\u{18d08}'),
|
||||
('𛀀', '𛄞'),
|
||||
('\u{1b150}', '\u{1b152}'),
|
||||
('\u{1b164}', '\u{1b167}'),
|
||||
('𛅐', '𛅒'),
|
||||
('𛅤', '𛅧'),
|
||||
('𛅰', '𛋻'),
|
||||
('𛰀', '𛱪'),
|
||||
('𛱰', '𛱼'),
|
||||
|
@ -670,14 +686,14 @@ pub const PERL_WORD: &'static [(char, char)] = &[
|
|||
('\u{1e01b}', '\u{1e021}'),
|
||||
('\u{1e023}', '\u{1e024}'),
|
||||
('\u{1e026}', '\u{1e02a}'),
|
||||
('\u{1e100}', '\u{1e12c}'),
|
||||
('\u{1e130}', '\u{1e13d}'),
|
||||
('\u{1e140}', '\u{1e149}'),
|
||||
('\u{1e14e}', '\u{1e14e}'),
|
||||
('\u{1e2c0}', '\u{1e2f9}'),
|
||||
('𞄀', '𞄬'),
|
||||
('\u{1e130}', '𞄽'),
|
||||
('𞅀', '𞅉'),
|
||||
('𞅎', '𞅎'),
|
||||
('𞋀', '𞋹'),
|
||||
('𞠀', '𞣄'),
|
||||
('\u{1e8d0}', '\u{1e8d6}'),
|
||||
('𞤀', '\u{1e94b}'),
|
||||
('𞤀', '𞥋'),
|
||||
('𞥐', '𞥙'),
|
||||
('𞸀', '𞸃'),
|
||||
('𞸅', '𞸟'),
|
||||
|
@ -715,11 +731,13 @@ pub const PERL_WORD: &'static [(char, char)] = &[
|
|||
('🄰', '🅉'),
|
||||
('🅐', '🅩'),
|
||||
('🅰', '🆉'),
|
||||
('𠀀', '𪛖'),
|
||||
('\u{1fbf0}', '\u{1fbf9}'),
|
||||
('𠀀', '\u{2a6dd}'),
|
||||
('𪜀', '𫜴'),
|
||||
('𫝀', '𫠝'),
|
||||
('𫠠', '𬺡'),
|
||||
('𬺰', '𮯠'),
|
||||
('丽', '𪘀'),
|
||||
('\u{30000}', '\u{3134a}'),
|
||||
('\u{e0100}', '\u{e01ef}'),
|
||||
];
|
||||
Diff not shown because the file is too large.
@ -1,8 +1,10 @@
// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
//
// ucd-generate property-names /tmp/ucd/12.1.0/
// ucd-generate property-names ucd-13.0.0
//
// ucd-generate is available on crates.io.
// Unicode version: 13.0.0.
//
// ucd-generate 0.2.8 is available on crates.io.

pub const PROPERTY_NAMES: &'static [(&'static str, &'static str)] = &[
|
||||
("age", "Age"),
|
||||
|
@ -47,7 +49,9 @@ pub const PROPERTY_NAMES: &'static [(&'static str, &'static str)] = &[
|
|||
("cjkirgkpsource", "kIRG_KPSource"),
|
||||
("cjkirgksource", "kIRG_KSource"),
|
||||
("cjkirgmsource", "kIRG_MSource"),
|
||||
("cjkirgssource", "kIRG_SSource"),
|
||||
("cjkirgtsource", "kIRG_TSource"),
|
||||
("cjkirguksource", "kIRG_UKSource"),
|
||||
("cjkirgusource", "kIRG_USource"),
|
||||
("cjkirgvsource", "kIRG_VSource"),
|
||||
("cjkothernumeric", "kOtherNumeric"),
|
||||
|
@ -74,11 +78,15 @@ pub const PROPERTY_NAMES: &'static [(&'static str, &'static str)] = &[
|
|||
("dt", "Decomposition_Type"),
|
||||
("ea", "East_Asian_Width"),
|
||||
("eastasianwidth", "East_Asian_Width"),
|
||||
("ebase", "Emoji_Modifier_Base"),
|
||||
("ecomp", "Emoji_Component"),
|
||||
("emod", "Emoji_Modifier"),
|
||||
("emoji", "Emoji"),
|
||||
("emojicomponent", "Emoji_Component"),
|
||||
("emojimodifier", "Emoji_Modifier"),
|
||||
("emojimodifierbase", "Emoji_Modifier_Base"),
|
||||
("emojipresentation", "Emoji_Presentation"),
|
||||
("epres", "Emoji_Presentation"),
|
||||
("equideo", "Equivalent_Unified_Ideograph"),
|
||||
("equivalentunifiedideograph", "Equivalent_Unified_Ideograph"),
|
||||
("expandsonnfc", "Expands_On_NFC"),
|
||||
|
@ -88,6 +96,7 @@ pub const PROPERTY_NAMES: &'static [(&'static str, &'static str)] = &[
|
|||
("ext", "Extender"),
|
||||
("extendedpictographic", "Extended_Pictographic"),
|
||||
("extender", "Extender"),
|
||||
("extpict", "Extended_Pictographic"),
|
||||
("fcnfkc", "FC_NFKC_Closure"),
|
||||
("fcnfkcclosure", "FC_NFKC_Closure"),
|
||||
("fullcompositionexclusion", "Full_Composition_Exclusion"),
|
||||
|
@ -138,7 +147,9 @@ pub const PROPERTY_NAMES: &'static [(&'static str, &'static str)] = &[
|
|||
("kirgkpsource", "kIRG_KPSource"),
|
||||
("kirgksource", "kIRG_KSource"),
|
||||
("kirgmsource", "kIRG_MSource"),
|
||||
("kirgssource", "kIRG_SSource"),
|
||||
("kirgtsource", "kIRG_TSource"),
|
||||
("kirguksource", "kIRG_UKSource"),
|
||||
("kirgusource", "kIRG_USource"),
|
||||
("kirgvsource", "kIRG_VSource"),
|
||||
("kothernumeric", "kOtherNumeric"),
|
||||
@ -1,8 +1,10 @@
// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
//
// ucd-generate property-values /tmp/ucd/12.1.0/ --include gc,script,scx,age,gcb,wb,sb
// ucd-generate property-values ucd-13.0.0 --include gc,script,scx,age,gcb,wb,sb
//
// ucd-generate is available on crates.io.
// Unicode version: 13.0.0.
//
// ucd-generate 0.2.8 is available on crates.io.

pub const PROPERTY_VALUES: &'static [(
|
||||
&'static str,
|
||||
|
@ -16,6 +18,7 @@ pub const PROPERTY_VALUES: &'static [(
|
|||
("11.0", "V11_0"),
|
||||
("12.0", "V12_0"),
|
||||
("12.1", "V12_1"),
|
||||
("13.0", "V13_0"),
|
||||
("2.0", "V2_0"),
|
||||
("2.1", "V2_1"),
|
||||
("3.0", "V3_0"),
|
||||
|
@ -40,6 +43,7 @@ pub const PROPERTY_VALUES: &'static [(
|
|||
("v110", "V11_0"),
|
||||
("v120", "V12_0"),
|
||||
("v121", "V12_1"),
|
||||
("v130", "V13_0"),
|
||||
("v20", "V2_0"),
|
||||
("v21", "V2_1"),
|
||||
("v30", "V3_0"),
|
||||
|
@ -224,6 +228,8 @@ pub const PROPERTY_VALUES: &'static [(
|
|||
("cham", "Cham"),
|
||||
("cher", "Cherokee"),
|
||||
("cherokee", "Cherokee"),
|
||||
("chorasmian", "Chorasmian"),
|
||||
("chrs", "Chorasmian"),
|
||||
("common", "Common"),
|
||||
("copt", "Coptic"),
|
||||
("coptic", "Coptic"),
|
||||
|
@ -235,6 +241,8 @@ pub const PROPERTY_VALUES: &'static [(
|
|||
("deseret", "Deseret"),
|
||||
("deva", "Devanagari"),
|
||||
("devanagari", "Devanagari"),
|
||||
("diak", "Dives_Akuru"),
|
||||
("divesakuru", "Dives_Akuru"),
|
||||
("dogr", "Dogra"),
|
||||
("dogra", "Dogra"),
|
||||
("dsrt", "Deseret"),
|
||||
|
@ -299,11 +307,13 @@ pub const PROPERTY_VALUES: &'static [(
|
|||
("kayahli", "Kayah_Li"),
|
||||
("khar", "Kharoshthi"),
|
||||
("kharoshthi", "Kharoshthi"),
|
||||
("khitansmallscript", "Khitan_Small_Script"),
|
||||
("khmer", "Khmer"),
|
||||
("khmr", "Khmer"),
|
||||
("khoj", "Khojki"),
|
||||
("khojki", "Khojki"),
|
||||
("khudawadi", "Khudawadi"),
|
||||
("kits", "Khitan_Small_Script"),
|
||||
("knda", "Kannada"),
|
||||
("kthi", "Kaithi"),
|
||||
("lana", "Tai_Tham"),
|
||||
|
@ -477,6 +487,8 @@ pub const PROPERTY_VALUES: &'static [(
|
|||
("wcho", "Wancho"),
|
||||
("xpeo", "Old_Persian"),
|
||||
("xsux", "Cuneiform"),
|
||||
("yezi", "Yezidi"),
|
||||
("yezidi", "Yezidi"),
|
||||
("yi", "Yi"),
|
||||
("yiii", "Yi"),
|
||||
("zanabazarsquare", "Zanabazar_Square"),
|
||||
|
@ -533,6 +545,8 @@ pub const PROPERTY_VALUES: &'static [(
|
|||
("cham", "Cham"),
|
||||
("cher", "Cherokee"),
|
||||
("cherokee", "Cherokee"),
|
||||
("chorasmian", "Chorasmian"),
|
||||
("chrs", "Chorasmian"),
|
||||
("common", "Common"),
|
||||
("copt", "Coptic"),
|
||||
("coptic", "Coptic"),
|
||||
|
@ -544,6 +558,8 @@ pub const PROPERTY_VALUES: &'static [(
|
|||
("deseret", "Deseret"),
|
||||
("deva", "Devanagari"),
|
||||
("devanagari", "Devanagari"),
|
||||
("diak", "Dives_Akuru"),
|
||||
("divesakuru", "Dives_Akuru"),
|
||||
("dogr", "Dogra"),
|
||||
("dogra", "Dogra"),
|
||||
("dsrt", "Deseret"),
|
||||
|
@ -608,11 +624,13 @@ pub const PROPERTY_VALUES: &'static [(
|
|||
("kayahli", "Kayah_Li"),
|
||||
("khar", "Kharoshthi"),
|
||||
("kharoshthi", "Kharoshthi"),
|
||||
("khitansmallscript", "Khitan_Small_Script"),
|
||||
("khmer", "Khmer"),
|
||||
("khmr", "Khmer"),
|
||||
("khoj", "Khojki"),
|
||||
("khojki", "Khojki"),
|
||||
("khudawadi", "Khudawadi"),
|
||||
("kits", "Khitan_Small_Script"),
|
||||
("knda", "Kannada"),
|
||||
("kthi", "Kaithi"),
|
||||
("lana", "Tai_Tham"),
|
||||
|
@ -786,6 +804,8 @@ pub const PROPERTY_VALUES: &'static [(
|
|||
("wcho", "Wancho"),
|
||||
("xpeo", "Old_Persian"),
|
||||
("xsux", "Cuneiform"),
|
||||
("yezi", "Yezidi"),
|
||||
("yezidi", "Yezidi"),
|
||||
("yi", "Yi"),
|
||||
("yiii", "Yi"),
|
||||
("zanabazarsquare", "Zanabazar_Square"),
|
||||
@ -1,8 +1,10 @@
// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
//
// ucd-generate script /tmp/ucd/12.1.0/ --chars
// ucd-generate script ucd-13.0.0 --chars
//
// ucd-generate is available on crates.io.
// Unicode version: 13.0.0.
//
// ucd-generate 0.2.8 is available on crates.io.

pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[
|
||||
("Adlam", ADLAM),
|
||||
|
@ -28,6 +30,7 @@ pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[
|
|||
("Chakma", CHAKMA),
|
||||
("Cham", CHAM),
|
||||
("Cherokee", CHEROKEE),
|
||||
("Chorasmian", CHORASMIAN),
|
||||
("Common", COMMON),
|
||||
("Coptic", COPTIC),
|
||||
("Cuneiform", CUNEIFORM),
|
||||
|
@ -35,6 +38,7 @@ pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[
|
|||
("Cyrillic", CYRILLIC),
|
||||
("Deseret", DESERET),
|
||||
("Devanagari", DEVANAGARI),
|
||||
("Dives_Akuru", DIVES_AKURU),
|
||||
("Dogra", DOGRA),
|
||||
("Duployan", DUPLOYAN),
|
||||
("Egyptian_Hieroglyphs", EGYPTIAN_HIEROGLYPHS),
|
||||
|
@ -66,6 +70,7 @@ pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[
|
|||
("Katakana", KATAKANA),
|
||||
("Kayah_Li", KAYAH_LI),
|
||||
("Kharoshthi", KHAROSHTHI),
|
||||
("Khitan_Small_Script", KHITAN_SMALL_SCRIPT),
|
||||
("Khmer", KHMER),
|
||||
("Khojki", KHOJKI),
|
||||
("Khudawadi", KHUDAWADI),
|
||||
|
@ -155,12 +160,13 @@ pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[
|
|||
("Vai", VAI),
|
||||
("Wancho", WANCHO),
|
||||
("Warang_Citi", WARANG_CITI),
|
||||
("Yezidi", YEZIDI),
|
||||
("Yi", YI),
|
||||
("Zanabazar_Square", ZANABAZAR_SQUARE),
|
||||
];
|
||||
|
||||
pub const ADLAM: &'static [(char, char)] =
|
||||
&[('𞤀', '\u{1e94b}'), ('𞥐', '𞥙'), ('𞥞', '𞥟')];
|
||||
&[('𞤀', '𞥋'), ('𞥐', '𞥙'), ('𞥞', '𞥟')];
|
||||
|
||||
pub const AHOM: &'static [(char, char)] =
|
||||
&[('𑜀', '𑜚'), ('\u{1171d}', '\u{1172b}'), ('𑜰', '𑜿')];
|
||||
|
@ -180,7 +186,7 @@ pub const ARABIC: &'static [(char, char)] = &[
|
|||
('۞', 'ۿ'),
|
||||
('ݐ', 'ݿ'),
|
||||
('ࢠ', 'ࢴ'),
|
||||
('ࢶ', 'ࢽ'),
|
||||
('ࢶ', '\u{8c7}'),
|
||||
('\u{8d3}', '\u{8e1}'),
|
||||
('\u{8e3}', '\u{8ff}'),
|
||||
('ﭐ', '﯁'),
|
||||
|
@ -228,13 +234,11 @@ pub const ARABIC: &'static [(char, char)] = &[
|
|||
];
|
||||
|
||||
pub const ARMENIAN: &'static [(char, char)] =
|
||||
&[('Ա', 'Ֆ'), ('ՙ', 'ֈ'), ('֊', '֊'), ('֍', '֏'), ('ﬓ', 'ﬗ')];
|
||||
&[('Ա', 'Ֆ'), ('ՙ', '֊'), ('֍', '֏'), ('ﬓ', 'ﬗ')];
|
||||
|
||||
pub const AVESTAN: &'static [(char, char)] =
|
||||
&[('𐬀', '𐬵'), ('𐬹', '𐬿')];
|
||||
pub const AVESTAN: &'static [(char, char)] = &[('𐬀', '𐬵'), ('𐬹', '𐬿')];
|
||||
|
||||
pub const BALINESE: &'static [(char, char)] =
|
||||
&[('\u{1b00}', 'ᭋ'), ('᭐', '᭼')];
|
||||
pub const BALINESE: &'static [(char, char)] = &[('\u{1b00}', 'ᭋ'), ('᭐', '᭼')];
|
||||
|
||||
pub const BAMUM: &'static [(char, char)] = &[('ꚠ', '꛷'), ('𖠀', '𖨸')];
|
||||
|
||||
|
@ -260,23 +264,18 @@ pub const BENGALI: &'static [(char, char)] = &[
|
|||
('০', '\u{9fe}'),
|
||||
];
|
||||
|
||||
pub const BHAIKSUKI: &'static [(char, char)] = &[
|
||||
('𑰀', '𑰈'),
|
||||
('𑰊', '\u{11c36}'),
|
||||
('\u{11c38}', '𑱅'),
|
||||
('𑱐', '𑱬'),
|
||||
];
|
||||
pub const BHAIKSUKI: &'static [(char, char)] =
|
||||
&[('𑰀', '𑰈'), ('𑰊', '\u{11c36}'), ('\u{11c38}', '𑱅'), ('𑱐', '𑱬')];
|
||||
|
||||
pub const BOPOMOFO: &'static [(char, char)] =
|
||||
&[('˪', '˫'), ('ㄅ', 'ㄯ'), ('ㆠ', 'ㆺ')];
|
||||
&[('˪', '˫'), ('ㄅ', 'ㄯ'), ('ㆠ', '\u{31bf}')];
|
||||
|
||||
pub const BRAHMI: &'static [(char, char)] =
|
||||
&[('𑀀', '𑁍'), ('𑁒', '𑁯'), ('\u{1107f}', '\u{1107f}')];
|
||||
|
||||
pub const BRAILLE: &'static [(char, char)] = &[('⠀', '⣿')];
|
||||
|
||||
pub const BUGINESE: &'static [(char, char)] =
|
||||
&[('ᨀ', '\u{1a1b}'), ('᨞', '᨟')];
|
||||
pub const BUGINESE: &'static [(char, char)] = &[('ᨀ', '\u{1a1b}'), ('᨞', '᨟')];
|
||||
|
||||
pub const BUHID: &'static [(char, char)] = &[('ᝀ', '\u{1753}')];
|
||||
|
||||
|
@ -289,7 +288,7 @@ pub const CAUCASIAN_ALBANIAN: &'static [(char, char)] =
|
|||
&[('𐔰', '𐕣'), ('𐕯', '𐕯')];
|
||||
|
||||
pub const CHAKMA: &'static [(char, char)] =
|
||||
&[('\u{11100}', '\u{11134}'), ('𑄶', '𑅆')];
|
||||
&[('\u{11100}', '\u{11134}'), ('𑄶', '\u{11147}')];
|
||||
|
||||
pub const CHAM: &'static [(char, char)] =
|
||||
&[('ꨀ', '\u{aa36}'), ('ꩀ', 'ꩍ'), ('꩐', '꩙'), ('꩜', '꩟')];
|
||||
|
@ -297,6 +296,8 @@ pub const CHAM: &'static [(char, char)] =
|
|||
pub const CHEROKEE: &'static [(char, char)] =
|
||||
&[('Ꭰ', 'Ᏽ'), ('ᏸ', 'ᏽ'), ('ꭰ', 'ꮿ')];
|
||||
|
||||
pub const CHORASMIAN: &'static [(char, char)] = &[('\u{10fb0}', '\u{10fcb}')];
|
||||
|
||||
pub const COMMON: &'static [(char, char)] = &[
|
||||
('\u{0}', '@'),
|
||||
('[', '`'),
|
||||
|
@ -312,7 +313,6 @@ pub const COMMON: &'static [(char, char)] = &[
|
|||
(';', ';'),
|
||||
('΅', '΅'),
|
||||
('·', '·'),
|
||||
('։', '։'),
|
||||
('\u{605}', '\u{605}'),
|
||||
('،', '،'),
|
||||
('؛', '؛'),
|
||||
|
@ -333,7 +333,7 @@ pub const COMMON: &'static [(char, char)] = &[
|
|||
('ᳩ', 'ᳬ'),
|
||||
('ᳮ', 'ᳳ'),
|
||||
('ᳵ', '᳷'),
|
||||
('\u{1cfa}', '\u{1cfa}'),
|
||||
('ᳺ', 'ᳺ'),
|
||||
('\u{2000}', '\u{200b}'),
|
||||
('\u{200e}', '\u{2064}'),
|
||||
('\u{2066}', '⁰'),
|
||||
|
@ -351,8 +351,8 @@ pub const COMMON: &'static [(char, char)] = &[
|
|||
('①', '⟿'),
|
||||
('⤀', '⭳'),
|
||||
('⭶', '⮕'),
|
||||
('⮘', '\u{2bff}'),
|
||||
('⸀', '\u{2e4f}'),
|
||||
('\u{2b97}', '⯿'),
|
||||
('⸀', '\u{2e52}'),
|
||||
('⿰', '⿻'),
|
||||
('\u{3000}', '〄'),
|
||||
('〆', '〆'),
|
||||
|
@ -366,7 +366,7 @@ pub const COMMON: &'static [(char, char)] = &[
|
|||
('㇀', '㇣'),
|
||||
('㈠', '㉟'),
|
||||
('㉿', '㋏'),
|
||||
('\u{32ff}', '\u{32ff}'),
|
||||
('㋿', '㋿'),
|
||||
('㍘', '㏿'),
|
||||
('䷀', '䷿'),
|
||||
('꜀', '꜡'),
|
||||
|
@ -375,6 +375,7 @@ pub const COMMON: &'static [(char, char)] = &[
|
|||
('꤮', '꤮'),
|
||||
('ꧏ', 'ꧏ'),
|
||||
('꭛', '꭛'),
|
||||
('\u{ab6a}', '\u{ab6b}'),
|
||||
('﴾', '﴿'),
|
||||
('︐', '︙'),
|
||||
('︰', '﹒'),
|
||||
|
@ -392,10 +393,10 @@ pub const COMMON: &'static [(char, char)] = &[
|
|||
('𐄀', '𐄂'),
|
||||
('𐄇', '𐄳'),
|
||||
('𐄷', '𐄿'),
|
||||
('𐆐', '𐆛'),
|
||||
('𐆐', '\u{1019c}'),
|
||||
('𐇐', '𐇼'),
|
||||
('𐋡', '𐋻'),
|
||||
('\u{16fe2}', '\u{16fe3}'),
|
||||
('𖿢', '𖿣'),
|
||||
('\u{1bca0}', '\u{1bca3}'),
|
||||
('𝀀', '𝃵'),
|
||||
('𝄀', '𝄦'),
|
||||
|
@ -429,45 +430,46 @@ pub const COMMON: &'static [(char, char)] = &[
|
|||
('𝚨', '𝟋'),
|
||||
('𝟎', '𝟿'),
|
||||
('𞱱', '𞲴'),
|
||||
('\u{1ed01}', '\u{1ed3d}'),
|
||||
('𞴁', '𞴽'),
|
||||
('🀀', '🀫'),
|
||||
('🀰', '🂓'),
|
||||
('🂠', '🂮'),
|
||||
('🂱', '🂿'),
|
||||
('🃁', '🃏'),
|
||||
('🃑', '🃵'),
|
||||
('🄀', '🄌'),
|
||||
('🄐', '\u{1f16c}'),
|
||||
('🅰', '🆬'),
|
||||
('🄀', '\u{1f1ad}'),
|
||||
('🇦', '🇿'),
|
||||
('🈁', '🈂'),
|
||||
('🈐', '🈻'),
|
||||
('🉀', '🉈'),
|
||||
('🉐', '🉑'),
|
||||
('🉠', '🉥'),
|
||||
('🌀', '\u{1f6d5}'),
|
||||
('🌀', '\u{1f6d7}'),
|
||||
('🛠', '🛬'),
|
||||
('🛰', '\u{1f6fa}'),
|
||||
('🛰', '\u{1f6fc}'),
|
||||
('🜀', '🝳'),
|
||||
('🞀', '🟘'),
|
||||
('\u{1f7e0}', '\u{1f7eb}'),
|
||||
('🟠', '🟫'),
|
||||
('🠀', '🠋'),
|
||||
('🠐', '🡇'),
|
||||
('🡐', '🡙'),
|
||||
('🡠', '🢇'),
|
||||
('🢐', '🢭'),
|
||||
('🤀', '🤋'),
|
||||
('\u{1f90d}', '\u{1f971}'),
|
||||
('🥳', '🥶'),
|
||||
('🥺', '🦢'),
|
||||
('\u{1f9a5}', '\u{1f9aa}'),
|
||||
('\u{1f9ae}', '\u{1f9ca}'),
|
||||
('\u{1f9cd}', '\u{1fa53}'),
|
||||
('\u{1f8b0}', '\u{1f8b1}'),
|
||||
('🤀', '\u{1f978}'),
|
||||
('🥺', '\u{1f9cb}'),
|
||||
('🧍', '🩓'),
|
||||
('🩠', '🩭'),
|
||||
('\u{1fa70}', '\u{1fa73}'),
|
||||
('\u{1fa78}', '\u{1fa7a}'),
|
||||
('\u{1fa80}', '\u{1fa82}'),
|
||||
('\u{1fa90}', '\u{1fa95}'),
|
||||
('🩰', '\u{1fa74}'),
|
||||
('🩸', '🩺'),
|
||||
('🪀', '\u{1fa86}'),
|
||||
('🪐', '\u{1faa8}'),
|
||||
('\u{1fab0}', '\u{1fab6}'),
|
||||
('\u{1fac0}', '\u{1fac2}'),
|
||||
('\u{1fad0}', '\u{1fad6}'),
|
||||
('\u{1fb00}', '\u{1fb92}'),
|
||||
('\u{1fb94}', '\u{1fbca}'),
|
||||
('\u{1fbf0}', '\u{1fbf9}'),
|
||||
('\u{e0001}', '\u{e0001}'),
|
||||
('\u{e0020}', '\u{e007f}'),
|
||||
];
|
||||
|
@ -478,14 +480,8 @@ pub const COPTIC: &'static [(char, char)] =
|
|||
pub const CUNEIFORM: &'static [(char, char)] =
|
||||
&[('𒀀', '𒎙'), ('𒐀', '𒑮'), ('𒑰', '𒑴'), ('𒒀', '𒕃')];
|
||||
|
||||
pub const CYPRIOT: &'static [(char, char)] = &[
|
||||
('𐠀', '𐠅'),
|
||||
('𐠈', '𐠈'),
|
||||
('𐠊', '𐠵'),
|
||||
('𐠷', '𐠸'),
|
||||
('𐠼', '𐠼'),
|
||||
('𐠿', '𐠿'),
|
||||
];
|
||||
pub const CYPRIOT: &'static [(char, char)] =
|
||||
&[('𐠀', '𐠅'), ('𐠈', '𐠈'), ('𐠊', '𐠵'), ('𐠷', '𐠸'), ('𐠼', '𐠼'), ('𐠿', '𐠿')];
|
||||
|
||||
pub const CYRILLIC: &'static [(char, char)] = &[
|
||||
('Ѐ', '\u{484}'),
|
||||
|
@ -507,22 +503,28 @@ pub const DEVANAGARI: &'static [(char, char)] = &[
|
|||
('\u{a8e0}', '\u{a8ff}'),
|
||||
];
|
||||
|
||||
pub const DIVES_AKURU: &'static [(char, char)] = &[
|
||||
('\u{11900}', '\u{11906}'),
|
||||
('\u{11909}', '\u{11909}'),
|
||||
('\u{1190c}', '\u{11913}'),
|
||||
('\u{11915}', '\u{11916}'),
|
||||
('\u{11918}', '\u{11935}'),
|
||||
('\u{11937}', '\u{11938}'),
|
||||
('\u{1193b}', '\u{11946}'),
|
||||
('\u{11950}', '\u{11959}'),
|
||||
];
|
||||
|
||||
pub const DOGRA: &'static [(char, char)] = &[('𑠀', '𑠻')];
|
||||
|
||||
pub const DUPLOYAN: &'static [(char, char)] = &[
|
||||
('𛰀', '𛱪'),
|
||||
('𛱰', '𛱼'),
|
||||
('𛲀', '𛲈'),
|
||||
('𛲐', '𛲙'),
|
||||
('𛲜', '𛲟'),
|
||||
];
|
||||
pub const DUPLOYAN: &'static [(char, char)] =
|
||||
&[('𛰀', '𛱪'), ('𛱰', '𛱼'), ('𛲀', '𛲈'), ('𛲐', '𛲙'), ('𛲜', '𛲟')];
|
||||
|
||||
pub const EGYPTIAN_HIEROGLYPHS: &'static [(char, char)] =
|
||||
&[('𓀀', '𓐮'), ('\u{13430}', '\u{13438}')];
|
||||
|
||||
pub const ELBASAN: &'static [(char, char)] = &[('𐔀', '𐔧')];
|
||||
|
||||
pub const ELYMAIC: &'static [(char, char)] = &[('\u{10fe0}', '\u{10ff6}')];
|
||||
pub const ELYMAIC: &'static [(char, char)] = &[('𐿠', '𐿶')];
|
||||
|
||||
pub const ETHIOPIC: &'static [(char, char)] = &[
|
||||
('ሀ', 'ቈ'),
|
||||
|
@ -694,16 +696,18 @@ pub const HAN: &'static [(char, char)] = &[
|
|||
('〇', '〇'),
|
||||
('〡', '〩'),
|
||||
('〸', '〻'),
|
||||
('㐀', '䶵'),
|
||||
('一', '鿯'),
|
||||
('㐀', '\u{4dbf}'),
|
||||
('一', '\u{9ffc}'),
|
||||
('豈', '舘'),
|
||||
('並', '龎'),
|
||||
('𠀀', '𪛖'),
|
||||
('\u{16ff0}', '\u{16ff1}'),
|
||||
('𠀀', '\u{2a6dd}'),
|
||||
('𪜀', '𫜴'),
|
||||
('𫝀', '𫠝'),
|
||||
('𫠠', '𬺡'),
|
||||
('𬺰', '𮯠'),
|
||||
('丽', '𪘀'),
|
||||
('\u{30000}', '\u{3134a}'),
|
||||
];
|
||||
|
||||
pub const HANGUL: &'static [(char, char)] = &[
|
||||
|
@ -743,13 +747,8 @@ pub const HEBREW: &'static [(char, char)] = &[
|
|||
('צּ', 'ﭏ'),
|
||||
];
|
||||
|
||||
pub const HIRAGANA: &'static [(char, char)] = &[
|
||||
('ぁ', 'ゖ'),
|
||||
('ゝ', 'ゟ'),
|
||||
('𛀁', '𛄞'),
|
||||
('\u{1b150}', '\u{1b152}'),
|
||||
('🈀', '🈀'),
|
||||
];
|
||||
pub const HIRAGANA: &'static [(char, char)] =
|
||||
&[('ぁ', 'ゖ'), ('ゝ', 'ゟ'), ('𛀁', '𛄞'), ('𛅐', '𛅒'), ('🈀', '🈀')];
|
||||
|
||||
pub const IMPERIAL_ARAMAIC: &'static [(char, char)] =
|
||||
&[('𐡀', '𐡕'), ('𐡗', '𐡟')];
|
||||
|
@ -760,7 +759,7 @@ pub const INHERITED: &'static [(char, char)] = &[
|
|||
('\u{64b}', '\u{655}'),
|
||||
('\u{670}', '\u{670}'),
|
||||
('\u{951}', '\u{954}'),
|
||||
('\u{1ab0}', '\u{1abe}'),
|
||||
('\u{1ab0}', '\u{1ac0}'),
|
||||
('\u{1cd0}', '\u{1cd2}'),
|
||||
('\u{1cd4}', '\u{1ce0}'),
|
||||
('\u{1ce2}', '\u{1ce8}'),
|
||||
|
@ -822,11 +821,10 @@ pub const KATAKANA: &'static [(char, char)] = &[
|
|||
('ヲ', 'ッ'),
|
||||
('ア', 'ン'),
|
||||
('𛀀', '𛀀'),
|
||||
('\u{1b164}', '\u{1b167}'),
|
||||
('𛅤', '𛅧'),
|
||||
];
|
||||
|
||||
pub const KAYAH_LI: &'static [(char, char)] =
|
||||
&[('꤀', '\u{a92d}'), ('꤯', '꤯')];
|
||||
pub const KAYAH_LI: &'static [(char, char)] = &[('꤀', '\u{a92d}'), ('꤯', '꤯')];
|
||||
|
||||
pub const KHAROSHTHI: &'static [(char, char)] = &[
|
||||
('𐨀', '\u{10a03}'),
|
||||
|
@ -839,11 +837,13 @@ pub const KHAROSHTHI: &'static [(char, char)] = &[
|
|||
('𐩐', '𐩘'),
|
||||
];
|
||||
|
||||
pub const KHITAN_SMALL_SCRIPT: &'static [(char, char)] =
|
||||
&[('\u{16fe4}', '\u{16fe4}'), ('\u{18b00}', '\u{18cd5}')];
|
||||
|
||||
pub const KHMER: &'static [(char, char)] =
|
||||
&[('ក', '\u{17dd}'), ('០', '៩'), ('៰', '៹'), ('᧠', '᧿')];
|
||||
|
||||
pub const KHOJKI: &'static [(char, char)] =
|
||||
&[('𑈀', '𑈑'), ('𑈓', '\u{1123e}')];
|
||||
pub const KHOJKI: &'static [(char, char)] = &[('𑈀', '𑈑'), ('𑈓', '\u{1123e}')];
|
||||
|
||||
pub const KHUDAWADI: &'static [(char, char)] =
|
||||
&[('𑊰', '\u{112ea}'), ('𑋰', '𑋹')];
|
||||
|
@ -851,8 +851,8 @@ pub const KHUDAWADI: &'static [(char, char)] =
|
|||
pub const LAO: &'static [(char, char)] = &[
|
||||
('ກ', 'ຂ'),
|
||||
('ຄ', 'ຄ'),
|
||||
('\u{e86}', 'ຊ'),
|
||||
('\u{e8c}', 'ຣ'),
|
||||
('ຆ', 'ຊ'),
|
||||
('ຌ', 'ຣ'),
|
||||
('ລ', 'ລ'),
|
||||
('ວ', 'ຽ'),
|
||||
('ເ', 'ໄ'),
|
||||
|
@ -886,12 +886,12 @@ pub const LATIN: &'static [(char, char)] = &[
|
|||
('Ⅰ', 'ↈ'),
|
||||
('Ⱡ', 'Ɀ'),
|
||||
('Ꜣ', 'ꞇ'),
|
||||
('Ꞌ', '\u{a7bf}'),
|
||||
('\u{a7c2}', '\u{a7c6}'),
|
||||
('ꟷ', 'ꟿ'),
|
||||
('Ꞌ', 'ꞿ'),
|
||||
('Ꟃ', '\u{a7ca}'),
|
||||
('\u{a7f5}', 'ꟿ'),
|
||||
('ꬰ', 'ꭚ'),
|
||||
('ꭜ', 'ꭤ'),
|
||||
('\u{ab66}', '\u{ab67}'),
|
||||
('ꭦ', '\u{ab69}'),
|
||||
('ff', 'st'),
|
||||
('A', 'Z'),
|
||||
('a', 'z'),
|
||||
|
@ -921,20 +921,19 @@ pub const LINEAR_B: &'static [(char, char)] = &[
|
|||
('𐂀', '𐃺'),
|
||||
];
|
||||
|
||||
pub const LISU: &'static [(char, char)] = &[('ꓐ', '꓿')];
|
||||
pub const LISU: &'static [(char, char)] =
|
||||
&[('ꓐ', '꓿'), ('\u{11fb0}', '\u{11fb0}')];
|
||||
|
||||
pub const LYCIAN: &'static [(char, char)] = &[('𐊀', '𐊜')];
|
||||
|
||||
pub const LYDIAN: &'static [(char, char)] =
|
||||
&[('𐤠', '𐤹'), ('𐤿', '𐤿')];
|
||||
pub const LYDIAN: &'static [(char, char)] = &[('𐤠', '𐤹'), ('𐤿', '𐤿')];
|
||||
|
||||
pub const MAHAJANI: &'static [(char, char)] = &[('𑅐', '𑅶')];
|
||||
|
||||
pub const MAKASAR: &'static [(char, char)] = &[('𑻠', '𑻸')];
|
||||
|
||||
pub const MALAYALAM: &'static [(char, char)] = &[
|
||||
('\u{d00}', 'ഃ'),
|
||||
('അ', 'ഌ'),
|
||||
('\u{d00}', 'ഌ'),
|
||||
('എ', 'ഐ'),
|
||||
('ഒ', '\u{d44}'),
|
||||
('െ', 'ൈ'),
|
||||
|
@ -943,8 +942,7 @@ pub const MALAYALAM: &'static [(char, char)] = &[
|
|||
('൦', 'ൿ'),
|
||||
];
|
||||
|
||||
pub const MANDAIC: &'static [(char, char)] =
|
||||
&[('ࡀ', '\u{85b}'), ('࡞', '࡞')];
|
||||
pub const MANDAIC: &'static [(char, char)] = &[('ࡀ', '\u{85b}'), ('࡞', '࡞')];
|
||||
|
||||
pub const MANICHAEAN: &'static [(char, char)] =
|
||||
&[('𐫀', '\u{10ae6}'), ('𐫫', '𐫶')];
|
||||
|
@ -975,14 +973,10 @@ pub const MEROITIC_CURSIVE: &'static [(char, char)] =
|
|||
|
||||
pub const MEROITIC_HIEROGLYPHS: &'static [(char, char)] = &[('𐦀', '𐦟')];
|
||||
|
||||
pub const MIAO: &'static [(char, char)] = &[
|
||||
('𖼀', '\u{16f4a}'),
|
||||
('\u{16f4f}', '\u{16f87}'),
|
||||
('\u{16f8f}', '𖾟'),
|
||||
];
|
||||
pub const MIAO: &'static [(char, char)] =
|
||||
&[('𖼀', '𖽊'), ('\u{16f4f}', '𖾇'), ('\u{16f8f}', '𖾟')];
|
||||
|
||||
pub const MODI: &'static [(char, char)] =
|
||||
&[('𑘀', '𑙄'), ('𑙐', '𑙙')];
|
||||
pub const MODI: &'static [(char, char)] = &[('𑘀', '𑙄'), ('𑙐', '𑙙')];
|
||||
|
||||
pub const MONGOLIAN: &'static [(char, char)] = &[
|
||||
('᠀', '᠁'),
|
||||
|
@ -994,46 +988,30 @@ pub const MONGOLIAN: &'static [(char, char)] = &[
|
|||
('𑙠', '𑙬'),
|
||||
];
|
||||
|
||||
pub const MRO: &'static [(char, char)] =
|
||||
&[('𖩀', '𖩞'), ('𖩠', '𖩩'), ('𖩮', '𖩯')];
|
||||
pub const MRO: &'static [(char, char)] = &[('𖩀', '𖩞'), ('𖩠', '𖩩'), ('𖩮', '𖩯')];
|
||||
|
||||
pub const MULTANI: &'static [(char, char)] = &[
|
||||
('𑊀', '𑊆'),
|
||||
('𑊈', '𑊈'),
|
||||
('𑊊', '𑊍'),
|
||||
('𑊏', '𑊝'),
|
||||
('𑊟', '𑊩'),
|
||||
];
|
||||
pub const MULTANI: &'static [(char, char)] =
|
||||
&[('𑊀', '𑊆'), ('𑊈', '𑊈'), ('𑊊', '𑊍'), ('𑊏', '𑊝'), ('𑊟', '𑊩')];
|
||||
|
||||
pub const MYANMAR: &'static [(char, char)] =
|
||||
&[('က', '႟'), ('ꧠ', 'ꧾ'), ('ꩠ', 'ꩿ')];
|
||||
|
||||
pub const NABATAEAN: &'static [(char, char)] =
|
||||
&[('𐢀', '𐢞'), ('𐢧', '𐢯')];
|
||||
pub const NABATAEAN: &'static [(char, char)] = &[('𐢀', '𐢞'), ('𐢧', '𐢯')];
|
||||
|
||||
pub const NANDINAGARI: &'static [(char, char)] = &[
|
||||
('\u{119a0}', '\u{119a7}'),
|
||||
('\u{119aa}', '\u{119d7}'),
|
||||
('\u{119da}', '\u{119e4}'),
|
||||
];
|
||||
pub const NANDINAGARI: &'static [(char, char)] =
|
||||
&[('𑦠', '𑦧'), ('𑦪', '\u{119d7}'), ('\u{119da}', '𑧤')];
|
||||
|
||||
pub const NEW_TAI_LUE: &'static [(char, char)] =
|
||||
&[('ᦀ', 'ᦫ'), ('ᦰ', 'ᧉ'), ('᧐', '᧚'), ('᧞', '᧟')];
|
||||
|
||||
pub const NEWA: &'static [(char, char)] =
|
||||
&[('𑐀', '𑑙'), ('𑑛', '𑑛'), ('𑑝', '\u{1145f}')];
|
||||
pub const NEWA: &'static [(char, char)] = &[('𑐀', '𑑛'), ('𑑝', '\u{11461}')];
|
||||
|
||||
pub const NKO: &'static [(char, char)] = &[('߀', 'ߺ'), ('\u{7fd}', '߿')];
|
||||
|
||||
pub const NUSHU: &'static [(char, char)] =
|
||||
&[('𖿡', '𖿡'), ('𛅰', '𛋻')];
|
||||
pub const NUSHU: &'static [(char, char)] = &[('𖿡', '𖿡'), ('𛅰', '𛋻')];
|
||||
|
||||
pub const NYIAKENG_PUACHUE_HMONG: &'static [(char, char)] = &[
|
||||
('\u{1e100}', '\u{1e12c}'),
|
||||
('\u{1e130}', '\u{1e13d}'),
|
||||
('\u{1e140}', '\u{1e149}'),
|
||||
('\u{1e14e}', '\u{1e14f}'),
|
||||
];
|
||||
pub const NYIAKENG_PUACHUE_HMONG: &'static [(char, char)] =
|
||||
&[('𞄀', '𞄬'), ('\u{1e130}', '𞄽'), ('𞅀', '𞅉'), ('𞅎', '𞅏')];
|
||||
|
||||
pub const OGHAM: &'static [(char, char)] = &[('\u{1680}', '᚜')];
|
||||
|
||||
|
@ -1042,15 +1020,13 @@ pub const OL_CHIKI: &'static [(char, char)] = &[('᱐', '᱿')];
|
|||
pub const OLD_HUNGARIAN: &'static [(char, char)] =
|
||||
&[('𐲀', '𐲲'), ('𐳀', '𐳲'), ('𐳺', '𐳿')];
|
||||
|
||||
pub const OLD_ITALIC: &'static [(char, char)] =
|
||||
&[('𐌀', '𐌣'), ('𐌭', '𐌯')];
|
||||
pub const OLD_ITALIC: &'static [(char, char)] = &[('𐌀', '𐌣'), ('𐌭', '𐌯')];
|
||||
|
||||
pub const OLD_NORTH_ARABIAN: &'static [(char, char)] = &[('𐪀', '𐪟')];
|
||||
|
||||
pub const OLD_PERMIC: &'static [(char, char)] = &[('𐍐', '\u{1037a}')];
|
||||
|
||||
pub const OLD_PERSIAN: &'static [(char, char)] =
|
||||
&[('𐎠', '𐏃'), ('𐏈', '𐏕')];
|
||||
pub const OLD_PERSIAN: &'static [(char, char)] = &[('𐎠', '𐏃'), ('𐏈', '𐏕')];
|
||||
|
||||
pub const OLD_SOGDIAN: &'static [(char, char)] = &[('𐼀', '𐼧')];
|
||||
|
||||
|
@ -1069,25 +1045,18 @@ pub const ORIYA: &'static [(char, char)] = &[
|
|||
('\u{b3c}', '\u{b44}'),
|
||||
('େ', 'ୈ'),
|
||||
('ୋ', '\u{b4d}'),
|
||||
('\u{b56}', '\u{b57}'),
|
||||
('\u{b55}', '\u{b57}'),
|
||||
('ଡ଼', 'ଢ଼'),
|
||||
('ୟ', '\u{b63}'),
|
||||
('୦', '୷'),
|
||||
];
|
||||
|
||||
pub const OSAGE: &'static [(char, char)] =
|
||||
&[('𐒰', '𐓓'), ('𐓘', '𐓻')];
|
||||
pub const OSAGE: &'static [(char, char)] = &[('𐒰', '𐓓'), ('𐓘', '𐓻')];
|
||||
|
||||
pub const OSMANYA: &'static [(char, char)] =
|
||||
&[('𐒀', '𐒝'), ('𐒠', '𐒩')];
|
||||
pub const OSMANYA: &'static [(char, char)] = &[('𐒀', '𐒝'), ('𐒠', '𐒩')];
|
||||
|
||||
pub const PAHAWH_HMONG: &'static [(char, char)] = &[
|
||||
('𖬀', '𖭅'),
|
||||
('𖭐', '𖭙'),
|
||||
('𖭛', '𖭡'),
|
||||
('𖭣', '𖭷'),
|
||||
('𖭽', '𖮏'),
|
||||
];
|
||||
pub const PAHAWH_HMONG: &'static [(char, char)] =
|
||||
&[('𖬀', '𖭅'), ('𖭐', '𖭙'), ('𖭛', '𖭡'), ('𖭣', '𖭷'), ('𖭽', '𖮏')];
|
||||
|
||||
pub const PALMYRENE: &'static [(char, char)] = &[('𐡠', '𐡿')];
|
||||
|
||||
|
@ -1095,8 +1064,7 @@ pub const PAU_CIN_HAU: &'static [(char, char)] = &[('𑫀', '𑫸')];
|
|||
|
||||
pub const PHAGS_PA: &'static [(char, char)] = &[('ꡀ', '꡷')];
|
||||
|
||||
pub const PHOENICIAN: &'static [(char, char)] =
|
||||
&[('𐤀', '𐤛'), ('𐤟', '𐤟')];
|
||||
pub const PHOENICIAN: &'static [(char, char)] = &[('𐤀', '𐤛'), ('𐤟', '𐤟')];
|
||||
|
||||
pub const PSALTER_PAHLAVI: &'static [(char, char)] =
|
||||
&[('𐮀', '𐮑'), ('𐮙', '𐮜'), ('𐮩', '𐮯')];
|
||||
|
@ -1105,28 +1073,23 @@ pub const REJANG: &'static [(char, char)] = &[('ꤰ', '꥓'), ('꥟', '꥟')];
|
|||
|
||||
pub const RUNIC: &'static [(char, char)] = &[('ᚠ', 'ᛪ'), ('ᛮ', 'ᛸ')];
|
||||
|
||||
pub const SAMARITAN: &'static [(char, char)] =
|
||||
&[('ࠀ', '\u{82d}'), ('࠰', '࠾')];
|
||||
pub const SAMARITAN: &'static [(char, char)] = &[('ࠀ', '\u{82d}'), ('࠰', '࠾')];
|
||||
|
||||
pub const SAURASHTRA: &'static [(char, char)] =
|
||||
&[('ꢀ', '\u{a8c5}'), ('꣎', '꣙')];
|
||||
|
||||
pub const SHARADA: &'static [(char, char)] =
|
||||
&[('\u{11180}', '𑇍'), ('𑇐', '𑇟')];
|
||||
pub const SHARADA: &'static [(char, char)] = &[('\u{11180}', '𑇟')];
|
||||
|
||||
pub const SHAVIAN: &'static [(char, char)] = &[('𐑐', '𐑿')];
|
||||
|
||||
pub const SIDDHAM: &'static [(char, char)] =
|
||||
&[('𑖀', '\u{115b5}'), ('𑖸', '\u{115dd}')];
|
||||
|
||||
pub const SIGNWRITING: &'static [(char, char)] = &[
|
||||
('𝠀', '𝪋'),
|
||||
('\u{1da9b}', '\u{1da9f}'),
|
||||
('\u{1daa1}', '\u{1daaf}'),
|
||||
];
|
||||
pub const SIGNWRITING: &'static [(char, char)] =
|
||||
&[('𝠀', '𝪋'), ('\u{1da9b}', '\u{1da9f}'), ('\u{1daa1}', '\u{1daaf}')];
|
||||
|
||||
pub const SINHALA: &'static [(char, char)] = &[
|
||||
('ං', 'ඃ'),
|
||||
('\u{d81}', 'ඃ'),
|
||||
('අ', 'ඖ'),
|
||||
('ක', 'න'),
|
||||
('ඳ', 'ර'),
|
||||
|
@ -1143,21 +1106,19 @@ pub const SINHALA: &'static [(char, char)] = &[
|
|||
|
||||
pub const SOGDIAN: &'static [(char, char)] = &[('𐼰', '𐽙')];
|
||||
|
||||
pub const SORA_SOMPENG: &'static [(char, char)] =
|
||||
&[('𑃐', '𑃨'), ('𑃰', '𑃹')];
|
||||
pub const SORA_SOMPENG: &'static [(char, char)] = &[('𑃐', '𑃨'), ('𑃰', '𑃹')];
|
||||
|
||||
pub const SOYOMBO: &'static [(char, char)] = &[('𑩐', '𑪢')];
|
||||
|
||||
pub const SUNDANESE: &'static [(char, char)] =
|
||||
&[('\u{1b80}', 'ᮿ'), ('᳀', '᳇')];
|
||||
|
||||
pub const SYLOTI_NAGRI: &'static [(char, char)] = &[('ꠀ', '꠫')];
|
||||
pub const SYLOTI_NAGRI: &'static [(char, char)] = &[('ꠀ', '\u{a82c}')];
|
||||
|
||||
pub const SYRIAC: &'static [(char, char)] =
|
||||
&[('܀', '܍'), ('\u{70f}', '\u{74a}'), ('ݍ', 'ݏ'), ('ࡠ', 'ࡪ')];
|
||||
|
||||
pub const TAGALOG: &'static [(char, char)] =
|
||||
&[('ᜀ', 'ᜌ'), ('ᜎ', '\u{1714}')];
|
||||
pub const TAGALOG: &'static [(char, char)] = &[('ᜀ', 'ᜌ'), ('ᜎ', '\u{1714}')];
|
||||
|
||||
pub const TAGBANWA: &'static [(char, char)] =
|
||||
&[('ᝠ', 'ᝬ'), ('ᝮ', 'ᝰ'), ('\u{1772}', '\u{1773}')];
|
||||
|
@ -1172,11 +1133,9 @@ pub const TAI_THAM: &'static [(char, char)] = &[
|
|||
('᪠', '᪭'),
|
||||
];
|
||||
|
||||
pub const TAI_VIET: &'static [(char, char)] =
|
||||
&[('ꪀ', 'ꫂ'), ('ꫛ', '꫟')];
|
||||
pub const TAI_VIET: &'static [(char, char)] = &[('ꪀ', 'ꫂ'), ('ꫛ', '꫟')];
|
||||
|
||||
pub const TAKRI: &'static [(char, char)] =
|
||||
&[('𑚀', '\u{116b8}'), ('𑛀', '𑛉')];
|
||||
pub const TAKRI: &'static [(char, char)] = &[('𑚀', '𑚸'), ('𑛀', '𑛉')];
|
||||
|
||||
pub const TAMIL: &'static [(char, char)] = &[
|
||||
('\u{b82}', 'ஃ'),
|
||||
|
@ -1195,12 +1154,16 @@ pub const TAMIL: &'static [(char, char)] = &[
|
|||
('ௐ', 'ௐ'),
|
||||
('\u{bd7}', '\u{bd7}'),
|
||||
('௦', '௺'),
|
||||
('\u{11fc0}', '\u{11ff1}'),
|
||||
('\u{11fff}', '\u{11fff}'),
|
||||
('𑿀', '𑿱'),
|
||||
('𑿿', '𑿿'),
|
||||
];
|
||||
|
||||
pub const TANGUT: &'static [(char, char)] =
|
||||
&[('𖿠', '𖿠'), ('𗀀', '\u{187f7}'), ('𘠀', '𘫲')];
|
||||
pub const TANGUT: &'static [(char, char)] = &[
|
||||
('𖿠', '𖿠'),
|
||||
('𗀀', '𘟷'),
|
||||
('𘠀', '\u{18aff}'),
|
||||
('\u{18d00}', '\u{18d08}'),
|
||||
];
|
||||
|
||||
pub const TELUGU: &'static [(char, char)] = &[
|
||||
('\u{c00}', 'ఌ'),
|
||||
|
@ -1214,13 +1177,12 @@ pub const TELUGU: &'static [(char, char)] = &[
|
|||
('ౘ', 'ౚ'),
|
||||
('ౠ', '\u{c63}'),
|
||||
('౦', '౯'),
|
||||
('\u{c77}', '౿'),
|
||||
('౷', '౿'),
|
||||
];
|
||||
|
||||
pub const THAANA: &'static [(char, char)] = &[('ހ', 'ޱ')];
|
||||
|
||||
pub const THAI: &'static [(char, char)] =
|
||||
&[('ก', '\u{e3a}'), ('เ', '๛')];
|
||||
pub const THAI: &'static [(char, char)] = &[('ก', '\u{e3a}'), ('เ', '๛')];
|
||||
|
||||
pub const TIBETAN: &'static [(char, char)] = &[
|
||||
('ༀ', 'ཇ'),
|
||||
|
@ -1235,19 +1197,21 @@ pub const TIBETAN: &'static [(char, char)] = &[
|
|||
pub const TIFINAGH: &'static [(char, char)] =
|
||||
&[('ⴰ', 'ⵧ'), ('ⵯ', '⵰'), ('\u{2d7f}', '\u{2d7f}')];
|
||||
|
||||
pub const TIRHUTA: &'static [(char, char)] =
|
||||
&[('𑒀', '𑓇'), ('𑓐', '𑓙')];
|
||||
pub const TIRHUTA: &'static [(char, char)] = &[('𑒀', '𑓇'), ('𑓐', '𑓙')];
|
||||
|
||||
pub const UGARITIC: &'static [(char, char)] =
|
||||
&[('𐎀', '𐎝'), ('𐎟', '𐎟')];
|
||||
pub const UGARITIC: &'static [(char, char)] = &[('𐎀', '𐎝'), ('𐎟', '𐎟')];
|
||||
|
||||
pub const VAI: &'static [(char, char)] = &[('ꔀ', 'ꘫ')];
|
||||
|
||||
pub const WANCHO: &'static [(char, char)] =
|
||||
&[('\u{1e2c0}', '\u{1e2f9}'), ('\u{1e2ff}', '\u{1e2ff}')];
|
||||
pub const WANCHO: &'static [(char, char)] = &[('𞋀', '𞋹'), ('𞋿', '𞋿')];
|
||||
|
||||
pub const WARANG_CITI: &'static [(char, char)] =
|
||||
&[('𑢠', '𑣲'), ('𑣿', '𑣿')];
|
||||
pub const WARANG_CITI: &'static [(char, char)] = &[('𑢠', '𑣲'), ('𑣿', '𑣿')];
|
||||
|
||||
pub const YEZIDI: &'static [(char, char)] = &[
|
||||
('\u{10e80}', '\u{10ea9}'),
|
||||
('\u{10eab}', '\u{10ead}'),
|
||||
('\u{10eb0}', '\u{10eb1}'),
|
||||
];
|
||||
|
||||
pub const YI: &'static [(char, char)] = &[('ꀀ', 'ꒌ'), ('꒐', '꓆')];
|
||||
|
||||
|
|
|
@ -1,8 +1,10 @@
|
|||
// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
|
||||
//
|
||||
// ucd-generate script-extension /tmp/ucd/12.1.0/ --chars
|
||||
// ucd-generate script-extension ucd-13.0.0 --chars
|
||||
//
|
||||
// ucd-generate is available on crates.io.
|
||||
// Unicode version: 13.0.0.
|
||||
//
|
||||
// ucd-generate 0.2.8 is available on crates.io.
|
||||
|
||||
pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[
|
||||
("Adlam", ADLAM),
|
||||
|
@ -28,6 +30,7 @@ pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[
|
|||
("Chakma", CHAKMA),
|
||||
("Cham", CHAM),
|
||||
("Cherokee", CHEROKEE),
|
||||
("Chorasmian", CHORASMIAN),
|
||||
("Common", COMMON),
|
||||
("Coptic", COPTIC),
|
||||
("Cuneiform", CUNEIFORM),
|
||||
|
@ -35,6 +38,7 @@ pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[
|
|||
("Cyrillic", CYRILLIC),
|
||||
("Deseret", DESERET),
|
||||
("Devanagari", DEVANAGARI),
|
||||
("Dives_Akuru", DIVES_AKURU),
|
||||
("Dogra", DOGRA),
|
||||
("Duployan", DUPLOYAN),
|
||||
("Egyptian_Hieroglyphs", EGYPTIAN_HIEROGLYPHS),
|
||||
|
@ -66,6 +70,7 @@ pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[
|
|||
("Katakana", KATAKANA),
|
||||
("Kayah_Li", KAYAH_LI),
|
||||
("Kharoshthi", KHAROSHTHI),
|
||||
("Khitan_Small_Script", KHITAN_SMALL_SCRIPT),
|
||||
("Khmer", KHMER),
|
||||
("Khojki", KHOJKI),
|
||||
("Khudawadi", KHUDAWADI),
|
||||
|
@ -155,12 +160,13 @@ pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[
|
|||
("Vai", VAI),
|
||||
("Wancho", WANCHO),
|
||||
("Warang_Citi", WARANG_CITI),
|
||||
("Yezidi", YEZIDI),
|
||||
("Yi", YI),
|
||||
("Zanabazar_Square", ZANABAZAR_SQUARE),
|
||||
];
|
||||
|
||||
pub const ADLAM: &'static [(char, char)] =
|
||||
&[('ـ', 'ـ'), ('𞤀', '\u{1e94b}'), ('𞥐', '𞥙'), ('𞥞', '𞥟')];
|
||||
&[('ـ', 'ـ'), ('𞤀', '𞥋'), ('𞥐', '𞥙'), ('𞥞', '𞥟')];
|
||||
|
||||
pub const AHOM: &'static [(char, char)] =
|
||||
&[('𑜀', '𑜚'), ('\u{1171d}', '\u{1172b}'), ('𑜰', '𑜿')];
|
||||
|
@ -174,7 +180,7 @@ pub const ARABIC: &'static [(char, char)] = &[
|
|||
('۞', 'ۿ'),
|
||||
('ݐ', 'ݿ'),
|
||||
('ࢠ', 'ࢴ'),
|
||||
('ࢶ', 'ࢽ'),
|
||||
('ࢶ', '\u{8c7}'),
|
||||
('\u{8d3}', '\u{8e1}'),
|
||||
('\u{8e3}', '\u{8ff}'),
|
||||
('ﭐ', '﯁'),
|
||||
|
@ -225,11 +231,9 @@ pub const ARABIC: &'static [(char, char)] = &[
|
|||
pub const ARMENIAN: &'static [(char, char)] =
|
||||
&[('Ա', 'Ֆ'), ('ՙ', '֊'), ('֍', '֏'), ('ﬓ', 'ﬗ')];
|
||||
|
||||
pub const AVESTAN: &'static [(char, char)] =
|
||||
&[('𐬀', '𐬵'), ('𐬹', '𐬿')];
|
||||
pub const AVESTAN: &'static [(char, char)] = &[('𐬀', '𐬵'), ('𐬹', '𐬿')];
|
||||
|
||||
pub const BALINESE: &'static [(char, char)] =
|
||||
&[('\u{1b00}', 'ᭋ'), ('᭐', '᭼')];
|
||||
pub const BALINESE: &'static [(char, char)] = &[('\u{1b00}', 'ᭋ'), ('᭐', '᭼')];
|
||||
|
||||
pub const BAMUM: &'static [(char, char)] = &[('ꚠ', '꛷'), ('𖠀', '𖨸')];
|
||||
|
||||
|
@ -267,12 +271,8 @@ pub const BENGALI: &'static [(char, char)] = &[
|
|||
('\u{a8f1}', '\u{a8f1}'),
|
||||
];
|
||||
|
||||
pub const BHAIKSUKI: &'static [(char, char)] = &[
|
||||
('𑰀', '𑰈'),
|
||||
('𑰊', '\u{11c36}'),
|
||||
('\u{11c38}', '𑱅'),
|
||||
('𑱐', '𑱬'),
|
||||
];
|
||||
pub const BHAIKSUKI: &'static [(char, char)] =
|
||||
&[('𑰀', '𑰈'), ('𑰊', '\u{11c36}'), ('\u{11c38}', '𑱅'), ('𑱐', '𑱬')];
|
||||
|
||||
pub const BOPOMOFO: &'static [(char, char)] = &[
|
||||
('˪', '˫'),
|
||||
|
@ -284,7 +284,7 @@ pub const BOPOMOFO: &'static [(char, char)] = &[
|
|||
('〷', '〷'),
|
||||
('・', '・'),
|
||||
('ㄅ', 'ㄯ'),
|
||||
('ㆠ', 'ㆺ'),
|
||||
('ㆠ', '\u{31bf}'),
|
||||
('﹅', '﹆'),
|
||||
('。', '・'),
|
||||
];
|
||||
|
@ -297,8 +297,7 @@ pub const BRAILLE: &'static [(char, char)] = &[('⠀', '⣿')];
|
|||
pub const BUGINESE: &'static [(char, char)] =
|
||||
&[('ᨀ', '\u{1a1b}'), ('᨞', '᨟'), ('ꧏ', 'ꧏ')];
|
||||
|
||||
pub const BUHID: &'static [(char, char)] =
|
||||
&[('᜵', '᜶'), ('ᝀ', '\u{1753}')];
|
||||
pub const BUHID: &'static [(char, char)] = &[('᜵', '᜶'), ('ᝀ', '\u{1753}')];
|
||||
|
||||
pub const CANADIAN_ABORIGINAL: &'static [(char, char)] =
|
||||
&[('᐀', 'ᙿ'), ('ᢰ', 'ᣵ')];
|
||||
|
@ -308,12 +307,8 @@ pub const CARIAN: &'static [(char, char)] = &[('𐊠', '𐋐')];
|
|||
pub const CAUCASIAN_ALBANIAN: &'static [(char, char)] =
|
||||
&[('𐔰', '𐕣'), ('𐕯', '𐕯')];
|
||||
|
||||
pub const CHAKMA: &'static [(char, char)] = &[
|
||||
('০', '৯'),
|
||||
('၀', '၉'),
|
||||
('\u{11100}', '\u{11134}'),
|
||||
('𑄶', '𑅆'),
|
||||
];
|
||||
pub const CHAKMA: &'static [(char, char)] =
|
||||
&[('০', '৯'), ('၀', '၉'), ('\u{11100}', '\u{11134}'), ('𑄶', '\u{11147}')];
|
||||
|
||||
pub const CHAM: &'static [(char, char)] =
|
||||
&[('ꨀ', '\u{aa36}'), ('ꩀ', 'ꩍ'), ('꩐', '꩙'), ('꩜', '꩟')];
|
||||
|
@ -321,6 +316,8 @@ pub const CHAM: &'static [(char, char)] =
|
|||
pub const CHEROKEE: &'static [(char, char)] =
|
||||
&[('Ꭰ', 'Ᏽ'), ('ᏸ', 'ᏽ'), ('ꭰ', 'ꮿ')];
|
||||
|
||||
pub const CHORASMIAN: &'static [(char, char)] = &[('\u{10fb0}', '\u{10fcb}')];
|
||||
|
||||
pub const COMMON: &'static [(char, char)] = &[
|
||||
('\u{0}', '@'),
|
||||
('[', '`'),
|
||||
|
@ -360,9 +357,9 @@ pub const COMMON: &'static [(char, char)] = &[
|
|||
('①', '⟿'),
|
||||
('⤀', '⭳'),
|
||||
('⭶', '⮕'),
|
||||
('⮘', '\u{2bff}'),
|
||||
('\u{2b97}', '⯿'),
|
||||
('⸀', '⹂'),
|
||||
('⹄', '\u{2e4f}'),
|
||||
('⹄', '\u{2e52}'),
|
||||
('⿰', '⿻'),
|
||||
('\u{3000}', '\u{3000}'),
|
||||
('〄', '〄'),
|
||||
|
@ -377,9 +374,10 @@ pub const COMMON: &'static [(char, char)] = &[
|
|||
('㎀', '㏟'),
|
||||
('㏿', '㏿'),
|
||||
('䷀', '䷿'),
|
||||
('꜀', '꜡'),
|
||||
('꜈', '꜡'),
|
||||
('ꞈ', '꞊'),
|
||||
('꭛', '꭛'),
|
||||
('\u{ab6a}', '\u{ab6b}'),
|
||||
('﴾', '﴿'),
|
||||
('︐', '︙'),
|
||||
('︰', '﹄'),
|
||||
|
@ -393,9 +391,9 @@ pub const COMMON: &'static [(char, char)] = &[
|
|||
('¢', '₩'),
|
||||
('│', '○'),
|
||||
('\u{fff9}', '<27>'),
|
||||
('𐆐', '𐆛'),
|
||||
('𐆐', '\u{1019c}'),
|
||||
('𐇐', '𐇼'),
|
||||
('\u{16fe2}', '\u{16fe3}'),
|
||||
('𖿢', '𖿣'),
|
||||
('𝀀', '𝃵'),
|
||||
('𝄀', '𝄦'),
|
||||
('𝄩', '𝅦'),
|
||||
|
@ -428,44 +426,45 @@ pub const COMMON: &'static [(char, char)] = &[
|
|||
('𝚨', '𝟋'),
|
||||
('𝟎', '𝟿'),
|
||||
('𞱱', '𞲴'),
|
||||
('\u{1ed01}', '\u{1ed3d}'),
|
||||
('𞴁', '𞴽'),
|
||||
('🀀', '🀫'),
|
||||
('🀰', '🂓'),
|
||||
('🂠', '🂮'),
|
||||
('🂱', '🂿'),
|
||||
('🃁', '🃏'),
|
||||
('🃑', '🃵'),
|
||||
('🄀', '🄌'),
|
||||
('🄐', '\u{1f16c}'),
|
||||
('🅰', '🆬'),
|
||||
('🄀', '\u{1f1ad}'),
|
||||
('🇦', '🇿'),
|
||||
('🈁', '🈂'),
|
||||
('🈐', '🈻'),
|
||||
('🉀', '🉈'),
|
||||
('🉠', '🉥'),
|
||||
('🌀', '\u{1f6d5}'),
|
||||
('🌀', '\u{1f6d7}'),
|
||||
('🛠', '🛬'),
|
||||
('🛰', '\u{1f6fa}'),
|
||||
('🛰', '\u{1f6fc}'),
|
||||
('🜀', '🝳'),
|
||||
('🞀', '🟘'),
|
||||
('\u{1f7e0}', '\u{1f7eb}'),
|
||||
('🟠', '🟫'),
|
||||
('🠀', '🠋'),
|
||||
('🠐', '🡇'),
|
||||
('🡐', '🡙'),
|
||||
('🡠', '🢇'),
|
||||
('🢐', '🢭'),
|
||||
('🤀', '🤋'),
|
||||
('\u{1f90d}', '\u{1f971}'),
|
||||
('🥳', '🥶'),
|
||||
('🥺', '🦢'),
|
||||
('\u{1f9a5}', '\u{1f9aa}'),
|
||||
('\u{1f9ae}', '\u{1f9ca}'),
|
||||
('\u{1f9cd}', '\u{1fa53}'),
|
||||
('\u{1f8b0}', '\u{1f8b1}'),
|
||||
('🤀', '\u{1f978}'),
|
||||
('🥺', '\u{1f9cb}'),
|
||||
('🧍', '🩓'),
|
||||
('🩠', '🩭'),
|
||||
('\u{1fa70}', '\u{1fa73}'),
|
||||
('\u{1fa78}', '\u{1fa7a}'),
|
||||
('\u{1fa80}', '\u{1fa82}'),
|
||||
('\u{1fa90}', '\u{1fa95}'),
|
||||
('🩰', '\u{1fa74}'),
|
||||
('🩸', '🩺'),
|
||||
('🪀', '\u{1fa86}'),
|
||||
('🪐', '\u{1faa8}'),
|
||||
('\u{1fab0}', '\u{1fab6}'),
|
||||
('\u{1fac0}', '\u{1fac2}'),
|
||||
('\u{1fad0}', '\u{1fad6}'),
|
||||
('\u{1fb00}', '\u{1fb92}'),
|
||||
('\u{1fb94}', '\u{1fbca}'),
|
||||
('\u{1fbf0}', '\u{1fbf9}'),
|
||||
('\u{e0001}', '\u{e0001}'),
|
||||
('\u{e0020}', '\u{e007f}'),
|
||||
];
|
||||
|
@ -493,6 +492,7 @@ pub const CYRILLIC: &'static [(char, char)] = &[
|
|||
('ᲀ', 'ᲈ'),
|
||||
('ᴫ', 'ᴫ'),
|
||||
('ᵸ', 'ᵸ'),
|
||||
('\u{1df8}', '\u{1df8}'),
|
||||
('\u{2de0}', '\u{2dff}'),
|
||||
('⹃', '⹃'),
|
||||
('Ꙁ', '\u{a69f}'),
|
||||
|
@ -511,23 +511,29 @@ pub const DEVANAGARI: &'static [(char, char)] = &[
|
|||
('\u{a8e0}', '\u{a8ff}'),
|
||||
];
|
||||
|
||||
pub const DIVES_AKURU: &'static [(char, char)] = &[
|
||||
('\u{11900}', '\u{11906}'),
|
||||
('\u{11909}', '\u{11909}'),
|
||||
('\u{1190c}', '\u{11913}'),
|
||||
('\u{11915}', '\u{11916}'),
|
||||
('\u{11918}', '\u{11935}'),
|
||||
('\u{11937}', '\u{11938}'),
|
||||
('\u{1193b}', '\u{11946}'),
|
||||
('\u{11950}', '\u{11959}'),
|
||||
];
|
||||
|
||||
pub const DOGRA: &'static [(char, char)] =
|
||||
&[('।', '९'), ('꠰', '꠹'), ('𑠀', '𑠻')];
|
||||
|
||||
pub const DUPLOYAN: &'static [(char, char)] = &[
|
||||
('𛰀', '𛱪'),
|
||||
('𛱰', '𛱼'),
|
||||
('𛲀', '𛲈'),
|
||||
('𛲐', '𛲙'),
|
||||
('𛲜', '\u{1bca3}'),
|
||||
];
|
||||
pub const DUPLOYAN: &'static [(char, char)] =
|
||||
&[('𛰀', '𛱪'), ('𛱰', '𛱼'), ('𛲀', '𛲈'), ('𛲐', '𛲙'), ('𛲜', '\u{1bca3}')];
|
||||
|
||||
pub const EGYPTIAN_HIEROGLYPHS: &'static [(char, char)] =
|
||||
&[('𓀀', '𓐮'), ('\u{13430}', '\u{13438}')];
|
||||
|
||||
pub const ELBASAN: &'static [(char, char)] = &[('𐔀', '𐔧')];
|
||||
|
||||
pub const ELYMAIC: &'static [(char, char)] = &[('\u{10fe0}', '\u{10ff6}')];
|
||||
pub const ELYMAIC: &'static [(char, char)] = &[('𐿠', '𐿶')];
|
||||
|
||||
pub const ETHIOPIC: &'static [(char, char)] = &[
|
||||
('ሀ', 'ቈ'),
|
||||
|
@ -565,7 +571,6 @@ pub const ETHIOPIC: &'static [(char, char)] = &[
|
|||
];
|
||||
|
||||
pub const GEORGIAN: &'static [(char, char)] = &[
|
||||
('։', '։'),
|
||||
('Ⴀ', 'Ⴥ'),
|
||||
('Ⴧ', 'Ⴧ'),
|
||||
('Ⴭ', 'Ⴭ'),
|
||||
|
@ -617,8 +622,8 @@ pub const GRANTHA: &'static [(char, char)] = &[
|
|||
('𑍝', '𑍣'),
|
||||
('\u{11366}', '\u{1136c}'),
|
||||
('\u{11370}', '\u{11374}'),
|
||||
('\u{11fd0}', '\u{11fd1}'),
|
||||
('\u{11fd3}', '\u{11fd3}'),
|
||||
('𑿐', '𑿑'),
|
||||
('𑿓', '𑿓'),
|
||||
];
|
||||
|
||||
pub const GREEK: &'static [(char, char)] = &[
|
||||
|
@ -730,24 +735,27 @@ pub const HAN: &'static [(char, char)] = &[
|
|||
('㈠', '㉇'),
|
||||
('㊀', '㊰'),
|
||||
('㋀', '㋋'),
|
||||
('\u{32ff}', '\u{32ff}'),
|
||||
('㋿', '㋿'),
|
||||
('㍘', '㍰'),
|
||||
('㍻', '㍿'),
|
||||
('㏠', '㏾'),
|
||||
('㐀', '䶵'),
|
||||
('一', '鿯'),
|
||||
('㐀', '\u{4dbf}'),
|
||||
('一', '\u{9ffc}'),
|
||||
('꜀', '꜇'),
|
||||
('豈', '舘'),
|
||||
('並', '龎'),
|
||||
('﹅', '﹆'),
|
||||
('。', '・'),
|
||||
('\u{16ff0}', '\u{16ff1}'),
|
||||
('𝍠', '𝍱'),
|
||||
('🉐', '🉑'),
|
||||
('𠀀', '𪛖'),
|
||||
('𠀀', '\u{2a6dd}'),
|
||||
('𪜀', '𫜴'),
|
||||
('𫝀', '𫠝'),
|
||||
('𫠠', '𬺡'),
|
||||
('𬺰', '𮯠'),
|
||||
('丽', '𪘀'),
|
||||
('\u{30000}', '\u{3134a}'),
|
||||
];
|
||||
|
||||
pub const HANGUL: &'static [(char, char)] = &[
|
||||
|
@ -816,7 +824,7 @@ pub const HIRAGANA: &'static [(char, char)] = &[
|
|||
('ー', 'ー'),
|
||||
('\u{ff9e}', '\u{ff9f}'),
|
||||
('𛀁', '𛄞'),
|
||||
('\u{1b150}', '\u{1b152}'),
|
||||
('𛅐', '𛅒'),
|
||||
('🈀', '🈀'),
|
||||
];
|
||||
|
||||
|
@ -828,8 +836,9 @@ pub const INHERITED: &'static [(char, char)] = &[
|
|||
('\u{343}', '\u{344}'),
|
||||
('\u{346}', '\u{362}'),
|
||||
('\u{953}', '\u{954}'),
|
||||
('\u{1ab0}', '\u{1abe}'),
|
||||
('\u{1dc2}', '\u{1df9}'),
|
||||
('\u{1ab0}', '\u{1ac0}'),
|
||||
('\u{1dc2}', '\u{1df7}'),
|
||||
('\u{1df9}', '\u{1df9}'),
|
||||
('\u{1dfb}', '\u{1dff}'),
|
||||
('\u{200c}', '\u{200d}'),
|
||||
('\u{20d0}', '\u{20ef}'),
|
||||
|
@ -852,12 +861,8 @@ pub const INSCRIPTIONAL_PARTHIAN: &'static [(char, char)] =
|
|||
pub const JAVANESE: &'static [(char, char)] =
|
||||
&[('\u{a980}', '꧍'), ('ꧏ', '꧙'), ('꧞', '꧟')];
|
||||
|
||||
pub const KAITHI: &'static [(char, char)] = &[
|
||||
('०', '९'),
|
||||
('꠰', '꠹'),
|
||||
('\u{11080}', '𑃁'),
|
||||
('\u{110cd}', '\u{110cd}'),
|
||||
];
|
||||
pub const KAITHI: &'static [(char, char)] =
|
||||
&[('०', '९'), ('꠰', '꠹'), ('\u{11080}', '𑃁'), ('\u{110cd}', '\u{110cd}')];
|
||||
|
||||
pub const KANNADA: &'static [(char, char)] = &[
|
||||
('\u{951}', '\u{952}'),
|
||||
|
@ -898,7 +903,7 @@ pub const KATAKANA: &'static [(char, char)] = &[
|
|||
('﹅', '﹆'),
|
||||
('。', '\u{ff9f}'),
|
||||
('𛀀', '𛀀'),
|
||||
('\u{1b164}', '\u{1b167}'),
|
||||
('𛅤', '𛅧'),
|
||||
];
|
||||
|
||||
pub const KAYAH_LI: &'static [(char, char)] = &[('꤀', '꤯')];
|
||||
|
@ -914,6 +919,9 @@ pub const KHAROSHTHI: &'static [(char, char)] = &[
|
|||
('𐩐', '𐩘'),
|
||||
];
|
||||
|
||||
pub const KHITAN_SMALL_SCRIPT: &'static [(char, char)] =
|
||||
&[('\u{16fe4}', '\u{16fe4}'), ('\u{18b00}', '\u{18cd5}')];
|
||||
|
||||
pub const KHMER: &'static [(char, char)] =
|
||||
&[('ក', '\u{17dd}'), ('០', '៩'), ('៰', '៹'), ('᧠', '᧿')];
|
||||
|
||||
|
@ -926,8 +934,8 @@ pub const KHUDAWADI: &'static [(char, char)] =
|
|||
pub const LAO: &'static [(char, char)] = &[
|
||||
('ກ', 'ຂ'),
|
||||
('ຄ', 'ຄ'),
|
||||
('\u{e86}', 'ຊ'),
|
||||
('\u{e8c}', 'ຣ'),
|
||||
('ຆ', 'ຊ'),
|
||||
('ຌ', 'ຣ'),
|
||||
('ລ', 'ລ'),
|
||||
('ວ', 'ຽ'),
|
||||
('ເ', 'ໄ'),
|
||||
|
@ -966,14 +974,15 @@ pub const LATIN: &'static [(char, char)] = &[
|
|||
('ⅎ', 'ⅎ'),
|
||||
('Ⅰ', 'ↈ'),
|
||||
('Ⱡ', 'Ɀ'),
|
||||
('꜀', '꜇'),
|
||||
('Ꜣ', 'ꞇ'),
|
||||
('Ꞌ', '\u{a7bf}'),
|
||||
('\u{a7c2}', '\u{a7c6}'),
|
||||
('ꟷ', 'ꟿ'),
|
||||
('Ꞌ', 'ꞿ'),
|
||||
('Ꟃ', '\u{a7ca}'),
|
||||
('\u{a7f5}', 'ꟿ'),
|
||||
('꤮', '꤮'),
|
||||
('ꬰ', 'ꭚ'),
|
||||
('ꭜ', 'ꭤ'),
|
||||
('\u{ab66}', '\u{ab67}'),
|
||||
('ꭦ', '\u{ab69}'),
|
||||
('ff', 'st'),
|
||||
('A', 'Z'),
|
||||
('a', 'z'),
|
||||
|
@ -1007,12 +1016,12 @@ pub const LINEAR_B: &'static [(char, char)] = &[
|
|||
('𐄷', '𐄿'),
|
||||
];
|
||||
|
||||
pub const LISU: &'static [(char, char)] = &[('ꓐ', '꓿')];
|
||||
pub const LISU: &'static [(char, char)] =
|
||||
&[('ꓐ', '꓿'), ('\u{11fb0}', '\u{11fb0}')];
|
||||
|
||||
pub const LYCIAN: &'static [(char, char)] = &[('𐊀', '𐊜')];
|
||||
|
||||
pub const LYDIAN: &'static [(char, char)] =
|
||||
&[('𐤠', '𐤹'), ('𐤿', '𐤿')];
|
||||
pub const LYDIAN: &'static [(char, char)] = &[('𐤠', '𐤹'), ('𐤿', '𐤿')];
|
||||
|
||||
pub const MAHAJANI: &'static [(char, char)] =
|
||||
&[('।', '९'), ('꠰', '꠹'), ('𑅐', '𑅶')];
|
||||
|
@ -1022,8 +1031,7 @@ pub const MAKASAR: &'static [(char, char)] = &[('𑻠', '𑻸')];
|
|||
pub const MALAYALAM: &'static [(char, char)] = &[
|
||||
('\u{951}', '\u{952}'),
|
||||
('।', '॥'),
|
||||
('\u{d00}', 'ഃ'),
|
||||
('അ', 'ഌ'),
|
||||
('\u{d00}', 'ഌ'),
|
||||
('എ', 'ഐ'),
|
||||
('ഒ', '\u{d44}'),
|
||||
('െ', 'ൈ'),
|
||||
|
@ -1067,11 +1075,8 @@ pub const MEROITIC_CURSIVE: &'static [(char, char)] =
|
|||
|
||||
pub const MEROITIC_HIEROGLYPHS: &'static [(char, char)] = &[('𐦀', '𐦟')];
|
||||
|
||||
pub const MIAO: &'static [(char, char)] = &[
|
||||
('𖼀', '\u{16f4a}'),
|
||||
('\u{16f4f}', '\u{16f87}'),
|
||||
('\u{16f8f}', '𖾟'),
|
||||
];
|
||||
pub const MIAO: &'static [(char, char)] =
|
||||
&[('𖼀', '𖽊'), ('\u{16f4f}', '𖾇'), ('\u{16f8f}', '𖾟')];
|
||||
|
||||
pub const MODI: &'static [(char, char)] =
|
||||
&[('꠰', '꠹'), ('𑘀', '𑙄'), ('𑙐', '𑙙')];
|
||||
|
@ -1085,53 +1090,39 @@ pub const MONGOLIAN: &'static [(char, char)] = &[
|
|||
('𑙠', '𑙬'),
|
||||
];
|
||||
|
||||
pub const MRO: &'static [(char, char)] =
|
||||
&[('𖩀', '𖩞'), ('𖩠', '𖩩'), ('𖩮', '𖩯')];
|
||||
pub const MRO: &'static [(char, char)] = &[('𖩀', '𖩞'), ('𖩠', '𖩩'), ('𖩮', '𖩯')];
|
||||
|
||||
pub const MULTANI: &'static [(char, char)] = &[
|
||||
('੦', '੯'),
|
||||
('𑊀', '𑊆'),
|
||||
('𑊈', '𑊈'),
|
||||
('𑊊', '𑊍'),
|
||||
('𑊏', '𑊝'),
|
||||
('𑊟', '𑊩'),
|
||||
];
|
||||
pub const MULTANI: &'static [(char, char)] =
|
||||
&[('੦', '੯'), ('𑊀', '𑊆'), ('𑊈', '𑊈'), ('𑊊', '𑊍'), ('𑊏', '𑊝'), ('𑊟', '𑊩')];
|
||||
|
||||
pub const MYANMAR: &'static [(char, char)] =
|
||||
&[('က', '႟'), ('꤮', '꤮'), ('ꧠ', 'ꧾ'), ('ꩠ', 'ꩿ')];
|
||||
|
||||
pub const NABATAEAN: &'static [(char, char)] =
|
||||
&[('𐢀', '𐢞'), ('𐢧', '𐢯')];
|
||||
pub const NABATAEAN: &'static [(char, char)] = &[('𐢀', '𐢞'), ('𐢧', '𐢯')];
|
||||
|
||||
pub const NANDINAGARI: &'static [(char, char)] = &[
|
||||
('।', '॥'),
|
||||
('೦', '೯'),
|
||||
('ᳩ', 'ᳩ'),
|
||||
('ᳲ', 'ᳲ'),
|
||||
('\u{1cfa}', '\u{1cfa}'),
|
||||
('ᳺ', 'ᳺ'),
|
||||
('꠰', '꠵'),
|
||||
('\u{119a0}', '\u{119a7}'),
|
||||
('\u{119aa}', '\u{119d7}'),
|
||||
('\u{119da}', '\u{119e4}'),
|
||||
('𑦠', '𑦧'),
|
||||
('𑦪', '\u{119d7}'),
|
||||
('\u{119da}', '𑧤'),
|
||||
];
|
||||
|
||||
pub const NEW_TAI_LUE: &'static [(char, char)] =
|
||||
&[('ᦀ', 'ᦫ'), ('ᦰ', 'ᧉ'), ('᧐', '᧚'), ('᧞', '᧟')];
|
||||
|
||||
pub const NEWA: &'static [(char, char)] =
|
||||
&[('𑐀', '𑑙'), ('𑑛', '𑑛'), ('𑑝', '\u{1145f}')];
|
||||
pub const NEWA: &'static [(char, char)] = &[('𑐀', '𑑛'), ('𑑝', '\u{11461}')];
|
||||
|
||||
pub const NKO: &'static [(char, char)] = &[('߀', 'ߺ'), ('\u{7fd}', '߿')];
|
||||
|
||||
pub const NUSHU: &'static [(char, char)] =
|
||||
&[('𖿡', '𖿡'), ('𛅰', '𛋻')];
|
||||
pub const NUSHU: &'static [(char, char)] = &[('𖿡', '𖿡'), ('𛅰', '𛋻')];
|
||||
|
||||
pub const NYIAKENG_PUACHUE_HMONG: &'static [(char, char)] = &[
|
||||
('\u{1e100}', '\u{1e12c}'),
|
||||
('\u{1e130}', '\u{1e13d}'),
|
||||
('\u{1e140}', '\u{1e149}'),
|
||||
('\u{1e14e}', '\u{1e14f}'),
|
||||
];
|
||||
pub const NYIAKENG_PUACHUE_HMONG: &'static [(char, char)] =
|
||||
&[('𞄀', '𞄬'), ('\u{1e130}', '𞄽'), ('𞅀', '𞅉'), ('𞅎', '𞅏')];
|
||||
|
||||
pub const OGHAM: &'static [(char, char)] = &[('\u{1680}', '᚜')];
|
||||
|
||||
|
@ -1140,16 +1131,14 @@ pub const OL_CHIKI: &'static [(char, char)] = &[('᱐', '᱿')];
|
|||
pub const OLD_HUNGARIAN: &'static [(char, char)] =
|
||||
&[('𐲀', '𐲲'), ('𐳀', '𐳲'), ('𐳺', '𐳿')];
|
||||
|
||||
pub const OLD_ITALIC: &'static [(char, char)] =
|
||||
&[('𐌀', '𐌣'), ('𐌭', '𐌯')];
|
||||
pub const OLD_ITALIC: &'static [(char, char)] = &[('𐌀', '𐌣'), ('𐌭', '𐌯')];
|
||||
|
||||
pub const OLD_NORTH_ARABIAN: &'static [(char, char)] = &[('𐪀', '𐪟')];
|
||||
|
||||
pub const OLD_PERMIC: &'static [(char, char)] =
|
||||
&[('\u{483}', '\u{483}'), ('𐍐', '\u{1037a}')];
|
||||
|
||||
pub const OLD_PERSIAN: &'static [(char, char)] =
|
||||
&[('𐎠', '𐏃'), ('𐏈', '𐏕')];
|
||||
pub const OLD_PERSIAN: &'static [(char, char)] = &[('𐎠', '𐏃'), ('𐏈', '𐏕')];
|
||||
|
||||
pub const OLD_SOGDIAN: &'static [(char, char)] = &[('𐼀', '𐼧')];
|
||||
|
||||
|
@ -1170,7 +1159,7 @@ pub const ORIYA: &'static [(char, char)] = &[
|
|||
('\u{b3c}', '\u{b44}'),
|
||||
('େ', 'ୈ'),
|
||||
('ୋ', '\u{b4d}'),
|
||||
('\u{b56}', '\u{b57}'),
|
||||
('\u{b55}', '\u{b57}'),
|
||||
('ଡ଼', 'ଢ଼'),
|
||||
('ୟ', '\u{b63}'),
|
||||
('୦', '୷'),
|
||||
|
@ -1178,19 +1167,12 @@ pub const ORIYA: &'static [(char, char)] = &[
|
|||
('ᳲ', 'ᳲ'),
|
||||
];
|
||||
|
||||
pub const OSAGE: &'static [(char, char)] =
|
||||
&[('𐒰', '𐓓'), ('𐓘', '𐓻')];
|
||||
pub const OSAGE: &'static [(char, char)] = &[('𐒰', '𐓓'), ('𐓘', '𐓻')];
|
||||
|
||||
pub const OSMANYA: &'static [(char, char)] =
|
||||
&[('𐒀', '𐒝'), ('𐒠', '𐒩')];
|
||||
pub const OSMANYA: &'static [(char, char)] = &[('𐒀', '𐒝'), ('𐒠', '𐒩')];
|
||||
|
||||
pub const PAHAWH_HMONG: &'static [(char, char)] = &[
|
||||
('𖬀', '𖭅'),
|
||||
('𖭐', '𖭙'),
|
||||
('𖭛', '𖭡'),
|
||||
('𖭣', '𖭷'),
|
||||
('𖭽', '𖮏'),
|
||||
];
|
||||
pub const PAHAWH_HMONG: &'static [(char, char)] =
|
||||
&[('𖬀', '𖭅'), ('𖭐', '𖭙'), ('𖭛', '𖭡'), ('𖭣', '𖭷'), ('𖭽', '𖮏')];
|
||||
|
||||
pub const PALMYRENE: &'static [(char, char)] = &[('𐡠', '𐡿')];
|
||||
|
||||
|
@ -1199,8 +1181,7 @@ pub const PAU_CIN_HAU: &'static [(char, char)] = &[('𑫀', '𑫸')];
|
|||
pub const PHAGS_PA: &'static [(char, char)] =
|
||||
&[('᠂', '᠃'), ('᠅', '᠅'), ('ꡀ', '꡷')];
|
||||
|
||||
pub const PHOENICIAN: &'static [(char, char)] =
|
||||
&[('𐤀', '𐤛'), ('𐤟', '𐤟')];
|
||||
pub const PHOENICIAN: &'static [(char, char)] = &[('𐤀', '𐤛'), ('𐤟', '𐤟')];
|
||||
|
||||
pub const PSALTER_PAHLAVI: &'static [(char, char)] =
|
||||
&[('ـ', 'ـ'), ('𐮀', '𐮑'), ('𐮙', '𐮜'), ('𐮩', '𐮯')];
|
||||
|
@ -1209,8 +1190,7 @@ pub const REJANG: &'static [(char, char)] = &[('ꤰ', '꥓'), ('꥟', '꥟')];
|
|||
|
||||
pub const RUNIC: &'static [(char, char)] = &[('ᚠ', 'ᛪ'), ('ᛮ', 'ᛸ')];
|
||||
|
||||
pub const SAMARITAN: &'static [(char, char)] =
|
||||
&[('ࠀ', '\u{82d}'), ('࠰', '࠾')];
|
||||
pub const SAMARITAN: &'static [(char, char)] = &[('ࠀ', '\u{82d}'), ('࠰', '࠾')];
|
||||
|
||||
pub const SAURASHTRA: &'static [(char, char)] =
|
||||
&[('ꢀ', '\u{a8c5}'), ('꣎', '꣙')];
|
||||
|
@ -1221,8 +1201,7 @@ pub const SHARADA: &'static [(char, char)] = &[
|
|||
('\u{1cd9}', '\u{1cd9}'),
|
||||
('\u{1cdc}', '\u{1cdd}'),
|
||||
('\u{1ce0}', '\u{1ce0}'),
|
||||
('\u{11180}', '𑇍'),
|
||||
('𑇐', '𑇟'),
|
||||
('\u{11180}', '𑇟'),
|
||||
];
|
||||
|
||||
pub const SHAVIAN: &'static [(char, char)] = &[('𐑐', '𐑿')];
|
||||
|
@ -1230,15 +1209,12 @@ pub const SHAVIAN: &'static [(char, char)] = &[('𐑐', '𐑿')];
|
|||
pub const SIDDHAM: &'static [(char, char)] =
|
||||
&[('𑖀', '\u{115b5}'), ('𑖸', '\u{115dd}')];
|
||||
|
||||
pub const SIGNWRITING: &'static [(char, char)] = &[
|
||||
('𝠀', '𝪋'),
|
||||
('\u{1da9b}', '\u{1da9f}'),
|
||||
('\u{1daa1}', '\u{1daaf}'),
|
||||
];
|
||||
pub const SIGNWRITING: &'static [(char, char)] =
|
||||
&[('𝠀', '𝪋'), ('\u{1da9b}', '\u{1da9f}'), ('\u{1daa1}', '\u{1daaf}')];
|
||||
|
||||
pub const SINHALA: &'static [(char, char)] = &[
|
||||
('।', '॥'),
|
||||
('ං', 'ඃ'),
|
||||
('\u{d81}', 'ඃ'),
|
||||
('අ', 'ඖ'),
|
||||
('ක', 'න'),
|
||||
('ඳ', 'ර'),
|
||||
|
@ -1255,8 +1231,7 @@ pub const SINHALA: &'static [(char, char)] = &[
|
|||
|
||||
pub const SOGDIAN: &'static [(char, char)] = &[('ـ', 'ـ'), ('𐼰', '𐽙')];
|
||||
|
||||
pub const SORA_SOMPENG: &'static [(char, char)] =
|
||||
&[('𑃐', '𑃨'), ('𑃰', '𑃹')];
|
||||
pub const SORA_SOMPENG: &'static [(char, char)] = &[('𑃐', '𑃨'), ('𑃰', '𑃹')];
|
||||
|
||||
pub const SOYOMBO: &'static [(char, char)] = &[('𑩐', '𑪢')];
|
||||
|
||||
|
@ -1264,7 +1239,7 @@ pub const SUNDANESE: &'static [(char, char)] =
|
|||
&[('\u{1b80}', 'ᮿ'), ('᳀', '᳇')];
|
||||
|
||||
pub const SYLOTI_NAGRI: &'static [(char, char)] =
|
||||
&[('।', '॥'), ('০', '৯'), ('ꠀ', '꠫')];
|
||||
&[('।', '॥'), ('০', '৯'), ('ꠀ', '\u{a82c}')];
|
||||
|
||||
pub const SYRIAC: &'static [(char, char)] = &[
|
||||
('،', '،'),
|
||||
|
@ -1277,17 +1252,14 @@ pub const SYRIAC: &'static [(char, char)] = &[
|
|||
('\u{70f}', '\u{74a}'),
|
||||
('ݍ', 'ݏ'),
|
||||
('ࡠ', 'ࡪ'),
|
||||
('\u{1df8}', '\u{1df8}'),
|
||||
];
|
||||
|
||||
pub const TAGALOG: &'static [(char, char)] =
|
||||
&[('ᜀ', 'ᜌ'), ('ᜎ', '\u{1714}'), ('᜵', '᜶')];
|
||||
|
||||
pub const TAGBANWA: &'static [(char, char)] = &[
|
||||
('᜵', '᜶'),
|
||||
('ᝠ', 'ᝬ'),
|
||||
('ᝮ', 'ᝰ'),
|
||||
('\u{1772}', '\u{1773}'),
|
||||
];
|
||||
pub const TAGBANWA: &'static [(char, char)] =
|
||||
&[('᜵', '᜶'), ('ᝠ', 'ᝬ'), ('ᝮ', 'ᝰ'), ('\u{1772}', '\u{1773}')];
|
||||
|
||||
pub const TAI_LE: &'static [(char, char)] =
|
||||
&[('၀', '၉'), ('ᥐ', 'ᥭ'), ('ᥰ', 'ᥴ')];
|
||||
|
@ -1300,11 +1272,10 @@ pub const TAI_THAM: &'static [(char, char)] = &[
|
|||
('᪠', '᪭'),
|
||||
];
|
||||
|
||||
pub const TAI_VIET: &'static [(char, char)] =
|
||||
&[('ꪀ', 'ꫂ'), ('ꫛ', '꫟')];
|
||||
pub const TAI_VIET: &'static [(char, char)] = &[('ꪀ', 'ꫂ'), ('ꫛ', '꫟')];
|
||||
|
||||
pub const TAKRI: &'static [(char, char)] =
|
||||
&[('।', '॥'), ('꠰', '꠹'), ('𑚀', '\u{116b8}'), ('𑛀', '𑛉')];
|
||||
&[('।', '॥'), ('꠰', '꠹'), ('𑚀', '𑚸'), ('𑛀', '𑛉')];
|
||||
|
||||
pub const TAMIL: &'static [(char, char)] = &[
|
||||
('\u{951}', '\u{952}'),
|
||||
|
@ -1330,12 +1301,16 @@ pub const TAMIL: &'static [(char, char)] = &[
|
|||
('\u{11301}', '\u{11301}'),
|
||||
('𑌃', '𑌃'),
|
||||
('\u{1133b}', '\u{1133c}'),
|
||||
('\u{11fc0}', '\u{11ff1}'),
|
||||
('\u{11fff}', '\u{11fff}'),
|
||||
('𑿀', '𑿱'),
|
||||
('𑿿', '𑿿'),
|
||||
];
|
||||
|
||||
pub const TANGUT: &'static [(char, char)] =
|
||||
&[('𖿠', '𖿠'), ('𗀀', '\u{187f7}'), ('𘠀', '𘫲')];
|
||||
pub const TANGUT: &'static [(char, char)] = &[
|
||||
('𖿠', '𖿠'),
|
||||
('𗀀', '𘟷'),
|
||||
('𘠀', '\u{18aff}'),
|
||||
('\u{18d00}', '\u{18d08}'),
|
||||
];
|
||||
|
||||
pub const TELUGU: &'static [(char, char)] = &[
|
||||
('\u{951}', '\u{952}'),
|
||||
|
@ -1351,7 +1326,7 @@ pub const TELUGU: &'static [(char, char)] = &[
|
|||
('ౘ', 'ౚ'),
|
||||
('ౠ', '\u{c63}'),
|
||||
('౦', '౯'),
|
||||
('\u{c77}', '౿'),
|
||||
('౷', '౿'),
|
||||
('\u{1cda}', '\u{1cda}'),
|
||||
('ᳲ', 'ᳲ'),
|
||||
];
|
||||
|
@ -1366,8 +1341,7 @@ pub const THAANA: &'static [(char, char)] = &[
|
|||
('﷽', '﷽'),
|
||||
];
|
||||
|
||||
pub const THAI: &'static [(char, char)] =
|
||||
&[('ก', '\u{e3a}'), ('เ', '๛')];
|
||||
pub const THAI: &'static [(char, char)] = &[('ก', '\u{e3a}'), ('เ', '๛')];
|
||||
|
||||
pub const TIBETAN: &'static [(char, char)] = &[
|
||||
('ༀ', 'ཇ'),
|
||||
|
@ -1391,16 +1365,23 @@ pub const TIRHUTA: &'static [(char, char)] = &[
|
|||
('𑓐', '𑓙'),
|
||||
];
|
||||
|
||||
pub const UGARITIC: &'static [(char, char)] =
|
||||
&[('𐎀', '𐎝'), ('𐎟', '𐎟')];
|
||||
pub const UGARITIC: &'static [(char, char)] = &[('𐎀', '𐎝'), ('𐎟', '𐎟')];
|
||||
|
||||
pub const VAI: &'static [(char, char)] = &[('ꔀ', 'ꘫ')];
|
||||
|
||||
pub const WANCHO: &'static [(char, char)] =
|
||||
&[('\u{1e2c0}', '\u{1e2f9}'), ('\u{1e2ff}', '\u{1e2ff}')];
|
||||
pub const WANCHO: &'static [(char, char)] = &[('𞋀', '𞋹'), ('𞋿', '𞋿')];
|
||||
|
||||
pub const WARANG_CITI: &'static [(char, char)] =
|
||||
&[('𑢠', '𑣲'), ('𑣿', '𑣿')];
|
||||
pub const WARANG_CITI: &'static [(char, char)] = &[('𑢠', '𑣲'), ('𑣿', '𑣿')];
|
||||
|
||||
pub const YEZIDI: &'static [(char, char)] = &[
|
||||
('،', '،'),
|
||||
('؛', '؛'),
|
||||
('؟', '؟'),
|
||||
('٠', '٩'),
|
||||
('\u{10e80}', '\u{10ea9}'),
|
||||
('\u{10eab}', '\u{10ead}'),
|
||||
('\u{10eb0}', '\u{10eb1}'),
|
||||
];
|
||||
|
||||
pub const YI: &'static [(char, char)] = &[
|
||||
('、', '。'),
|
||||
|
|
|
@ -1,8 +1,10 @@
|
|||
// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
|
||||
//
|
||||
// ucd-generate sentence-break /tmp/ucd/12.1.0/ --chars
|
||||
// ucd-generate sentence-break ucd-13.0.0 --chars
|
||||
//
|
||||
// ucd-generate is available on crates.io.
|
||||
// Unicode version: 13.0.0.
|
||||
//
|
||||
// ucd-generate 0.2.8 is available on crates.io.
|
||||
|
||||
pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[
|
||||
("ATerm", ATERM),
|
||||
|
@ -132,7 +134,7 @@ pub const EXTEND: &'static [(char, char)] = &[
|
|||
('\u{b3e}', '\u{b44}'),
|
||||
('େ', 'ୈ'),
|
||||
('ୋ', '\u{b4d}'),
|
||||
('\u{b56}', '\u{b57}'),
|
||||
('\u{b55}', '\u{b57}'),
|
||||
('\u{b62}', '\u{b63}'),
|
||||
('\u{b82}', '\u{b82}'),
|
||||
('\u{bbe}', 'ூ'),
|
||||
|
@ -159,7 +161,7 @@ pub const EXTEND: &'static [(char, char)] = &[
|
|||
('ൊ', '\u{d4d}'),
|
||||
('\u{d57}', '\u{d57}'),
|
||||
('\u{d62}', '\u{d63}'),
|
||||
('ං', 'ඃ'),
|
||||
('\u{d81}', 'ඃ'),
|
||||
('\u{dca}', '\u{dca}'),
|
||||
('\u{dcf}', '\u{dd4}'),
|
||||
('\u{dd6}', '\u{dd6}'),
|
||||
|
@ -206,7 +208,7 @@ pub const EXTEND: &'static [(char, char)] = &[
|
|||
('ᩕ', '\u{1a5e}'),
|
||||
('\u{1a60}', '\u{1a7c}'),
|
||||
('\u{1a7f}', '\u{1a7f}'),
|
||||
('\u{1ab0}', '\u{1abe}'),
|
||||
('\u{1ab0}', '\u{1ac0}'),
|
||||
('\u{1b00}', 'ᬄ'),
|
||||
('\u{1b34}', '᭄'),
|
||||
('\u{1b6b}', '\u{1b73}'),
|
||||
|
@ -236,6 +238,7 @@ pub const EXTEND: &'static [(char, char)] = &[
|
|||
('\u{a806}', '\u{a806}'),
|
||||
('\u{a80b}', '\u{a80b}'),
|
||||
('ꠣ', 'ꠧ'),
|
||||
('\u{a82c}', '\u{a82c}'),
|
||||
('ꢀ', 'ꢁ'),
|
||||
('ꢴ', '\u{a8c5}'),
|
||||
('\u{a8e0}', '\u{a8f1}'),
|
||||
|
@ -272,6 +275,7 @@ pub const EXTEND: &'static [(char, char)] = &[
|
|||
('\u{10a3f}', '\u{10a3f}'),
|
||||
('\u{10ae5}', '\u{10ae6}'),
|
||||
('\u{10d24}', '\u{10d27}'),
|
||||
('\u{10eab}', '\u{10eac}'),
|
||||
('\u{10f46}', '\u{10f50}'),
|
||||
('𑀀', '𑀂'),
|
||||
('\u{11038}', '\u{11046}'),
|
||||
|
@ -284,6 +288,7 @@ pub const EXTEND: &'static [(char, char)] = &[
|
|||
('\u{11180}', '𑆂'),
|
||||
('𑆳', '𑇀'),
|
||||
('\u{111c9}', '\u{111cc}'),
|
||||
('\u{111ce}', '\u{111cf}'),
|
||||
('𑈬', '\u{11237}'),
|
||||
('\u{1123e}', '\u{1123e}'),
|
||||
('\u{112df}', '\u{112ea}'),
|
||||
|
@ -306,9 +311,14 @@ pub const EXTEND: &'static [(char, char)] = &[
|
|||
('\u{116ab}', '\u{116b7}'),
|
||||
('\u{1171d}', '\u{1172b}'),
|
||||
('𑠬', '\u{1183a}'),
|
||||
('\u{119d1}', '\u{119d7}'),
|
||||
('\u{11930}', '\u{11935}'),
|
||||
('\u{11937}', '\u{11938}'),
|
||||
('\u{1193b}', '\u{1193e}'),
|
||||
('\u{11940}', '\u{11940}'),
|
||||
('\u{11942}', '\u{11943}'),
|
||||
('𑧑', '\u{119d7}'),
|
||||
('\u{119da}', '\u{119e0}'),
|
||||
('\u{119e4}', '\u{119e4}'),
|
||||
('𑧤', '𑧤'),
|
||||
('\u{11a01}', '\u{11a0a}'),
|
||||
('\u{11a33}', '𑨹'),
|
||||
('\u{11a3b}', '\u{11a3e}'),
|
||||
|
@ -331,8 +341,10 @@ pub const EXTEND: &'static [(char, char)] = &[
|
|||
('\u{16af0}', '\u{16af4}'),
|
||||
('\u{16b30}', '\u{16b36}'),
|
||||
('\u{16f4f}', '\u{16f4f}'),
|
||||
('𖽑', '\u{16f87}'),
|
||||
('𖽑', '𖾇'),
|
||||
('\u{16f8f}', '\u{16f92}'),
|
||||
('\u{16fe4}', '\u{16fe4}'),
|
||||
('\u{16ff0}', '\u{16ff1}'),
|
||||
('\u{1bc9d}', '\u{1bc9e}'),
|
||||
('\u{1d165}', '\u{1d169}'),
|
||||
('𝅭', '\u{1d172}'),
|
||||
|
@ -986,13 +998,16 @@ pub const LOWER: &'static [(char, char)] = &[
|
|||
('ꞵ', 'ꞵ'),
|
||||
('ꞷ', 'ꞷ'),
|
||||
('ꞹ', 'ꞹ'),
|
||||
('\u{a7bb}', '\u{a7bb}'),
|
||||
('\u{a7bd}', '\u{a7bd}'),
|
||||
('\u{a7bf}', '\u{a7bf}'),
|
||||
('\u{a7c3}', '\u{a7c3}'),
|
||||
('ꞻ', 'ꞻ'),
|
||||
('ꞽ', 'ꞽ'),
|
||||
('ꞿ', 'ꞿ'),
|
||||
('ꟃ', 'ꟃ'),
|
||||
('\u{a7c8}', '\u{a7c8}'),
|
||||
('\u{a7ca}', '\u{a7ca}'),
|
||||
('\u{a7f6}', '\u{a7f6}'),
|
||||
('ꟸ', 'ꟺ'),
|
||||
('ꬰ', 'ꭚ'),
|
||||
('ꭜ', '\u{ab67}'),
|
||||
('ꭜ', '\u{ab68}'),
|
||||
('ꭰ', 'ꮿ'),
|
||||
('ff', 'st'),
|
||||
('ﬓ', 'ﬗ'),
|
||||
|
@ -1085,15 +1100,17 @@ pub const NUMERIC: &'static [(char, char)] = &[
|
|||
('𑛀', '𑛉'),
|
||||
('𑜰', '𑜹'),
|
||||
('𑣠', '𑣩'),
|
||||
('\u{11950}', '\u{11959}'),
|
||||
('𑱐', '𑱙'),
|
||||
('𑵐', '𑵙'),
|
||||
('𑶠', '𑶩'),
|
||||
('𖩠', '𖩩'),
|
||||
('𖭐', '𖭙'),
|
||||
('𝟎', '𝟿'),
|
||||
('\u{1e140}', '\u{1e149}'),
|
||||
('\u{1e2f0}', '\u{1e2f9}'),
|
||||
('𞅀', '𞅉'),
|
||||
('𞋰', '𞋹'),
|
||||
('𞥐', '𞥙'),
|
||||
('\u{1fbf0}', '\u{1fbf9}'),
|
||||
];
|
||||
|
||||
pub const OLETTER: &'static [(char, char)] = &[
|
||||
|
@ -1130,7 +1147,7 @@ pub const OLETTER: &'static [(char, char)] = &[
|
|||
('ࡀ', 'ࡘ'),
|
||||
('ࡠ', 'ࡪ'),
|
||||
('ࢠ', 'ࢴ'),
|
||||
('ࢶ', 'ࢽ'),
|
||||
('ࢶ', '\u{8c7}'),
|
||||
('ऄ', 'ह'),
|
||||
('ऽ', 'ऽ'),
|
||||
('ॐ', 'ॐ'),
|
||||
|
@ -1206,7 +1223,7 @@ pub const OLETTER: &'static [(char, char)] = &[
|
|||
('ೞ', 'ೞ'),
|
||||
('ೠ', 'ೡ'),
|
||||
('ೱ', 'ೲ'),
|
||||
('അ', 'ഌ'),
|
||||
('\u{d04}', 'ഌ'),
|
||||
('എ', 'ഐ'),
|
||||
('ഒ', 'ഺ'),
|
||||
('ഽ', 'ഽ'),
|
||||
|
@ -1224,8 +1241,8 @@ pub const OLETTER: &'static [(char, char)] = &[
|
|||
('เ', 'ๆ'),
|
||||
('ກ', 'ຂ'),
|
||||
('ຄ', 'ຄ'),
|
||||
('\u{e86}', 'ຊ'),
|
||||
('\u{e8c}', 'ຣ'),
|
||||
('ຆ', 'ຊ'),
|
||||
('ຌ', 'ຣ'),
|
||||
('ລ', 'ລ'),
|
||||
('ວ', 'ະ'),
|
||||
('າ', 'ຳ'),
|
||||
|
@ -1304,7 +1321,7 @@ pub const OLETTER: &'static [(char, char)] = &[
|
|||
('ᳩ', 'ᳬ'),
|
||||
('ᳮ', 'ᳳ'),
|
||||
('ᳵ', 'ᳶ'),
|
||||
('\u{1cfa}', '\u{1cfa}'),
|
||||
('ᳺ', 'ᳺ'),
|
||||
('ℵ', 'ℸ'),
|
||||
('ↀ', 'ↂ'),
|
||||
('ↅ', 'ↈ'),
|
||||
|
@ -1330,10 +1347,10 @@ pub const OLETTER: &'static [(char, char)] = &[
|
|||
('ー', 'ヿ'),
|
||||
('ㄅ', 'ㄯ'),
|
||||
('ㄱ', 'ㆎ'),
|
||||
('ㆠ', 'ㆺ'),
|
||||
('ㆠ', '\u{31bf}'),
|
||||
('ㇰ', 'ㇿ'),
|
||||
('㐀', '䶵'),
|
||||
('一', '鿯'),
|
||||
('㐀', '\u{4dbf}'),
|
||||
('一', '\u{9ffc}'),
|
||||
('ꀀ', 'ꒌ'),
|
||||
('ꓐ', 'ꓽ'),
|
||||
('ꔀ', 'ꘌ'),
|
||||
|
@ -1382,6 +1399,7 @@ pub const OLETTER: &'static [(char, char)] = &[
|
|||
('ꬑ', 'ꬖ'),
|
||||
('ꬠ', 'ꬦ'),
|
||||
('ꬨ', 'ꬮ'),
|
||||
('\u{ab69}', '\u{ab69}'),
|
||||
('ꯀ', 'ꯢ'),
|
||||
('가', '힣'),
|
||||
('ힰ', 'ퟆ'),
|
||||
|
@ -1459,15 +1477,19 @@ pub const OLETTER: &'static [(char, char)] = &[
|
|||
('𐮀', '𐮑'),
|
||||
('𐰀', '𐱈'),
|
||||
('𐴀', '𐴣'),
|
||||
('\u{10e80}', '\u{10ea9}'),
|
||||
('\u{10eb0}', '\u{10eb1}'),
|
||||
('𐼀', '𐼜'),
|
||||
('𐼧', '𐼧'),
|
||||
('𐼰', '𐽅'),
|
||||
('\u{10fe0}', '\u{10ff6}'),
|
||||
('\u{10fb0}', '\u{10fc4}'),
|
||||
('𐿠', '𐿶'),
|
||||
('𑀃', '𑀷'),
|
||||
('𑂃', '𑂯'),
|
||||
('𑃐', '𑃨'),
|
||||
('𑄃', '𑄦'),
|
||||
('𑅄', '𑅄'),
|
||||
('\u{11147}', '\u{11147}'),
|
||||
('𑅐', '𑅲'),
|
||||
('𑅶', '𑅶'),
|
||||
('𑆃', '𑆲'),
|
||||
|
@ -1493,7 +1515,7 @@ pub const OLETTER: &'static [(char, char)] = &[
|
|||
('𑍝', '𑍡'),
|
||||
('𑐀', '𑐴'),
|
||||
('𑑇', '𑑊'),
|
||||
('\u{1145f}', '\u{1145f}'),
|
||||
('𑑟', '\u{11461}'),
|
||||
('𑒀', '𑒯'),
|
||||
('𑓄', '𑓅'),
|
||||
('𑓇', '𑓇'),
|
||||
|
@ -1502,14 +1524,20 @@ pub const OLETTER: &'static [(char, char)] = &[
|
|||
('𑘀', '𑘯'),
|
||||
('𑙄', '𑙄'),
|
||||
('𑚀', '𑚪'),
|
||||
('\u{116b8}', '\u{116b8}'),
|
||||
('𑚸', '𑚸'),
|
||||
('𑜀', '𑜚'),
|
||||
('𑠀', '𑠫'),
|
||||
('𑣿', '𑣿'),
|
||||
('\u{119a0}', '\u{119a7}'),
|
||||
('\u{119aa}', '\u{119d0}'),
|
||||
('\u{119e1}', '\u{119e1}'),
|
||||
('\u{119e3}', '\u{119e3}'),
|
||||
('𑣿', '\u{11906}'),
|
||||
('\u{11909}', '\u{11909}'),
|
||||
('\u{1190c}', '\u{11913}'),
|
||||
('\u{11915}', '\u{11916}'),
|
||||
('\u{11918}', '\u{1192f}'),
|
||||
('\u{1193f}', '\u{1193f}'),
|
||||
('\u{11941}', '\u{11941}'),
|
||||
('𑦠', '𑦧'),
|
||||
('𑦪', '𑧐'),
|
||||
('𑧡', '𑧡'),
|
||||
('𑧣', '𑧣'),
|
||||
('𑨀', '𑨀'),
|
||||
('𑨋', '𑨲'),
|
||||
('𑨺', '𑨺'),
|
||||
|
@ -1530,6 +1558,7 @@ pub const OLETTER: &'static [(char, char)] = &[
|
|||
('𑵪', '𑶉'),
|
||||
('𑶘', '𑶘'),
|
||||
('𑻠', '𑻲'),
|
||||
('\u{11fb0}', '\u{11fb0}'),
|
||||
('𒀀', '𒎙'),
|
||||
('𒐀', '𒑮'),
|
||||
('𒒀', '𒕃'),
|
||||
|
@ -1542,27 +1571,28 @@ pub const OLETTER: &'static [(char, char)] = &[
|
|||
('𖭀', '𖭃'),
|
||||
('𖭣', '𖭷'),
|
||||
('𖭽', '𖮏'),
|
||||
('𖼀', '\u{16f4a}'),
|
||||
('𖼀', '𖽊'),
|
||||
('𖽐', '𖽐'),
|
||||
('𖾓', '𖾟'),
|
||||
('𖿠', '𖿡'),
|
||||
('\u{16fe3}', '\u{16fe3}'),
|
||||
('𗀀', '\u{187f7}'),
|
||||
('𘠀', '𘫲'),
|
||||
('𖿣', '𖿣'),
|
||||
('𗀀', '𘟷'),
|
||||
('𘠀', '\u{18cd5}'),
|
||||
('\u{18d00}', '\u{18d08}'),
|
||||
('𛀀', '𛄞'),
|
||||
('\u{1b150}', '\u{1b152}'),
|
||||
('\u{1b164}', '\u{1b167}'),
|
||||
('𛅐', '𛅒'),
|
||||
('𛅤', '𛅧'),
|
||||
('𛅰', '𛋻'),
|
||||
('𛰀', '𛱪'),
|
||||
('𛱰', '𛱼'),
|
||||
('𛲀', '𛲈'),
|
||||
('𛲐', '𛲙'),
|
||||
('\u{1e100}', '\u{1e12c}'),
|
||||
('\u{1e137}', '\u{1e13d}'),
|
||||
('\u{1e14e}', '\u{1e14e}'),
|
||||
('\u{1e2c0}', '\u{1e2eb}'),
|
||||
('𞄀', '𞄬'),
|
||||
('𞄷', '𞄽'),
|
||||
('𞅎', '𞅎'),
|
||||
('𞋀', '𞋫'),
|
||||
('𞠀', '𞣄'),
|
||||
('\u{1e94b}', '\u{1e94b}'),
|
||||
('𞥋', '𞥋'),
|
||||
('𞸀', '𞸃'),
|
||||
('𞸅', '𞸟'),
|
||||
('𞸡', '𞸢'),
|
||||
|
@ -1596,12 +1626,13 @@ pub const OLETTER: &'static [(char, char)] = &[
|
|||
('𞺡', '𞺣'),
|
||||
('𞺥', '𞺩'),
|
||||
('𞺫', '𞺻'),
|
||||
('𠀀', '𪛖'),
|
||||
('𠀀', '\u{2a6dd}'),
|
||||
('𪜀', '𫜴'),
|
||||
('𫝀', '𫠝'),
|
||||
('𫠠', '𬺡'),
|
||||
('𬺰', '𮯠'),
|
||||
('丽', '𪘀'),
|
||||
('\u{30000}', '\u{3134a}'),
|
||||
];
|
||||
|
||||
pub const SCONTINUE: &'static [(char, char)] = &[
|
||||
|
@ -1687,6 +1718,8 @@ pub const STERM: &'static [(char, char)] = &[
|
|||
('𑗉', '𑗗'),
|
||||
('𑙁', '𑙂'),
|
||||
('𑜼', '𑜾'),
|
||||
('\u{11944}', '\u{11944}'),
|
||||
('\u{11946}', '\u{11946}'),
|
||||
('𑩂', '𑩃'),
|
||||
('𑪛', '𑪜'),
|
||||
('𑱁', '𑱂'),
|
||||
|
@ -2312,11 +2345,13 @@ pub const UPPER: &'static [(char, char)] = &[
|
|||
('Ʞ', 'Ꞵ'),
|
||||
('Ꞷ', 'Ꞷ'),
|
||||
('Ꞹ', 'Ꞹ'),
|
||||
('\u{a7ba}', '\u{a7ba}'),
|
||||
('\u{a7bc}', '\u{a7bc}'),
|
||||
('\u{a7be}', '\u{a7be}'),
|
||||
('\u{a7c2}', '\u{a7c2}'),
|
||||
('\u{a7c4}', '\u{a7c6}'),
|
||||
('Ꞻ', 'Ꞻ'),
|
||||
('Ꞽ', 'Ꞽ'),
|
||||
('Ꞿ', 'Ꞿ'),
|
||||
('Ꟃ', 'Ꟃ'),
|
||||
('Ꞔ', '\u{a7c7}'),
|
||||
('\u{a7c9}', '\u{a7c9}'),
|
||||
('\u{a7f5}', '\u{a7f5}'),
|
||||
('A', 'Z'),
|
||||
('𐐀', '𐐧'),
|
||||
('𐒰', '𐓓'),
|
||||
|
|
|
@ -1,8 +1,10 @@
|
|||
// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
|
||||
//
|
||||
// ucd-generate word-break /tmp/ucd/12.1.0/ --chars
|
||||
// ucd-generate word-break ucd-13.0.0 --chars
|
||||
//
|
||||
// ucd-generate is available on crates.io.
|
||||
// Unicode version: 13.0.0.
|
||||
//
|
||||
// ucd-generate 0.2.8 is available on crates.io.
|
||||
|
||||
pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[
|
||||
("ALetter", ALETTER),
|
||||
|
@ -34,8 +36,7 @@ pub const ALETTER: &'static [(char, char)] = &[
|
|||
('À', 'Ö'),
|
||||
('Ø', 'ö'),
|
||||
('ø', '˗'),
|
||||
('˞', 'ˤ'),
|
||||
('ˬ', '˿'),
|
||||
('˞', '˿'),
|
||||
('Ͱ', 'ʹ'),
|
||||
('Ͷ', 'ͷ'),
|
||||
('ͺ', 'ͽ'),
|
||||
|
@ -48,10 +49,10 @@ pub const ALETTER: &'static [(char, char)] = &[
|
|||
('Ϸ', 'ҁ'),
|
||||
('Ҋ', 'ԯ'),
|
||||
('Ա', 'Ֆ'),
|
||||
('ՙ', 'ՙ'),
|
||||
('՛', '՜'),
|
||||
('ՙ', '՜'),
|
||||
('՞', '՞'),
|
||||
('ՠ', 'ֈ'),
|
||||
('֊', '֊'),
|
||||
('׳', '׳'),
|
||||
('ؠ', 'ي'),
|
||||
('ٮ', 'ٯ'),
|
||||
|
@ -75,7 +76,7 @@ pub const ALETTER: &'static [(char, char)] = &[
|
|||
('ࡀ', 'ࡘ'),
|
||||
('ࡠ', 'ࡪ'),
|
||||
('ࢠ', 'ࢴ'),
|
||||
('ࢶ', 'ࢽ'),
|
||||
('ࢶ', '\u{8c7}'),
|
||||
('ऄ', 'ह'),
|
||||
('ऽ', 'ऽ'),
|
||||
('ॐ', 'ॐ'),
|
||||
|
@ -151,7 +152,7 @@ pub const ALETTER: &'static [(char, char)] = &[
|
|||
('ೞ', 'ೞ'),
|
||||
('ೠ', 'ೡ'),
|
||||
('ೱ', 'ೲ'),
|
||||
('അ', 'ഌ'),
|
||||
('\u{d04}', 'ഌ'),
|
||||
('എ', 'ഐ'),
|
||||
('ഒ', 'ഺ'),
|
||||
('ഽ', 'ഽ'),
|
||||
|
@ -223,7 +224,7 @@ pub const ALETTER: &'static [(char, char)] = &[
|
|||
('ᳩ', 'ᳬ'),
|
||||
('ᳮ', 'ᳳ'),
|
||||
('ᳵ', 'ᳶ'),
|
||||
('\u{1cfa}', '\u{1cfa}'),
|
||||
('ᳺ', 'ᳺ'),
|
||||
('ᴀ', 'ᶿ'),
|
||||
('Ḁ', 'ἕ'),
|
||||
('Ἐ', 'Ἕ'),
|
||||
|
@ -286,7 +287,7 @@ pub const ALETTER: &'static [(char, char)] = &[
|
|||
('〻', '〼'),
|
||||
('ㄅ', 'ㄯ'),
|
||||
('ㄱ', 'ㆎ'),
|
||||
('ㆠ', 'ㆺ'),
|
||||
('ㆠ', '\u{31bf}'),
|
||||
('ꀀ', 'ꒌ'),
|
||||
('ꓐ', 'ꓽ'),
|
||||
('ꔀ', 'ꘌ'),
|
||||
|
@ -295,9 +296,9 @@ pub const ALETTER: &'static [(char, char)] = &[
|
|||
('Ꙁ', 'ꙮ'),
|
||||
('ꙿ', 'ꚝ'),
|
||||
('ꚠ', 'ꛯ'),
|
||||
('ꜗ', '\u{a7bf}'),
|
||||
('\u{a7c2}', '\u{a7c6}'),
|
||||
('ꟷ', 'ꠁ'),
|
||||
('꜈', 'ꞿ'),
|
||||
('Ꟃ', '\u{a7ca}'),
|
||||
('\u{a7f5}', 'ꠁ'),
|
||||
('ꠃ', 'ꠅ'),
|
||||
('ꠇ', 'ꠊ'),
|
||||
('ꠌ', 'ꠢ'),
|
||||
|
@ -321,7 +322,7 @@ pub const ALETTER: &'static [(char, char)] = &[
|
|||
('ꬑ', 'ꬖ'),
|
||||
('ꬠ', 'ꬦ'),
|
||||
('ꬨ', 'ꬮ'),
|
||||
('ꬰ', '\u{ab67}'),
|
||||
('ꬰ', '\u{ab69}'),
|
||||
('ꭰ', 'ꯢ'),
|
||||
('가', '힣'),
|
||||
('ힰ', 'ퟆ'),
|
||||
|
@ -397,15 +398,19 @@ pub const ALETTER: &'static [(char, char)] = &[
|
|||
('𐲀', '𐲲'),
|
||||
('𐳀', '𐳲'),
|
||||
('𐴀', '𐴣'),
|
||||
('\u{10e80}', '\u{10ea9}'),
|
||||
('\u{10eb0}', '\u{10eb1}'),
|
||||
('𐼀', '𐼜'),
|
||||
('𐼧', '𐼧'),
|
||||
('𐼰', '𐽅'),
|
||||
('\u{10fe0}', '\u{10ff6}'),
|
||||
('\u{10fb0}', '\u{10fc4}'),
|
||||
('𐿠', '𐿶'),
|
||||
('𑀃', '𑀷'),
|
||||
('𑂃', '𑂯'),
|
||||
('𑃐', '𑃨'),
|
||||
('𑄃', '𑄦'),
|
||||
('𑅄', '𑅄'),
|
||||
('\u{11147}', '\u{11147}'),
|
||||
('𑅐', '𑅲'),
|
||||
('𑅶', '𑅶'),
|
||||
('𑆃', '𑆲'),
|
||||
|
@ -431,7 +436,7 @@ pub const ALETTER: &'static [(char, char)] = &[
|
|||
('𑍝', '𑍡'),
|
||||
('𑐀', '𑐴'),
|
||||
('𑑇', '𑑊'),
|
||||
('\u{1145f}', '\u{1145f}'),
|
||||
('𑑟', '\u{11461}'),
|
||||
('𑒀', '𑒯'),
|
||||
('𑓄', '𑓅'),
|
||||
('𑓇', '𑓇'),
|
||||
|
@ -440,14 +445,20 @@ pub const ALETTER: &'static [(char, char)] = &[
|
|||
('𑘀', '𑘯'),
|
||||
('𑙄', '𑙄'),
|
||||
('𑚀', '𑚪'),
|
||||
('\u{116b8}', '\u{116b8}'),
|
||||
('𑚸', '𑚸'),
|
||||
('𑠀', '𑠫'),
|
||||
('𑢠', '𑣟'),
|
||||
('𑣿', '𑣿'),
|
||||
('\u{119a0}', '\u{119a7}'),
|
||||
('\u{119aa}', '\u{119d0}'),
|
||||
('\u{119e1}', '\u{119e1}'),
|
||||
('\u{119e3}', '\u{119e3}'),
|
||||
('𑣿', '\u{11906}'),
|
||||
('\u{11909}', '\u{11909}'),
|
||||
('\u{1190c}', '\u{11913}'),
|
||||
('\u{11915}', '\u{11916}'),
|
||||
('\u{11918}', '\u{1192f}'),
|
||||
('\u{1193f}', '\u{1193f}'),
|
||||
('\u{11941}', '\u{11941}'),
|
||||
('𑦠', '𑦧'),
|
||||
('𑦪', '𑧐'),
|
||||
('𑧡', '𑧡'),
|
||||
('𑧣', '𑧣'),
|
||||
('𑨀', '𑨀'),
|
||||
('𑨋', '𑨲'),
|
||||
('𑨺', '𑨺'),
|
||||
|
@ -468,6 +479,7 @@ pub const ALETTER: &'static [(char, char)] = &[
|
|||
('𑵪', '𑶉'),
|
||||
('𑶘', '𑶘'),
|
||||
('𑻠', '𑻲'),
|
||||
('\u{11fb0}', '\u{11fb0}'),
|
||||
('𒀀', '𒎙'),
|
||||
('𒐀', '𒑮'),
|
||||
('𒒀', '𒕃'),
|
||||
|
@ -481,11 +493,11 @@ pub const ALETTER: &'static [(char, char)] = &[
|
|||
('𖭣', '𖭷'),
|
||||
('𖭽', '𖮏'),
|
||||
('𖹀', '𖹿'),
|
||||
('𖼀', '\u{16f4a}'),
|
||||
('𖼀', '𖽊'),
|
||||
('𖽐', '𖽐'),
|
||||
('𖾓', '𖾟'),
|
||||
('𖿠', '𖿡'),
|
||||
('\u{16fe3}', '\u{16fe3}'),
|
||||
('𖿣', '𖿣'),
|
||||
('𛰀', '𛱪'),
|
||||
('𛱰', '𛱼'),
|
||||
('𛲀', '𛲈'),
|
||||
|
@ -520,13 +532,13 @@ pub const ALETTER: &'static [(char, char)] = &[
|
|||
('𝞊', '𝞨'),
|
||||
('𝞪', '𝟂'),
|
||||
('𝟄', '𝟋'),
|
||||
('\u{1e100}', '\u{1e12c}'),
|
||||
('\u{1e137}', '\u{1e13d}'),
|
||||
('\u{1e14e}', '\u{1e14e}'),
|
||||
('\u{1e2c0}', '\u{1e2eb}'),
|
||||
('𞄀', '𞄬'),
|
||||
('𞄷', '𞄽'),
|
||||
('𞅎', '𞅎'),
|
||||
('𞋀', '𞋫'),
|
||||
('𞠀', '𞣄'),
|
||||
('𞤀', '𞥃'),
|
||||
('\u{1e94b}', '\u{1e94b}'),
|
||||
('𞥋', '𞥋'),
|
||||
('𞸀', '𞸃'),
|
||||
('𞸅', '𞸟'),
|
||||
('𞸡', '𞸢'),
|
||||
|
@ -628,7 +640,7 @@ pub const EXTEND: &'static [(char, char)] = &[
|
|||
('\u{b3e}', '\u{b44}'),
|
||||
('େ', 'ୈ'),
|
||||
('ୋ', '\u{b4d}'),
|
||||
('\u{b56}', '\u{b57}'),
|
||||
('\u{b55}', '\u{b57}'),
|
||||
('\u{b62}', '\u{b63}'),
|
||||
('\u{b82}', '\u{b82}'),
|
||||
('\u{bbe}', 'ூ'),
|
||||
|
@ -655,7 +667,7 @@ pub const EXTEND: &'static [(char, char)] = &[
|
|||
('ൊ', '\u{d4d}'),
|
||||
('\u{d57}', '\u{d57}'),
|
||||
('\u{d62}', '\u{d63}'),
|
||||
('ං', 'ඃ'),
|
||||
('\u{d81}', 'ඃ'),
|
||||
('\u{dca}', '\u{dca}'),
|
||||
('\u{dcf}', '\u{dd4}'),
|
||||
('\u{dd6}', '\u{dd6}'),
|
||||
|
@ -702,7 +714,7 @@ pub const EXTEND: &'static [(char, char)] = &[
|
|||
('ᩕ', '\u{1a5e}'),
|
||||
('\u{1a60}', '\u{1a7c}'),
|
||||
('\u{1a7f}', '\u{1a7f}'),
|
||||
('\u{1ab0}', '\u{1abe}'),
|
||||
('\u{1ab0}', '\u{1ac0}'),
|
||||
('\u{1b00}', 'ᬄ'),
|
||||
('\u{1b34}', '᭄'),
|
||||
('\u{1b6b}', '\u{1b73}'),
|
||||
|
@ -732,6 +744,7 @@ pub const EXTEND: &'static [(char, char)] = &[
|
|||
('\u{a806}', '\u{a806}'),
|
||||
('\u{a80b}', '\u{a80b}'),
|
||||
('ꠣ', 'ꠧ'),
|
||||
('\u{a82c}', '\u{a82c}'),
|
||||
('ꢀ', 'ꢁ'),
|
||||
('ꢴ', '\u{a8c5}'),
|
||||
('\u{a8e0}', '\u{a8f1}'),
|
||||
|
@ -768,6 +781,7 @@ pub const EXTEND: &'static [(char, char)] = &[
|
|||
('\u{10a3f}', '\u{10a3f}'),
|
||||
('\u{10ae5}', '\u{10ae6}'),
|
||||
('\u{10d24}', '\u{10d27}'),
|
||||
('\u{10eab}', '\u{10eac}'),
|
||||
('\u{10f46}', '\u{10f50}'),
|
||||
('𑀀', '𑀂'),
|
||||
('\u{11038}', '\u{11046}'),
|
||||
|
@ -780,6 +794,7 @@ pub const EXTEND: &'static [(char, char)] = &[
|
|||
('\u{11180}', '𑆂'),
|
||||
('𑆳', '𑇀'),
|
||||
('\u{111c9}', '\u{111cc}'),
|
||||
('\u{111ce}', '\u{111cf}'),
|
||||
('𑈬', '\u{11237}'),
|
||||
('\u{1123e}', '\u{1123e}'),
|
||||
('\u{112df}', '\u{112ea}'),
|
||||
|
@ -802,9 +817,14 @@ pub const EXTEND: &'static [(char, char)] = &[
|
|||
('\u{116ab}', '\u{116b7}'),
|
||||
('\u{1171d}', '\u{1172b}'),
|
||||
('𑠬', '\u{1183a}'),
|
||||
('\u{119d1}', '\u{119d7}'),
|
||||
('\u{11930}', '\u{11935}'),
|
||||
('\u{11937}', '\u{11938}'),
|
||||
('\u{1193b}', '\u{1193e}'),
|
||||
('\u{11940}', '\u{11940}'),
|
||||
('\u{11942}', '\u{11943}'),
|
||||
('𑧑', '\u{119d7}'),
|
||||
('\u{119da}', '\u{119e0}'),
|
||||
('\u{119e4}', '\u{119e4}'),
|
||||
('𑧤', '𑧤'),
|
||||
('\u{11a01}', '\u{11a0a}'),
|
||||
('\u{11a33}', '𑨹'),
|
||||
('\u{11a3b}', '\u{11a3e}'),
|
||||
|
@ -827,8 +847,10 @@ pub const EXTEND: &'static [(char, char)] = &[
|
|||
('\u{16af0}', '\u{16af4}'),
|
||||
('\u{16b30}', '\u{16b36}'),
|
||||
('\u{16f4f}', '\u{16f4f}'),
|
||||
('𖽑', '\u{16f87}'),
|
||||
('𖽑', '𖾇'),
|
||||
('\u{16f8f}', '\u{16f92}'),
|
||||
('\u{16fe4}', '\u{16fe4}'),
|
||||
('\u{16ff0}', '\u{16ff1}'),
|
||||
('\u{1bc9d}', '\u{1bc9e}'),
|
||||
('\u{1d165}', '\u{1d169}'),
|
||||
('𝅭', '\u{1d172}'),
|
||||
|
@ -911,7 +933,7 @@ pub const KATAKANA: &'static [(char, char)] = &[
|
|||
('㌀', '㍗'),
|
||||
('ヲ', 'ン'),
|
||||
('𛀀', '𛀀'),
|
||||
('\u{1b164}', '\u{1b167}'),
|
||||
('𛅤', '𛅧'),
|
||||
];
|
||||
|
||||
pub const LF: &'static [(char, char)] = &[('\n', '\n')];
|
||||
|
@ -920,6 +942,7 @@ pub const MIDLETTER: &'static [(char, char)] = &[
|
|||
(':', ':'),
|
||||
('·', '·'),
|
||||
('·', '·'),
|
||||
('՟', '՟'),
|
||||
('״', '״'),
|
||||
('‧', '‧'),
|
||||
('︓', '︓'),
|
||||
|
@ -1008,15 +1031,17 @@ pub const NUMERIC: &'static [(char, char)] = &[
|
|||
('𑛀', '𑛉'),
|
||||
('𑜰', '𑜹'),
|
||||
('𑣠', '𑣩'),
|
||||
('\u{11950}', '\u{11959}'),
|
||||
('𑱐', '𑱙'),
|
||||
('𑵐', '𑵙'),
|
||||
('𑶠', '𑶩'),
|
||||
('𖩠', '𖩩'),
|
||||
('𖭐', '𖭙'),
|
||||
('𝟎', '𝟿'),
|
||||
('\u{1e140}', '\u{1e149}'),
|
||||
('\u{1e2f0}', '\u{1e2f9}'),
|
||||
('𞅀', '𞅉'),
|
||||
('𞋰', '𞋹'),
|
||||
('𞥐', '𞥙'),
|
||||
('\u{1fbf0}', '\u{1fbf9}'),
|
||||
];
|
||||
|
||||
pub const REGIONAL_INDICATOR: &'static [(char, char)] = &[('🇦', '🇿')];
|
||||
|
|
|
@@ -15,7 +15,7 @@ whether a particular byte sequence was a Cyrillic character. One possible
scalar value range is `[0400-04FF]`. The set of allowed bytes for this
range can be expressed as a sequence of byte ranges:

```ignore
```text
[D0-D3][80-BF]
```

@@ -32,7 +32,7 @@ for example, `04FF` (because its last byte, `BF` isn't in the range `80-AF`).

Instead, you need multiple sequences of byte ranges:

```ignore
```text
[D0-D3][80-BF] # matches codepoints 0400-04FF
[D4][80-AF] # matches codepoints 0500-052F
```
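Not part of this diff: the decomposition described above can be reproduced through the crate's public `utf8` module. The following is a minimal sketch, assuming the published crate is imported as `regex_syntax`; the `Debug` output of each sequence mirrors the `[D0-D3][80-BF]` notation used in these docs.

```rust
use regex_syntax::utf8::Utf8Sequences;

fn main() {
    // Yields one Utf8Sequence per contiguous run of byte ranges:
    // [D0-D3][80-BF] for 0400-04FF, then [D4][80-AF] for 0500-052F.
    for seq in Utf8Sequences::new('\u{0400}', '\u{052F}') {
        println!("{:?}", seq);
    }
}
```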
@@ -41,7 +41,7 @@ This gets even more complicated if you want bigger ranges, particularly if
they naively contain surrogate codepoints. For example, the sequence of byte
ranges for the basic multilingual plane (`[0000-FFFF]`) look like this:

```ignore
```text
[0-7F]
[C2-DF][80-BF]
[E0][A0-BF][80-BF]
@@ -55,7 +55,7 @@ UTF-8, including encodings of surrogate codepoints.

And, of course, for all of Unicode (`[000000-10FFFF]`):

```ignore
```text
[0-7F]
[C2-DF][80-BF]
[E0][A0-BF][80-BF]
@@ -84,6 +84,7 @@ which uses it for executing automata on their term index.
use std::char;
use std::fmt;
use std::iter::FusedIterator;
use std::slice;

const MAX_UTF8_BYTES: usize = 4;
@@ -152,6 +153,31 @@ impl Utf8Sequence {
        self.as_slice().len()
    }

    /// Reverses the ranges in this sequence.
    ///
    /// For example, if this corresponds to the following sequence:
    ///
    /// ```text
    /// [D0-D3][80-BF]
    /// ```
    ///
    /// Then after reversal, it will be
    ///
    /// ```text
    /// [80-BF][D0-D3]
    /// ```
    ///
    /// This is useful when one is constructing a UTF-8 automaton to match
    /// character classes in reverse.
    pub fn reverse(&mut self) {
        match *self {
            Utf8Sequence::One(_) => {}
            Utf8Sequence::Two(ref mut x) => x.reverse(),
            Utf8Sequence::Three(ref mut x) => x.reverse(),
            Utf8Sequence::Four(ref mut x) => x.reverse(),
        }
    }

    /// Returns true if and only if a prefix of `bytes` matches this sequence
    /// of byte ranges.
    pub fn matches(&self, bytes: &[u8]) -> bool {
@@ -177,7 +203,7 @@ impl<'a> IntoIterator for &'a Utf8Sequence {
}

impl fmt::Debug for Utf8Sequence {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        use self::Utf8Sequence::*;
        match *self {
            One(ref r) => write!(f, "{:?}", r),
@@ -201,7 +227,7 @@ pub struct Utf8Range {

impl Utf8Range {
    fn new(start: u8, end: u8) -> Self {
        Utf8Range { start: start, end: end }
        Utf8Range { start, end }
    }

    /// Returns true if and only if the given byte is in this range.
@@ -211,7 +237,7 @@ impl Utf8Range {
}

impl fmt::Debug for Utf8Range {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        if self.start == self.end {
            write!(f, "[{:X}]", self.start)
        } else {
@@ -270,6 +296,7 @@ impl fmt::Debug for Utf8Range {
/// illustrative. In practice, you could just try to decode your byte sequence
/// and compare it with the scalar value range directly. However, this is not
/// always possible (for example, in a byte based automaton).
#[derive(Debug)]
pub struct Utf8Sequences {
    range_stack: Vec<ScalarRange>,
}
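To make the byte-level use case concrete, here is a small hypothetical helper (not code from the crate) built only on `Utf8Sequences::new` and `Utf8Sequence::matches`, both visible in this file: it checks raw bytes against the precomputed byte-range sequences without decoding them first.

```rust
use regex_syntax::utf8::Utf8Sequences;

/// Returns true if `bytes` begins with the UTF-8 encoding of a scalar
/// value in the Cyrillic range U+0400..=U+04FF, without decoding.
fn starts_with_cyrillic(bytes: &[u8]) -> bool {
    Utf8Sequences::new('\u{0400}', '\u{04FF}').any(|seq| seq.matches(bytes))
}

fn main() {
    assert!(starts_with_cyrillic("Жуков".as_bytes()));
    assert!(!starts_with_cyrillic(b"Zhukov"));
}
```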
@ -294,7 +321,7 @@ impl Utf8Sequences {
|
|||
}
|
||||
|
||||
fn push(&mut self, start: u32, end: u32) {
|
||||
self.range_stack.push(ScalarRange { start: start, end: end });
|
||||
self.range_stack.push(ScalarRange { start, end });
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -304,7 +331,7 @@ struct ScalarRange {
|
|||
}
|
||||
|
||||
impl fmt::Debug for ScalarRange {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
write!(f, "ScalarRange({:X}, {:X})", self.start, self.end)
|
||||
}
|
||||
}
|
||||
|
@ -363,6 +390,8 @@ impl Iterator for Utf8Sequences {
|
|||
}
|
||||
}
|
||||
|
||||
impl FusedIterator for Utf8Sequences {}
|
||||
|
||||
impl ScalarRange {
|
||||
/// split splits this range if it overlaps with a surrogate codepoint.
|
||||
///
|
||||
|
@ -428,7 +457,7 @@ fn max_scalar_value(nbytes: usize) -> u32 {
|
|||
mod tests {
|
||||
use std::char;
|
||||
|
||||
use utf8::{Utf8Range, Utf8Sequences};
|
||||
use crate::utf8::{Utf8Range, Utf8Sequences};
|
||||
|
||||
fn rutf8(s: u8, e: u8) -> Utf8Range {
|
||||
Utf8Range::new(s, e)
|
||||
|
@ -475,7 +504,7 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn bmp() {
|
||||
use utf8::Utf8Sequence::*;
|
||||
use crate::utf8::Utf8Sequence::*;
|
||||
|
||||
let seqs = Utf8Sequences::new('\u{0}', '\u{FFFF}').collect::<Vec<_>>();
|
||||
assert_eq!(
|
||||
|
@ -507,6 +536,43 @@ mod tests {
|
|||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn reverse() {
|
||||
use crate::utf8::Utf8Sequence::*;
|
||||
|
||||
let mut s = One(rutf8(0xA, 0xB));
|
||||
s.reverse();
|
||||
assert_eq!(s.as_slice(), &[rutf8(0xA, 0xB)]);
|
||||
|
||||
let mut s = Two([rutf8(0xA, 0xB), rutf8(0xB, 0xC)]);
|
||||
s.reverse();
|
||||
assert_eq!(s.as_slice(), &[rutf8(0xB, 0xC), rutf8(0xA, 0xB)]);
|
||||
|
||||
let mut s = Three([rutf8(0xA, 0xB), rutf8(0xB, 0xC), rutf8(0xC, 0xD)]);
|
||||
s.reverse();
|
||||
assert_eq!(
|
||||
s.as_slice(),
|
||||
&[rutf8(0xC, 0xD), rutf8(0xB, 0xC), rutf8(0xA, 0xB)]
|
||||
);
|
||||
|
||||
let mut s = Four([
|
||||
rutf8(0xA, 0xB),
|
||||
rutf8(0xB, 0xC),
|
||||
rutf8(0xC, 0xD),
|
||||
rutf8(0xD, 0xE),
|
||||
]);
|
||||
s.reverse();
|
||||
assert_eq!(
|
||||
s.as_slice(),
|
||||
&[
|
||||
rutf8(0xD, 0xE),
|
||||
rutf8(0xC, 0xD),
|
||||
rutf8(0xB, 0xC),
|
||||
rutf8(0xA, 0xB)
|
||||
]
|
||||
);
|
||||
}
|
||||
|
||||
fn encode_surrogate(cp: u32) -> [u8; 3] {
|
||||
const TAG_CONT: u8 = 0b1000_0000;
|
||||
const TAG_THREE_B: u8 = 0b1110_0000;
|
||||
|
|
|
@@ -1,5 +1,7 @@
#!/bin/bash

set -e

# This is a convenience script for running a broad swath of the syntax tests.
echo "===== DEFAULT FEATURES ==="
cargo test
File diffs are hidden because one or more lines are too long
|
@@ -1,3 +1,235 @@
|
|||
1.5.4 (2021-05-06)
|
||||
==================
|
||||
This release fixes another compilation failure when building regex. This time,
|
||||
the fix is for when the `pattern` feature is enabled, which only works on
|
||||
nightly Rust. CI has been updated to test this case.
|
||||
|
||||
* [BUG #772](https://github.com/rust-lang/regex/pull/772):
|
||||
Fix build when `pattern` feature is enabled.
|
||||
|
||||
|
||||
1.5.3 (2021-05-01)
|
||||
==================
|
||||
This releases fixes a bug when building regex with only the `unicode-perl`
|
||||
feature. It turns out that while CI was building this configuration, it wasn't
|
||||
actually failing the overall build on a failed compilation.
|
||||
|
||||
* [BUG #769](https://github.com/rust-lang/regex/issues/769):
|
||||
Fix build in `regex-syntax` when only the `unicode-perl` feature is enabled.
|
||||
|
||||
|
||||
1.5.2 (2021-05-01)
|
||||
==================
|
||||
This release fixes a performance bug when Unicode word boundaries are used.
|
||||
Namely, for certain regexes on certain inputs, it's possible for the lazy DFA
|
||||
to stop searching (causing a fallback to a slower engine) when it doesn't
|
||||
actually need to.
|
||||
|
||||
[PR #768](https://github.com/rust-lang/regex/pull/768) fixes the bug, which was
|
||||
originally reported in
|
||||
[ripgrep#1860](https://github.com/BurntSushi/ripgrep/issues/1860).
|
||||
|
||||
|
||||
1.5.1 (2021-04-30)
|
||||
==================
|
||||
This is a patch release that fixes a compilation error when the `perf-literal`
|
||||
feature is not enabled.
|
||||
|
||||
|
||||
1.5.0 (2021-04-30)
|
||||
==================
|
||||
This release primarily updates to Rust 2018 (finally) and bumps the MSRV to
|
||||
Rust 1.41 (from Rust 1.28). Rust 1.41 was chosen because it's still reasonably
|
||||
old, and is what's in Debian stable at the time of writing.
|
||||
|
||||
This release also drops this crate's own bespoke substring search algorithms
|
||||
in favor of a new
|
||||
[`memmem` implementation provided by the `memchr` crate](https://docs.rs/memchr/2.4.0/memchr/memmem/index.html).
|
||||
This will change the performance profile of some regexes, sometimes getting a
|
||||
little worse, and hopefully more frequently, getting a lot better. Please
|
||||
report any serious performance regressions if you find them.
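For readers unfamiliar with the replacement, the linked `memchr::memmem` module is a plain substring search. A tiny sketch of that dependency's API (not code from this change, and assuming memchr 2.4+):

```rust
use memchr::memmem;

fn main() {
    let haystack = b"pattern matching with memmem";
    // memmem::find returns the byte offset of the first occurrence, if any.
    assert_eq!(memmem::find(haystack, b"memmem"), Some(22));
    assert_eq!(memmem::find(haystack, b"regex"), None);
}
```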
|
||||
|
||||
|
||||
1.4.6 (2021-04-22)
|
||||
==================
|
||||
This is a small patch release that fixes the compiler's size check on how much
|
||||
heap memory a regex uses. Previously, the compiler did not account for the
|
||||
heap usage of Unicode character classes. Now it does. It's possible that this
|
||||
may make some regexes fail to compile that previously did compile. If that
|
||||
happens, please file an issue.
|
||||
|
||||
* [BUG OSS-fuzz#33579](https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=33579):
|
||||
Some regexes can use more heap memory than one would expect.
|
||||
|
||||
|
||||
1.4.5 (2021-03-14)
|
||||
==================
|
||||
This is a small patch release that fixes a regression in the size of a `Regex`
|
||||
in the 1.4.4 release. Prior to 1.4.4, a `Regex` was 552 bytes. In the 1.4.4
|
||||
release, it was 856 bytes due to internal changes. In this release, a `Regex`
|
||||
is now 16 bytes. In general, the size of a `Regex` was never something that was
|
||||
on my radar, but this increased size in the 1.4.4 release seems to have crossed
|
||||
a threshold and resulted in stack overflows in some programs.
|
||||
|
||||
* [BUG #750](https://github.com/rust-lang/regex/pull/750):
|
||||
Fixes stack overflows seemingly caused by a large `Regex` size by decreasing
|
||||
its size.
|
||||
|
||||
|
||||
1.4.4 (2021-03-11)
|
||||
==================
|
||||
This is a small patch release that contains some bug fixes. Notably, it also
|
||||
drops the `thread_local` (and `lazy_static`, via transitivity) dependencies.
|
||||
|
||||
Bug fixes:
|
||||
|
||||
* [BUG #362](https://github.com/rust-lang/regex/pull/362):
|
||||
Memory leaks caused by an internal caching strategy should now be fixed.
|
||||
* [BUG #576](https://github.com/rust-lang/regex/pull/576):
|
||||
All regex types now implement `UnwindSafe` and `RefUnwindSafe`.
|
||||
* [BUG #728](https://github.com/rust-lang/regex/pull/749):
|
||||
Add missing `Replacer` impls for `Vec<u8>`, `String`, `Cow`, etc.
|
||||
|
||||
|
||||
1.4.3 (2021-01-08)
|
||||
==================
|
||||
This is a small patch release that adds some missing standard trait
|
||||
implementations for some types in the public API.
|
||||
|
||||
Bug fixes:
|
||||
|
||||
* [BUG #734](https://github.com/rust-lang/regex/pull/734):
|
||||
Add `FusedIterator` and `ExactSizeIterator` impls to iterator types.
|
||||
* [BUG #735](https://github.com/rust-lang/regex/pull/735):
|
||||
Add missing `Debug` impls to public API types.
|
||||
|
||||
|
||||
1.4.2 (2020-11-01)
|
||||
==================
|
||||
This is a small bug fix release that bans `\P{any}`. We previously banned empty
|
||||
classes like `[^\w\W]`, but missed the `\P{any}` case. In the future, we hope
|
||||
to permit empty classes.
|
||||
|
||||
* [BUG #722](https://github.com/rust-lang/regex/issues/722):
|
||||
Ban `\P{any}` to avoid a panic in the regex compiler. Found by OSS-Fuzz.
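Not part of the upstream changelog: a minimal sketch of the behavior described above, assuming regex 1.4.2 or newer.

```rust
use regex::Regex;

fn main() {
    // Both patterns describe an empty character class, which the compiler
    // now rejects instead of panicking.
    assert!(Regex::new(r"[^\w\W]").is_err());
    assert!(Regex::new(r"\P{any}").is_err());
}
```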
|
||||
|
||||
|
||||
1.4.1 (2020-10-13)
|
||||
==================
|
||||
This is a small bug fix release that makes `\p{cf}` work. Previously, it would
|
||||
report "property not found" even though `cf` is a valid abbreviation for the
|
||||
`Format` general category.
|
||||
|
||||
* [BUG #719](https://github.com/rust-lang/regex/issues/719):
|
||||
Fixes bug that prevented `\p{cf}` from working.
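A short sketch of the fix above, assuming regex 1.4.1 or newer; the test character (U+00AD SOFT HYPHEN) is chosen here only because it belongs to the `Format` category.

```rust
use regex::Regex;

fn main() {
    // `cf` is the abbreviation for the `Format` general category.
    let re = Regex::new(r"\p{cf}").unwrap();
    assert!(re.is_match("\u{00AD}")); // SOFT HYPHEN has General_Category=Cf
}
```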
|
||||
|
||||
|
||||
1.4.0 (2020-10-11)
|
||||
==================
|
||||
This releases has a few minor documentation fixes as well as some very minor
|
||||
API additions. The MSRV remains at Rust 1.28 for now, but this is intended to
|
||||
increase to at least Rust 1.41.1 soon.
|
||||
|
||||
This release also adds support for OSS-Fuzz. Kudos to
|
||||
[@DavidKorczynski](https://github.com/DavidKorczynski)
|
||||
for doing the heavy lifting for that!
|
||||
|
||||
New features:
|
||||
|
||||
* [FEATURE #649](https://github.com/rust-lang/regex/issues/649):
|
||||
Support `[`, `]` and `.` in capture group names.
|
||||
* [FEATURE #687](https://github.com/rust-lang/regex/issues/687):
|
||||
Add `is_empty` predicate to `RegexSet`.
|
||||
* [FEATURE #689](https://github.com/rust-lang/regex/issues/689):
|
||||
Implement `Clone` for `SubCaptureMatches`.
|
||||
* [FEATURE #715](https://github.com/rust-lang/regex/issues/715):
|
||||
Add `empty` constructor to `RegexSet` for convenience.
|
||||
|
||||
Bug fixes:
|
||||
|
||||
* [BUG #694](https://github.com/rust-lang/regex/issues/694):
|
||||
Fix doc example for `Replacer::replace_append`.
|
||||
* [BUG #698](https://github.com/rust-lang/regex/issues/698):
|
||||
Clarify docs for `s` flag when using a `bytes::Regex`.
|
||||
* [BUG #711](https://github.com/rust-lang/regex/issues/711):
|
||||
Clarify `is_match` docs to indicate that it can match anywhere in string.
|
||||
|
||||
|
||||
1.3.9 (2020-05-28)
==================
This release fixes an MSRV (Minimum Supported Rust Version) regression in the
1.3.8 release. Namely, while 1.3.8 compiles on Rust 1.28, it actually does not
compile on other Rust versions, such as Rust 1.39.

Bug fixes:

* [BUG #685](https://github.com/rust-lang/regex/issues/685):
  Remove use of `doc_comment` crate, which cannot be used before Rust 1.43.


1.3.8 (2020-05-28)
==================
This release contains a couple of important bug fixes driven
by better support for empty sub-expressions in regexes. For
example, regexes like `b|` are now allowed. Major thanks to
[@sliquister](https://github.com/sliquister) for implementing support for this
in [#677](https://github.com/rust-lang/regex/pull/677).

Bug fixes:

* [BUG #523](https://github.com/rust-lang/regex/pull/523):
  Add note to documentation that spaces can be escaped in `x` mode.
* [BUG #524](https://github.com/rust-lang/regex/issues/524):
  Add support for empty sub-expressions, including empty alternations.
* [BUG #659](https://github.com/rust-lang/regex/issues/659):
  Fix match bug caused by an empty sub-expression miscompilation.


1.3.7 (2020-04-17)
==================
This release contains a small bug fix that fixes how `regex` forwards crate
features to `regex-syntax`. In particular, this will reduce recompilations in
some cases.

Bug fixes:

* [BUG #665](https://github.com/rust-lang/regex/pull/665):
  Fix feature forwarding to `regex-syntax`.


1.3.6 (2020-03-24)
==================
This release contains a sizable (~30%) performance improvement when compiling
some kinds of large regular expressions.

Performance improvements:

* [PERF #657](https://github.com/rust-lang/regex/pull/657):
  Improve the performance of compiling large regular expressions.


1.3.5 (2020-03-12)
==================
This release updates this crate to Unicode 13.

New features:

* [FEATURE #653](https://github.com/rust-lang/regex/pull/653):
  Update `regex-syntax` to Unicode 13.


1.3.4 (2020-01-30)
==================
This is a small bug fix release that fixes a bug related to the scoping of
flags in a regex. Namely, before this fix, a regex like `((?i)a)b` would
match `aB` despite the fact that `b` should not be matched case insensitively.

Bug fixes:

* [BUG #640](https://github.com/rust-lang/regex/issues/640):
  Fix bug related to the scoping of flags in a regex.


1.3.3 (2020-01-09)
==================
This is a small maintenance release that upgrades the dependency on

@ -1,47 +1,33 @@
|
|||
# This file is automatically @generated by Cargo.
|
||||
# It is not intended for manual editing.
|
||||
version = 3
|
||||
|
||||
[[package]]
|
||||
name = "aho-corasick"
|
||||
version = "0.7.6"
|
||||
version = "0.7.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "58fb5e95d83b38284460a5fda7d6470aa0b8844d283a0b614b8535e880800d2d"
|
||||
checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "autocfg"
|
||||
version = "0.1.7"
|
||||
name = "cfg-if"
|
||||
version = "1.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1d49d90015b3c36167a20fe2810c5cd875ad504b39cff3d4eae7977e6b7c1cb2"
|
||||
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
|
||||
|
||||
[[package]]
|
||||
name = "bitflags"
|
||||
version = "1.2.1"
|
||||
name = "getrandom"
|
||||
version = "0.2.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693"
|
||||
|
||||
[[package]]
|
||||
name = "cloudabi"
|
||||
version = "0.0.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f"
|
||||
checksum = "c9495705279e7140bf035dde1f6e750c162df8b625267cd52cc44e0b156732c8"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"cfg-if",
|
||||
"libc",
|
||||
"wasi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "doc-comment"
|
||||
version = "0.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "923dea538cea0aa3025e8685b20d6ee21ef99c4f77e954a30febbaac5ec73a97"
|
||||
|
||||
[[package]]
|
||||
name = "fuchsia-cprng"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a06f77d526c1a601b7c4cdd98f54b5eaabffc14d5f2f0296febdc7f357c6d3ba"
|
||||
|
||||
[[package]]
|
||||
name = "lazy_static"
|
||||
version = "1.4.0"
|
||||
|
@ -50,188 +36,63 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
|
|||
|
||||
[[package]]
|
||||
name = "libc"
|
||||
version = "0.2.66"
|
||||
version = "0.2.80"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d515b1f41455adea1313a4a2ac8a8a477634fbae63cc6100e3aebb207ce61558"
|
||||
checksum = "4d58d1b70b004888f764dfbf6a26a3b0342a1632d33968e4a179d8011c760614"
|
||||
|
||||
[[package]]
|
||||
name = "memchr"
|
||||
version = "2.2.1"
|
||||
version = "2.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "88579771288728879b57485cc7d6b07d648c9f0141eb955f8ab7f9d45394468e"
|
||||
checksum = "b16bd47d9e329435e309c58469fe0791c2d0d1ba96ec0954152a5ae2b04387dc"
|
||||
|
||||
[[package]]
|
||||
name = "quickcheck"
|
||||
version = "0.8.5"
|
||||
version = "1.0.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9c35d9c36a562f37eca96e79f66d5fd56eefbc22560dacc4a864cabd2d277456"
|
||||
checksum = "588f6378e4dd99458b60ec275b4477add41ce4fa9f64dcba6f15adccb19b50d6"
|
||||
dependencies = [
|
||||
"rand",
|
||||
"rand_core 0.4.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand"
|
||||
version = "0.6.5"
|
||||
version = "0.8.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6d71dacdc3c88c1fde3885a3be3fbab9f35724e6ce99467f7d9c5026132184ca"
|
||||
checksum = "0ef9e7e66b4468674bfcb0c81af8b7fa0bb154fa9f28eb840da5c447baeb8d7e"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"libc",
|
||||
"rand_chacha",
|
||||
"rand_core 0.4.2",
|
||||
"rand_hc",
|
||||
"rand_isaac",
|
||||
"rand_jitter",
|
||||
"rand_os",
|
||||
"rand_pcg",
|
||||
"rand_xorshift",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_chacha"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "556d3a1ca6600bfcbab7c7c91ccb085ac7fbbcd70e008a98742e7847f4f7bcef"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"rand_core 0.3.1",
|
||||
"rand_core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_core"
|
||||
version = "0.3.1"
|
||||
version = "0.6.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7a6fdeb83b075e8266dcc8762c22776f6877a63111121f5f8c7411e5be7eed4b"
|
||||
checksum = "34cf66eb183df1c5876e2dcf6b13d57340741e8dc255b48e40a26de954d06ae7"
|
||||
dependencies = [
|
||||
"rand_core 0.4.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_core"
|
||||
version = "0.4.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9c33a3c44ca05fa6f1807d8e6743f3824e8509beca625669633be0acbdf509dc"
|
||||
|
||||
[[package]]
|
||||
name = "rand_hc"
|
||||
version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7b40677c7be09ae76218dc623efbf7b18e34bced3f38883af07bb75630a21bc4"
|
||||
dependencies = [
|
||||
"rand_core 0.3.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_isaac"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ded997c9d5f13925be2a6fd7e66bf1872597f759fd9dd93513dd7e92e5a5ee08"
|
||||
dependencies = [
|
||||
"rand_core 0.3.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_jitter"
|
||||
version = "0.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1166d5c91dc97b88d1decc3285bb0a99ed84b05cfd0bc2341bdf2d43fc41e39b"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"rand_core 0.4.2",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_os"
|
||||
version = "0.1.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7b75f676a1e053fc562eafbb47838d67c84801e38fc1ba459e8f180deabd5071"
|
||||
dependencies = [
|
||||
"cloudabi",
|
||||
"fuchsia-cprng",
|
||||
"libc",
|
||||
"rand_core 0.4.2",
|
||||
"rdrand",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_pcg"
|
||||
version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "abf9b09b01790cfe0364f52bf32995ea3c39f4d2dd011eac241d2914146d0b44"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"rand_core 0.4.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_xorshift"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cbf7e9e623549b0e21f6e97cf8ecf247c1a8fd2e8a992ae265314300b2455d5c"
|
||||
dependencies = [
|
||||
"rand_core 0.3.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rdrand"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "678054eb77286b51581ba43620cc911abf02758c91f93f479767aed0f90458b2"
|
||||
dependencies = [
|
||||
"rand_core 0.3.1",
|
||||
"getrandom",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex"
|
||||
version = "1.3.3"
|
||||
version = "1.5.4"
|
||||
dependencies = [
|
||||
"aho-corasick",
|
||||
"doc-comment",
|
||||
"lazy_static",
|
||||
"memchr",
|
||||
"quickcheck",
|
||||
"rand",
|
||||
"regex-syntax",
|
||||
"thread_local",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex-syntax"
|
||||
version = "0.6.13"
|
||||
version = "0.6.25"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e734e891f5b408a29efbf8309e656876276f49ab6a6ac208600b4419bd893d90"
|
||||
checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b"
|
||||
|
||||
[[package]]
|
||||
name = "thread_local"
|
||||
version = "1.0.0"
|
||||
name = "wasi"
|
||||
version = "0.10.2+wasi-snapshot-preview1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "88ddf1ad580c7e3d1efff877d972bcc93f995556b9087a5a259630985c88ceab"
|
||||
dependencies = [
|
||||
"lazy_static",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "winapi"
|
||||
version = "0.3.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8093091eeb260906a183e6ae1abdba2ef5ef2257a21801128899c3fc699229c6"
|
||||
dependencies = [
|
||||
"winapi-i686-pc-windows-gnu",
|
||||
"winapi-x86_64-pc-windows-gnu",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "winapi-i686-pc-windows-gnu"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
|
||||
|
||||
[[package]]
|
||||
name = "winapi-x86_64-pc-windows-gnu"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
|
||||
checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6"
|
||||
@ -11,10 +11,11 @@
|
|||
# will likely look very different (and much more reasonable)
|
||||
|
||||
[package]
|
||||
edition = "2018"
|
||||
name = "regex"
|
||||
version = "1.3.3"
|
||||
version = "1.5.4"
|
||||
authors = ["The Rust Project Developers"]
|
||||
exclude = ["/.travis.yml", "/appveyor.yml", "/ci/*", "/scripts/*"]
|
||||
exclude = ["/scripts/*", "/.github/*"]
|
||||
autotests = false
|
||||
description = "An implementation of regular expressions for Rust. This implementation uses\nfinite automata and guarantees linear time matching on all inputs.\n"
|
||||
homepage = "https://github.com/rust-lang/regex"
|
||||
|
@ -72,43 +73,38 @@ path = "tests/test_backtrack_bytes.rs"
|
|||
name = "crates-regex"
|
||||
path = "tests/test_crates_regex.rs"
|
||||
[dependencies.aho-corasick]
|
||||
version = "0.7.6"
|
||||
version = "0.7.18"
|
||||
optional = true
|
||||
|
||||
[dependencies.memchr]
|
||||
version = "2.2.1"
|
||||
version = "2.4.0"
|
||||
optional = true
|
||||
|
||||
[dependencies.regex-syntax]
|
||||
version = "0.6.12"
|
||||
version = "0.6.25"
|
||||
default-features = false
|
||||
|
||||
[dependencies.thread_local]
|
||||
version = "1"
|
||||
optional = true
|
||||
[dev-dependencies.doc-comment]
|
||||
version = "0.3"
|
||||
|
||||
[dev-dependencies.lazy_static]
|
||||
version = "1"
|
||||
|
||||
[dev-dependencies.quickcheck]
|
||||
version = "0.8"
|
||||
version = "1.0.3"
|
||||
default-features = false
|
||||
|
||||
[dev-dependencies.rand]
|
||||
version = "0.6.5"
|
||||
version = "0.8.3"
|
||||
features = ["getrandom", "small_rng"]
|
||||
default-features = false
|
||||
|
||||
[features]
|
||||
default = ["std", "perf", "unicode"]
|
||||
default = ["std", "perf", "unicode", "regex-syntax/default"]
|
||||
pattern = []
|
||||
perf = ["perf-cache", "perf-dfa", "perf-inline", "perf-literal"]
|
||||
perf-cache = ["thread_local"]
|
||||
perf-cache = []
|
||||
perf-dfa = []
|
||||
perf-inline = []
|
||||
perf-literal = ["aho-corasick", "memchr"]
|
||||
std = []
|
||||
unicode = ["unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"]
|
||||
unicode = ["unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment", "regex-syntax/unicode"]
|
||||
unicode-age = ["regex-syntax/unicode-age"]
|
||||
unicode-bool = ["regex-syntax/unicode-bool"]
|
||||
unicode-case = ["regex-syntax/unicode-case"]
|
||||
|
@ -118,8 +114,3 @@ unicode-script = ["regex-syntax/unicode-script"]
|
|||
unicode-segment = ["regex-syntax/unicode-segment"]
|
||||
unstable = ["pattern"]
|
||||
use_std = ["std"]
|
||||
[badges.appveyor]
|
||||
repository = "rust-lang-libs/regex"
|
||||
|
||||
[badges.travis-ci]
|
||||
repository = "rust-lang/regex"
|
||||
@ -62,9 +62,7 @@ on how your program is structured. Thankfully, the
|
|||
[`lazy_static`](https://crates.io/crates/lazy_static)
|
||||
crate provides an answer that works well:
|
||||
|
||||
#[macro_use] extern crate lazy_static;
|
||||
extern crate regex;
|
||||
|
||||
use lazy_static::lazy_static;
|
||||
use regex::Regex;
|
||||
|
||||
fn some_helper_function(text: &str) -> bool {
|
||||
|
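The hunk above cuts the doc example off after the function signature; a self-contained version of the pattern being updated might look like the following sketch (the specific pattern string is an illustrative assumption, not the README's exact example):

```rust
use lazy_static::lazy_static;
use regex::Regex;

// Compile the regex once, on first use, instead of on every call.
// The pattern here is illustrative only.
fn some_helper_function(text: &str) -> bool {
    lazy_static! {
        static ref RE: Regex = Regex::new(r"^\d{4}-\d{2}-\d{2}$").unwrap();
    }
    RE.is_match(text)
}

fn main() {
    assert!(some_helper_function("2021-04-22"));
    assert!(!some_helper_function("not a date"));
}
```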
@ -147,9 +145,9 @@ In general, these are ordered from fastest to slowest.
|
|||
`is_match` is fastest because it doesn't actually need to find the start or the
|
||||
end of the leftmost-first match. It can quit immediately after it knows there
|
||||
is a match. For example, given the regex `a+` and the haystack, `aaaaa`, the
|
||||
search will quit after examing the first byte.
|
||||
search will quit after examining the first byte.
|
||||
|
||||
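A rough sketch of that difference from the caller's side — the early-exit behaviour itself is internal, so this only contrasts the two calls discussed here:

```rust
use regex::Regex;

fn main() {
    let re = Regex::new(r"a+").unwrap();
    let haystack = "aaaaa";

    // `is_match` only has to prove that *some* match exists, so it can
    // stop as soon as it sees the first `a`.
    assert!(re.is_match(haystack));

    // `find` must report the start and end of the leftmost-first match,
    // which requires scanning to the end of the `a+` run as well.
    let m = re.find(haystack).unwrap();
    assert_eq!((m.start(), m.end()), (0, 5));
}
```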
In constrast, `find` must return both the start and end location of the
|
||||
In contrast, `find` must return both the start and end location of the
|
||||
leftmost-first match. It can use the DFA matcher for this, but must run it
|
||||
forwards once to find the end of the match *and then run it backwards* to find
|
||||
the start of the match. The two scans and the cost of finding the real end of
|
||||
|
@ -198,7 +196,7 @@ a few examples of regexes that get literal prefixes detected:
|
|||
|
||||
Literals in anchored regexes can also be used for detecting non-matches very
|
||||
quickly. For example, `^foo\w+` and `\w+foo$` may be able to detect a non-match
|
||||
just by examing the first (or last) three bytes of the haystack.
|
||||
just by examining the first (or last) three bytes of the haystack.
|
||||
|
||||
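A minimal sketch of that fast rejection from the API's point of view (the haystacks are made up for illustration):

```rust
use regex::Regex;

fn main() {
    // `^foo\w+` is anchored and starts with the literal "foo", so a
    // haystack that does not begin with those three bytes can be
    // rejected without running the full matching engine.
    let re = Regex::new(r"^foo\w+").unwrap();
    assert!(re.is_match("foobar"));
    assert!(!re.is_match("barfoo"));
}
```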
## Unicode word boundaries may prevent the DFA from being used
|
||||
|
||||
|
|
|
@ -7,11 +7,9 @@ linear time with respect to the size of the regular expression and search text.
|
|||
Much of the syntax and implementation is inspired
|
||||
by [RE2](https://github.com/google/re2).
|
||||
|
||||
[![Build status](https://travis-ci.com/rust-lang/regex.svg?branch=master)](https://travis-ci.com/rust-lang/regex)
|
||||
[![Build status](https://ci.appveyor.com/api/projects/status/github/rust-lang/regex?svg=true)](https://ci.appveyor.com/project/rust-lang-libs/regex)
|
||||
[![Coverage Status](https://coveralls.io/repos/github/rust-lang/regex/badge.svg?branch=master)](https://coveralls.io/github/rust-lang/regex?branch=master)
|
||||
[![Build status](https://github.com/rust-lang/regex/workflows/ci/badge.svg)](https://github.com/rust-lang/regex/actions)
|
||||
[![](https://meritbadge.herokuapp.com/regex)](https://crates.io/crates/regex)
|
||||
[![Rust](https://img.shields.io/badge/rust-1.28.0%2B-blue.svg?maxAge=3600)](https://github.com/rust-lang/regex)
|
||||
[![Rust](https://img.shields.io/badge/rust-1.41.1%2B-blue.svg?maxAge=3600)](https://github.com/rust-lang/regex)
|
||||
|
||||
### Documentation
|
||||
|
||||
|
@ -29,13 +27,7 @@ Add this to your `Cargo.toml`:
|
|||
|
||||
```toml
|
||||
[dependencies]
|
||||
regex = "1"
|
||||
```
|
||||
|
||||
and this to your crate root (if you're using Rust 2015):
|
||||
|
||||
```rust
|
||||
extern crate regex;
|
||||
regex = "1.5"
|
||||
```
|
||||
|
||||
Here's a simple example that matches a date in YYYY-MM-DD format and prints the
|
||||
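The example referred to here is elided by the hunk; a minimal sketch of such a program, assuming the usual capture-group API and not necessarily matching the README's exact code, could be:

```rust
use regex::Regex;

fn main() {
    // Capture the year, month and day of a YYYY-MM-DD date.
    let re = Regex::new(r"^(\d{4})-(\d{2})-(\d{2})$").unwrap();
    if let Some(caps) = re.captures("2021-06-15") {
        println!("year: {}, month: {}, day: {}", &caps[1], &caps[2], &caps[3]);
    }
}
```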
|
@ -230,7 +222,7 @@ The full set of features one can disable are
|
|||
|
||||
### Minimum Rust version policy
|
||||
|
||||
This crate's minimum supported `rustc` version is `1.28.0`.
|
||||
This crate's minimum supported `rustc` version is `1.41.1`.
|
||||
|
||||
The current **tentative** policy is that the minimum Rust version required
|
||||
to use this crate can be increased in minor version updates. For example, if
|
||||
|
@ -247,12 +239,12 @@ supported version of Rust.
|
|||
This project is licensed under either of
|
||||
|
||||
* Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or
|
||||
http://www.apache.org/licenses/LICENSE-2.0)
|
||||
https://www.apache.org/licenses/LICENSE-2.0)
|
||||
* MIT license ([LICENSE-MIT](LICENSE-MIT) or
|
||||
http://opensource.org/licenses/MIT)
|
||||
https://opensource.org/licenses/MIT)
|
||||
|
||||
at your option.
|
||||
|
||||
The data in `regex-syntax/src/unicode_tables/` is licensed under the Unicode
|
||||
License Agreement
|
||||
([LICENSE-UNICODE](http://www.unicode.org/copyright.html#License)).
|
||||
([LICENSE-UNICODE](https://www.unicode.org/copyright.html#License)).
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
# Unicode conformance
|
||||
|
||||
This document describes the regex crate's conformance to Unicode's
|
||||
[UTS#18](http://unicode.org/reports/tr18/)
|
||||
[UTS#18](https://unicode.org/reports/tr18/)
|
||||
report, which lays out 3 levels of support: Basic, Extended and Tailored.
|
||||
|
||||
Full support for Level 1 ("Basic Unicode Support") is provided with two
|
||||
|
@ -10,7 +10,7 @@ exceptions:
|
|||
1. Line boundaries are not Unicode aware. Namely, only the `\n`
|
||||
(`END OF LINE`) character is recognized as a line boundary.
|
||||
2. The compatibility properties specified by
|
||||
[RL1.2a](http://unicode.org/reports/tr18/#RL1.2a)
|
||||
[RL1.2a](https://unicode.org/reports/tr18/#RL1.2a)
|
||||
are ASCII-only definitions.
|
||||
|
||||
Little to no support is provided for either Level 2 or Level 3. For the most
|
||||
|
@ -61,18 +61,18 @@ provide a convenient way to construct character classes of groups of code
|
|||
points specified by Unicode. The regex crate does not provide exhaustive
|
||||
support, but covers a useful subset. In particular:
|
||||
|
||||
* [General categories](http://unicode.org/reports/tr18/#General_Category_Property)
|
||||
* [Scripts and Script Extensions](http://unicode.org/reports/tr18/#Script_Property)
|
||||
* [Age](http://unicode.org/reports/tr18/#Age)
|
||||
* [General categories](https://unicode.org/reports/tr18/#General_Category_Property)
|
||||
* [Scripts and Script Extensions](https://unicode.org/reports/tr18/#Script_Property)
|
||||
* [Age](https://unicode.org/reports/tr18/#Age)
|
||||
* A smattering of boolean properties, including all of those specified by
|
||||
[RL1.2](http://unicode.org/reports/tr18/#RL1.2) explicitly.
|
||||
[RL1.2](https://unicode.org/reports/tr18/#RL1.2) explicitly.
|
||||
|
||||
In all cases, property name and value abbreviations are supported, and all
|
||||
names/values are matched loosely without regard for case, whitespace or
|
||||
underscores. Property name aliases can be found in Unicode's
|
||||
[`PropertyAliases.txt`](http://www.unicode.org/Public/UCD/latest/ucd/PropertyAliases.txt)
|
||||
[`PropertyAliases.txt`](https://www.unicode.org/Public/UCD/latest/ucd/PropertyAliases.txt)
|
||||
file, while property value aliases can be found in Unicode's
|
||||
[`PropertyValueAliases.txt`](http://www.unicode.org/Public/UCD/latest/ucd/PropertyValueAliases.txt)
|
||||
[`PropertyValueAliases.txt`](https://www.unicode.org/Public/UCD/latest/ucd/PropertyValueAliases.txt)
|
||||
file.
|
||||
|
||||
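A small illustration of that loose matching, assuming it applies to both the property name and its value as described above (the `Greek` patterns are illustrative):

```rust
use regex::Regex;

fn main() {
    // The same script class spelled three ways: bare value, the `sc`
    // alias for `Script`, and a differently-cased full name.
    for pattern in &[r"\p{Greek}", r"\p{sc=Greek}", r"\p{script=greek}"] {
        let re = Regex::new(pattern).unwrap();
        assert!(re.is_match("λ"));
        assert!(!re.is_match("x"));
    }
}
```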
The syntax supported is also consistent with the UTS#18 recommendation:
|
||||
|
@ -149,10 +149,10 @@ properties correspond to properties required by RL1.2):
|
|||
|
||||
## RL1.2a Compatibility Properties
|
||||
|
||||
[UTS#18 RL1.2a](http://unicode.org/reports/tr18/#RL1.2a)
|
||||
[UTS#18 RL1.2a](https://unicode.org/reports/tr18/#RL1.2a)
|
||||
|
||||
The regex crate only provides ASCII definitions of the
|
||||
[compatibility properties documented in UTS#18 Annex C](http://unicode.org/reports/tr18/#Compatibility_Properties)
|
||||
[compatibility properties documented in UTS#18 Annex C](https://unicode.org/reports/tr18/#Compatibility_Properties)
|
||||
(sans the `\X` class, for matching grapheme clusters, which isn't provided
|
||||
at all). This is because it seems to be consistent with most other regular
|
||||
expression engines, and in particular, because these are often referred to as
|
||||
|
@ -165,7 +165,7 @@ Their traditional ASCII definition can be used by disabling Unicode. That is,
|
|||
|
||||
## RL1.3 Subtraction and Intersection
|
||||
|
||||
[UTS#18 RL1.3](http://unicode.org/reports/tr18/#Subtraction_and_Intersection)
|
||||
[UTS#18 RL1.3](https://unicode.org/reports/tr18/#Subtraction_and_Intersection)
|
||||
|
||||
The regex crate provides full support for nested character classes, along with
|
||||
union, intersection (`&&`), difference (`--`) and symmetric difference (`~~`)
|
||||
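A hedged sketch of those set operations in use (the particular classes are illustrative):

```rust
use regex::Regex;

fn main() {
    // Intersection: the digits that are also in [^4], i.e. any digit but 4.
    let digits_except_4 = Regex::new(r"[0-9&&[^4]]").unwrap();
    assert!(digits_except_4.is_match("3"));
    assert!(!digits_except_4.is_match("4"));

    // Difference: word characters with the digits removed.
    let word_minus_digit = Regex::new(r"[\w--\d]").unwrap();
    assert!(word_minus_digit.is_match("a"));
    assert!(!word_minus_digit.is_match("7"));

    // Symmetric difference: in exactly one of a-g or b-h, i.e. [ah].
    let sym_diff = Regex::new(r"[a-g~~b-h]").unwrap();
    assert!(sym_diff.is_match("h"));
    assert!(!sym_diff.is_match("d"));
}
```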
|
@ -178,7 +178,7 @@ For example, to match all non-ASCII letters, you could use either
|
|||
|
||||
## RL1.4 Simple Word Boundaries
|
||||
|
||||
[UTS#18 RL1.4](http://unicode.org/reports/tr18/#Simple_Word_Boundaries)
|
||||
[UTS#18 RL1.4](https://unicode.org/reports/tr18/#Simple_Word_Boundaries)
|
||||
|
||||
The regex crate provides basic Unicode aware word boundary assertions. A word
|
||||
boundary assertion can be written as `\b`, or `\B` as its negation. A word
|
||||
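A short sketch of a Unicode-aware `\b` in action (the words are illustrative):

```rust
use regex::Regex;

fn main() {
    // `\b` is Unicode-aware by default, so `é` counts as a word
    // character rather than a boundary.
    let re = Regex::new(r"\bcafé\b").unwrap();
    assert!(re.is_match("un café noir"));
    // No boundary between `é` and `s`, so this does not match.
    assert!(!re.is_match("cafés"));
}
```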
|
@ -196,9 +196,9 @@ the following classes:
|
|||
* `\p{gc:Connector_Punctuation}`
|
||||
|
||||
In particular, this differs slightly from the
|
||||
[prescription given in RL1.4](http://unicode.org/reports/tr18/#Simple_Word_Boundaries)
|
||||
[prescription given in RL1.4](https://unicode.org/reports/tr18/#Simple_Word_Boundaries)
|
||||
but is permissible according to
|
||||
[UTS#18 Annex C](http://unicode.org/reports/tr18/#Compatibility_Properties).
|
||||
[UTS#18 Annex C](https://unicode.org/reports/tr18/#Compatibility_Properties).
|
||||
Namely, it is convenient and simpler to have `\w` and `\b` be in sync with
|
||||
one another.
|
||||
|
||||
|
@ -211,7 +211,7 @@ boundaries is currently sub-optimal on non-ASCII text.
|
|||
|
||||
## RL1.5 Simple Loose Matches
|
||||
|
||||
[UTS#18 RL1.5](http://unicode.org/reports/tr18/#Simple_Loose_Matches)
|
||||
[UTS#18 RL1.5](https://unicode.org/reports/tr18/#Simple_Loose_Matches)
|
||||
|
||||
The regex crate provides full support for case insensitive matching in
|
||||
accordance with RL1.5. That is, it uses the "simple" case folding mapping. The
|
||||
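A brief sketch of what simple case folding buys you with the usual `(?i)` flag (the characters chosen are illustrative):

```rust
use regex::Regex;

fn main() {
    // Simple case folding maps Δ (U+0394) and δ (U+03B4) to the same
    // folded form, so a case-insensitive `δ` matches both.
    let re = Regex::new(r"(?i)δ").unwrap();
    assert!(re.is_match("δ"));
    assert!(re.is_match("Δ"));

    // ASCII behaves as expected as well.
    let re = Regex::new(r"(?i)rust").unwrap();
    assert!(re.is_match("RUST"));
}
```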
|
@ -226,7 +226,7 @@ then all characters classes are case folded as well.
|
|||
|
||||
## RL1.6 Line Boundaries
|
||||
|
||||
[UTS#18 RL1.6](http://unicode.org/reports/tr18/#Line_Boundaries)
|
||||
[UTS#18 RL1.6](https://unicode.org/reports/tr18/#Line_Boundaries)
|
||||
|
||||
The regex crate only provides support for recognizing the `\n` (`END OF LINE`)
|
||||
character as a line boundary. This choice was made mostly for implementation
|
||||
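A small sketch of that `\n`-only behaviour with the `m` flag (assuming `\r\n` input gets no special treatment, as the text implies):

```rust
use regex::Regex;

fn main() {
    // With the `m` flag, `^` and `$` anchor at `\n` only.
    let re = Regex::new(r"(?m)^bar$").unwrap();
    assert!(re.is_match("foo\nbar\nbaz"));

    // With CRLF line endings, the `\r` stays on the line, so `$`
    // does not match immediately after "bar".
    assert!(!re.is_match("foo\r\nbar\r\nbaz"));
}
```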
|
@ -239,7 +239,7 @@ well, and in theory, this could be done efficiently.
|
|||
|
||||
## RL1.7 Code Points
|
||||
|
||||
[UTS#18 RL1.7](http://unicode.org/reports/tr18/#Supplementary_Characters)
|
||||
[UTS#18 RL1.7](https://unicode.org/reports/tr18/#Supplementary_Characters)
|
||||
|
||||
The regex crate provides full support for Unicode code point matching. Namely,
|
||||
the fundamental atom of any match is always a single code point.
|
||||
|
|
|
@ -1,12 +1,10 @@
|
|||
// The Computer Language Benchmarks Game
|
||||
// http://benchmarksgame.alioth.debian.org/
|
||||
// https://benchmarksgame-team.pages.debian.net/benchmarksgame/
|
||||
//
|
||||
// contributed by the Rust Project Developers
|
||||
// contributed by TeXitoi
|
||||
// contributed by BurntSushi
|
||||
|
||||
extern crate regex;
|
||||
|
||||
use std::io::{self, Read};
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
// The Computer Language Benchmarks Game
|
||||
// http://benchmarksgame.alioth.debian.org/
|
||||
// https://benchmarksgame-team.pages.debian.net/benchmarksgame/
|
||||
//
|
||||
// contributed by the Rust Project Developers
|
||||
// contributed by TeXitoi
|
||||
|
@ -10,8 +10,6 @@
|
|||
// replacing them with a single linear scan. i.e., it re-implements
|
||||
// `replace_all`. As a result, this is around 25% faster. ---AG
|
||||
|
||||
extern crate regex;
|
||||
|
||||
use std::io::{self, Read};
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
|
|
|
@ -1,5 +1,3 @@
|
|||
extern crate regex;
|
||||
|
||||
use std::io::{self, Read};
|
||||
|
||||
macro_rules! regex {
|
||||
|
|
|
@ -1,12 +1,10 @@
|
|||
// The Computer Language Benchmarks Game
|
||||
// http://benchmarksgame.alioth.debian.org/
|
||||
// https://benchmarksgame-team.pages.debian.net/benchmarksgame/
|
||||
//
|
||||
// contributed by the Rust Project Developers
|
||||
// contributed by TeXitoi
|
||||
// contributed by BurntSushi
|
||||
|
||||
extern crate regex;
|
||||
|
||||
use std::io::{self, Read};
|
||||
|
||||
macro_rules! regex {
|
||||
|
|
|
@ -1,12 +1,10 @@
|
|||
// The Computer Language Benchmarks Game
|
||||
// http://benchmarksgame.alioth.debian.org/
|
||||
// https://benchmarksgame-team.pages.debian.net/benchmarksgame/
|
||||
//
|
||||
// contributed by the Rust Project Developers
|
||||
// contributed by TeXitoi
|
||||
// contributed by BurntSushi
|
||||
|
||||
extern crate regex;
|
||||
|
||||
use std::io::{self, Read};
|
||||
|
||||
macro_rules! regex {
|
||||
|
|
|
@ -1,12 +1,10 @@
|
|||
// The Computer Language Benchmarks Game
|
||||
// http://benchmarksgame.alioth.debian.org/
|
||||
// https://benchmarksgame-team.pages.debian.net/benchmarksgame/
|
||||
//
|
||||
// contributed by the Rust Project Developers
|
||||
// contributed by TeXitoi
|
||||
// contributed by BurntSushi
|
||||
|
||||
extern crate regex;
|
||||
|
||||
use std::io::{self, Read};
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
|
|
|
@ -16,10 +16,10 @@
|
|||
// the bitset has to be zeroed on each execution, which becomes quite expensive
|
||||
// on large bitsets.
|
||||
|
||||
use exec::ProgramCache;
|
||||
use input::{Input, InputAt};
|
||||
use prog::{InstPtr, Program};
|
||||
use re_trait::Slot;
|
||||
use crate::exec::ProgramCache;
|
||||
use crate::input::{Input, InputAt};
|
||||
use crate::prog::{InstPtr, Program};
|
||||
use crate::re_trait::Slot;
|
||||
|
||||
type Bits = u32;
|
||||
|
||||
|
@ -115,8 +115,8 @@ impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> {
|
|||
// Then we reset all existing allocated space to 0.
|
||||
// Finally, we request more space if we need it.
|
||||
//
|
||||
// This is all a little circuitous, but doing this unsafely
|
||||
// doesn't seem to have a measurable impact on performance.
|
||||
// This is all a little circuitous, but doing this using unchecked
|
||||
// operations doesn't seem to have a measurable impact on performance.
|
||||
// (Probably because backtracking is limited to such small
|
||||
// inputs/regexes in the first place.)
|
||||
let visited_len =
|
||||
|
@ -196,7 +196,7 @@ impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> {
|
|||
}
|
||||
|
||||
fn step(&mut self, mut ip: InstPtr, mut at: InputAt) -> bool {
|
||||
use prog::Inst::*;
|
||||
use crate::prog::Inst::*;
|
||||
loop {
|
||||
// This loop is an optimization to avoid constantly pushing/popping
|
||||
// from the stack. Namely, if we're pushing a job only to run it
|
||||
|
|
|
@ -1,100 +0,0 @@
|
|||
// This module defines a common API for caching internal runtime state.
|
||||
// The `thread_local` crate provides an extremely optimized version of this.
|
||||
// However, if the perf-cache feature is disabled, then we drop the
|
||||
// thread_local dependency and instead use a pretty naive caching mechanism
|
||||
// with a mutex.
|
||||
//
|
||||
// Strictly speaking, the CachedGuard isn't necessary for the much more
|
||||
// flexible thread_local API, but implementing thread_local's API doesn't
|
||||
// seem possible in purely safe code.
|
||||
|
||||
pub use self::imp::{Cached, CachedGuard};
|
||||
|
||||
#[cfg(feature = "perf-cache")]
|
||||
mod imp {
|
||||
use thread_local::CachedThreadLocal;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct Cached<T: Send>(CachedThreadLocal<T>);
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct CachedGuard<'a, T: 'a>(&'a T);
|
||||
|
||||
impl<T: Send> Cached<T> {
|
||||
pub fn new() -> Cached<T> {
|
||||
Cached(CachedThreadLocal::new())
|
||||
}
|
||||
|
||||
pub fn get_or(&self, create: impl FnOnce() -> T) -> CachedGuard<T> {
|
||||
CachedGuard(self.0.get_or(|| create()))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, T: Send> CachedGuard<'a, T> {
|
||||
pub fn value(&self) -> &T {
|
||||
self.0
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(not(feature = "perf-cache"))]
|
||||
mod imp {
|
||||
use std::marker::PhantomData;
|
||||
use std::panic::UnwindSafe;
|
||||
use std::sync::Mutex;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct Cached<T: Send> {
|
||||
stack: Mutex<Vec<T>>,
|
||||
/// When perf-cache is enabled, the thread_local crate is used, and
|
||||
/// its CachedThreadLocal impls Send, Sync and UnwindSafe, but NOT
|
||||
/// RefUnwindSafe. However, a Mutex impls RefUnwindSafe. So in order
|
||||
/// to keep the APIs consistent regardless of whether perf-cache is
|
||||
/// enabled, we force this type to NOT impl RefUnwindSafe too.
|
||||
///
|
||||
/// Ideally, we should always impl RefUnwindSafe, but it seems a little
|
||||
/// tricky to do that right now.
|
||||
///
|
||||
/// See also: https://github.com/rust-lang/regex/issues/576
|
||||
_phantom: PhantomData<Box<dyn Send + Sync + UnwindSafe>>,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct CachedGuard<'a, T: 'a + Send> {
|
||||
cache: &'a Cached<T>,
|
||||
value: Option<T>,
|
||||
}
|
||||
|
||||
impl<T: Send> Cached<T> {
|
||||
pub fn new() -> Cached<T> {
|
||||
Cached { stack: Mutex::new(vec![]), _phantom: PhantomData }
|
||||
}
|
||||
|
||||
pub fn get_or(&self, create: impl FnOnce() -> T) -> CachedGuard<T> {
|
||||
let mut stack = self.stack.lock().unwrap();
|
||||
match stack.pop() {
|
||||
None => CachedGuard { cache: self, value: Some(create()) },
|
||||
Some(value) => CachedGuard { cache: self, value: Some(value) },
|
||||
}
|
||||
}
|
||||
|
||||
fn put(&self, value: T) {
|
||||
let mut stack = self.stack.lock().unwrap();
|
||||
stack.push(value);
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, T: Send> CachedGuard<'a, T> {
|
||||
pub fn value(&self) -> &T {
|
||||
self.value.as_ref().unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, T: Send> Drop for CachedGuard<'a, T> {
|
||||
fn drop(&mut self) {
|
||||
if let Some(value) = self.value.take() {
|
||||
self.cache.put(value);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -1,20 +1,22 @@
|
|||
use std::collections::HashMap;
|
||||
use std::fmt;
|
||||
use std::iter;
|
||||
use std::result;
|
||||
use std::sync::Arc;
|
||||
|
||||
use syntax::hir::{self, Hir};
|
||||
use syntax::is_word_byte;
|
||||
use syntax::utf8::{Utf8Range, Utf8Sequence, Utf8Sequences};
|
||||
use regex_syntax::hir::{self, Hir};
|
||||
use regex_syntax::is_word_byte;
|
||||
use regex_syntax::utf8::{Utf8Range, Utf8Sequence, Utf8Sequences};
|
||||
|
||||
use prog::{
|
||||
use crate::prog::{
|
||||
EmptyLook, Inst, InstBytes, InstChar, InstEmptyLook, InstPtr, InstRanges,
|
||||
InstSave, InstSplit, Program,
|
||||
};
|
||||
|
||||
use Error;
|
||||
use crate::Error;
|
||||
|
||||
type Result = result::Result<Patch, Error>;
|
||||
type ResultOrEmpty = result::Result<Option<Patch>, Error>;
|
||||
|
||||
#[derive(Debug)]
|
||||
struct Patch {
|
||||
|
@ -24,6 +26,9 @@ struct Patch {
|
|||
|
||||
/// A compiler translates a regular expression AST to a sequence of
|
||||
/// instructions. The sequence of instructions represents an NFA.
|
||||
// `Compiler` is only public via the `internal` module, so avoid deriving
|
||||
// `Debug`.
|
||||
#[allow(missing_debug_implementations)]
|
||||
pub struct Compiler {
|
||||
insts: Vec<MaybeInst>,
|
||||
compiled: Program,
|
||||
|
@ -33,6 +38,7 @@ pub struct Compiler {
|
|||
suffix_cache: SuffixCache,
|
||||
utf8_seqs: Option<Utf8Sequences>,
|
||||
byte_classes: ByteClassSet,
|
||||
extra_inst_bytes: usize,
|
||||
}
|
||||
|
||||
impl Compiler {
|
||||
|
@ -49,6 +55,7 @@ impl Compiler {
|
|||
suffix_cache: SuffixCache::new(1000),
|
||||
utf8_seqs: Some(Utf8Sequences::new('\x00', '\x00')),
|
||||
byte_classes: ByteClassSet::new(),
|
||||
extra_inst_bytes: 0,
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -132,7 +139,7 @@ impl Compiler {
|
|||
self.compiled.start = dotstar_patch.entry;
|
||||
}
|
||||
self.compiled.captures = vec![None];
|
||||
let patch = self.c_capture(0, expr)?;
|
||||
let patch = self.c_capture(0, expr)?.unwrap_or(self.next_inst());
|
||||
if self.compiled.needs_dotstar() {
|
||||
self.fill(dotstar_patch.hole, patch.entry);
|
||||
} else {
|
||||
|
@ -167,14 +174,16 @@ impl Compiler {
|
|||
for (i, expr) in exprs[0..exprs.len() - 1].iter().enumerate() {
|
||||
self.fill_to_next(prev_hole);
|
||||
let split = self.push_split_hole();
|
||||
let Patch { hole, entry } = self.c_capture(0, expr)?;
|
||||
let Patch { hole, entry } =
|
||||
self.c_capture(0, expr)?.unwrap_or(self.next_inst());
|
||||
self.fill_to_next(hole);
|
||||
self.compiled.matches.push(self.insts.len());
|
||||
self.push_compiled(Inst::Match(i));
|
||||
prev_hole = self.fill_split(split, Some(entry), None);
|
||||
}
|
||||
let i = exprs.len() - 1;
|
||||
let Patch { hole, entry } = self.c_capture(0, &exprs[i])?;
|
||||
let Patch { hole, entry } =
|
||||
self.c_capture(0, &exprs[i])?.unwrap_or(self.next_inst());
|
||||
self.fill(prev_hole, entry);
|
||||
self.fill_to_next(hole);
|
||||
self.compiled.matches.push(self.insts.len());
|
||||
|
@ -219,7 +228,7 @@ impl Compiler {
|
|||
/// hole
|
||||
/// ```
|
||||
///
|
||||
/// To compile two expressions, e1 and e2, concatinated together we
|
||||
/// To compile two expressions, e1 and e2, concatenated together we
|
||||
/// would do:
|
||||
///
|
||||
/// ```ignore
|
||||
|
@ -242,13 +251,16 @@ impl Compiler {
|
|||
/// method you will see that it does exactly this, though it handles
|
||||
/// a list of expressions rather than just the two that we use for
|
||||
/// an example.
|
||||
fn c(&mut self, expr: &Hir) -> Result {
|
||||
use prog;
|
||||
use syntax::hir::HirKind::*;
|
||||
///
|
||||
/// Ok(None) is returned when an expression is compiled to no
|
||||
/// instruction, and so no patch.entry value makes sense.
|
||||
fn c(&mut self, expr: &Hir) -> ResultOrEmpty {
|
||||
use crate::prog;
|
||||
use regex_syntax::hir::HirKind::*;
|
||||
|
||||
self.check_size()?;
|
||||
match *expr.kind() {
|
||||
Empty => Ok(Patch { hole: Hole::None, entry: self.insts.len() }),
|
||||
Empty => Ok(None),
|
||||
Literal(hir::Literal::Unicode(c)) => self.c_char(c),
|
||||
Literal(hir::Literal::Byte(b)) => {
|
||||
assert!(self.compiled.uses_bytes());
|
||||
|
@ -306,6 +318,13 @@ impl Compiler {
|
|||
}
|
||||
self.compiled.has_unicode_word_boundary = true;
|
||||
self.byte_classes.set_word_boundary();
|
||||
// We also make sure that all ASCII bytes are in a different
|
||||
// class from non-ASCII bytes. Otherwise, it's possible for
|
||||
// ASCII bytes to get lumped into the same class as non-ASCII
|
||||
// bytes. This in turn may cause the lazy DFA to falsely start
|
||||
// when it sees an ASCII byte that maps to a byte class with
|
||||
// non-ASCII bytes. This ensures that never happens.
|
||||
self.byte_classes.set_range(0, 0x7F);
|
||||
self.c_empty_look(prog::EmptyLook::WordBoundary)
|
||||
}
|
||||
WordBoundary(hir::WordBoundary::UnicodeNegate) => {
|
||||
|
@ -318,6 +337,8 @@ impl Compiler {
|
|||
}
|
||||
self.compiled.has_unicode_word_boundary = true;
|
||||
self.byte_classes.set_word_boundary();
|
||||
// See comments above for why we set the ASCII range here.
|
||||
self.byte_classes.set_range(0, 0x7F);
|
||||
self.c_empty_look(prog::EmptyLook::NotWordBoundary)
|
||||
}
|
||||
WordBoundary(hir::WordBoundary::Ascii) => {
|
||||
|
@ -357,7 +378,7 @@ impl Compiler {
|
|||
}
|
||||
}
|
||||
|
||||
fn c_capture(&mut self, first_slot: usize, expr: &Hir) -> Result {
|
||||
fn c_capture(&mut self, first_slot: usize, expr: &Hir) -> ResultOrEmpty {
|
||||
if self.num_exprs > 1 || self.compiled.is_dfa {
|
||||
// Don't ever compile Save instructions for regex sets because
|
||||
// they are never used. They are also never used in DFA programs
|
||||
|
@ -366,11 +387,11 @@ impl Compiler {
|
|||
} else {
|
||||
let entry = self.insts.len();
|
||||
let hole = self.push_hole(InstHole::Save { slot: first_slot });
|
||||
let patch = self.c(expr)?;
|
||||
let patch = self.c(expr)?.unwrap_or(self.next_inst());
|
||||
self.fill(hole, patch.entry);
|
||||
self.fill_to_next(patch.hole);
|
||||
let hole = self.push_hole(InstHole::Save { slot: first_slot + 1 });
|
||||
Ok(Patch { hole: hole, entry: entry })
|
||||
Ok(Some(Patch { hole: hole, entry: entry }))
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -381,40 +402,62 @@ impl Compiler {
|
|||
greedy: false,
|
||||
hir: Box::new(Hir::any(true)),
|
||||
}))?
|
||||
.unwrap()
|
||||
} else {
|
||||
self.c(&Hir::repetition(hir::Repetition {
|
||||
kind: hir::RepetitionKind::ZeroOrMore,
|
||||
greedy: false,
|
||||
hir: Box::new(Hir::any(false)),
|
||||
}))?
|
||||
.unwrap()
|
||||
})
|
||||
}
|
||||
|
||||
fn c_char(&mut self, c: char) -> Result {
|
||||
self.c_class(&[hir::ClassUnicodeRange::new(c, c)])
|
||||
fn c_char(&mut self, c: char) -> ResultOrEmpty {
|
||||
if self.compiled.uses_bytes() {
|
||||
if c.is_ascii() {
|
||||
let b = c as u8;
|
||||
let hole =
|
||||
self.push_hole(InstHole::Bytes { start: b, end: b });
|
||||
self.byte_classes.set_range(b, b);
|
||||
Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
|
||||
} else {
|
||||
self.c_class(&[hir::ClassUnicodeRange::new(c, c)])
|
||||
}
|
||||
} else {
|
||||
let hole = self.push_hole(InstHole::Char { c: c });
|
||||
Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
|
||||
}
|
||||
}
|
||||
|
||||
fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> Result {
|
||||
fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> ResultOrEmpty {
|
||||
use std::mem::size_of;
|
||||
|
||||
assert!(!ranges.is_empty());
|
||||
if self.compiled.uses_bytes() {
|
||||
CompileClass { c: self, ranges: ranges }.compile()
|
||||
Ok(Some(CompileClass { c: self, ranges: ranges }.compile()?))
|
||||
} else {
|
||||
let ranges: Vec<(char, char)> =
|
||||
ranges.iter().map(|r| (r.start(), r.end())).collect();
|
||||
let hole = if ranges.len() == 1 && ranges[0].0 == ranges[0].1 {
|
||||
self.push_hole(InstHole::Char { c: ranges[0].0 })
|
||||
} else {
|
||||
self.extra_inst_bytes +=
|
||||
ranges.len() * (size_of::<char>() * 2);
|
||||
self.push_hole(InstHole::Ranges { ranges: ranges })
|
||||
};
|
||||
Ok(Patch { hole: hole, entry: self.insts.len() - 1 })
|
||||
Ok(Some(Patch { hole: hole, entry: self.insts.len() - 1 }))
|
||||
}
|
||||
}
|
||||
|
||||
fn c_byte(&mut self, b: u8) -> Result {
|
||||
fn c_byte(&mut self, b: u8) -> ResultOrEmpty {
|
||||
self.c_class_bytes(&[hir::ClassBytesRange::new(b, b)])
|
||||
}
|
||||
|
||||
fn c_class_bytes(&mut self, ranges: &[hir::ClassBytesRange]) -> Result {
|
||||
fn c_class_bytes(
|
||||
&mut self,
|
||||
ranges: &[hir::ClassBytesRange],
|
||||
) -> ResultOrEmpty {
|
||||
debug_assert!(!ranges.is_empty());
|
||||
|
||||
let first_split_entry = self.insts.len();
|
||||
|
@ -438,35 +481,39 @@ impl Compiler {
|
|||
self.push_hole(InstHole::Bytes { start: r.start(), end: r.end() }),
|
||||
);
|
||||
self.fill(prev_hole, next);
|
||||
Ok(Patch { hole: Hole::Many(holes), entry: first_split_entry })
|
||||
Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry }))
|
||||
}
|
||||
|
||||
fn c_empty_look(&mut self, look: EmptyLook) -> Result {
|
||||
fn c_empty_look(&mut self, look: EmptyLook) -> ResultOrEmpty {
|
||||
let hole = self.push_hole(InstHole::EmptyLook { look: look });
|
||||
Ok(Patch { hole: hole, entry: self.insts.len() - 1 })
|
||||
Ok(Some(Patch { hole: hole, entry: self.insts.len() - 1 }))
|
||||
}
|
||||
|
||||
fn c_concat<'a, I>(&mut self, exprs: I) -> Result
|
||||
fn c_concat<'a, I>(&mut self, exprs: I) -> ResultOrEmpty
|
||||
where
|
||||
I: IntoIterator<Item = &'a Hir>,
|
||||
{
|
||||
let mut exprs = exprs.into_iter();
|
||||
let first = match exprs.next() {
|
||||
Some(expr) => expr,
|
||||
None => {
|
||||
return Ok(Patch { hole: Hole::None, entry: self.insts.len() })
|
||||
let Patch { mut hole, entry } = loop {
|
||||
match exprs.next() {
|
||||
None => return Ok(None),
|
||||
Some(e) => {
|
||||
if let Some(p) = self.c(e)? {
|
||||
break p;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
let Patch { mut hole, entry } = self.c(first)?;
|
||||
for e in exprs {
|
||||
let p = self.c(e)?;
|
||||
self.fill(hole, p.entry);
|
||||
hole = p.hole;
|
||||
if let Some(p) = self.c(e)? {
|
||||
self.fill(hole, p.entry);
|
||||
hole = p.hole;
|
||||
}
|
||||
}
|
||||
Ok(Patch { hole: hole, entry: entry })
|
||||
Ok(Some(Patch { hole: hole, entry: entry }))
|
||||
}
|
||||
|
||||
fn c_alternate(&mut self, exprs: &[Hir]) -> Result {
|
||||
fn c_alternate(&mut self, exprs: &[Hir]) -> ResultOrEmpty {
|
||||
debug_assert!(
|
||||
exprs.len() >= 2,
|
||||
"alternates must have at least 2 exprs"
|
||||
|
@ -479,44 +526,44 @@ impl Compiler {
|
|||
// patched to point to the same location.
|
||||
let mut holes = vec![];
|
||||
|
||||
let mut prev_hole = Hole::None;
|
||||
// true indicates that the hole is a split where we want to fill
|
||||
// the second branch.
|
||||
let mut prev_hole = (Hole::None, false);
|
||||
for e in &exprs[0..exprs.len() - 1] {
|
||||
self.fill_to_next(prev_hole);
|
||||
let split = self.push_split_hole();
|
||||
let prev_entry = self.insts.len();
|
||||
let Patch { hole, entry } = self.c(e)?;
|
||||
if prev_entry == self.insts.len() {
|
||||
// TODO(burntsushi): It is kind of silly that we don't support
|
||||
// empty-subexpressions in alternates, but it is supremely
|
||||
// awkward to support them in the existing compiler
|
||||
// infrastructure. This entire compiler needs to be thrown out
|
||||
// anyway, so don't feel too bad.
|
||||
return Err(Error::Syntax(
|
||||
"alternations cannot currently contain \
|
||||
empty sub-expressions"
|
||||
.to_string(),
|
||||
));
|
||||
if prev_hole.1 {
|
||||
let next = self.insts.len();
|
||||
self.fill_split(prev_hole.0, None, Some(next));
|
||||
} else {
|
||||
self.fill_to_next(prev_hole.0);
|
||||
}
|
||||
let split = self.push_split_hole();
|
||||
if let Some(Patch { hole, entry }) = self.c(e)? {
|
||||
holes.push(hole);
|
||||
prev_hole = (self.fill_split(split, Some(entry), None), false);
|
||||
} else {
|
||||
let (split1, split2) = split.dup_one();
|
||||
holes.push(split1);
|
||||
prev_hole = (split2, true);
|
||||
}
|
||||
}
|
||||
if let Some(Patch { hole, entry }) = self.c(&exprs[exprs.len() - 1])? {
|
||||
holes.push(hole);
|
||||
prev_hole = self.fill_split(split, Some(entry), None);
|
||||
if prev_hole.1 {
|
||||
self.fill_split(prev_hole.0, None, Some(entry));
|
||||
} else {
|
||||
self.fill(prev_hole.0, entry);
|
||||
}
|
||||
} else {
|
||||
// We ignore prev_hole.1. When it's true, it means we have two
|
||||
// empty branches both pushing prev_hole.0 into holes, so both
|
||||
// branches will go to the same place anyway.
|
||||
holes.push(prev_hole.0);
|
||||
}
|
||||
let prev_entry = self.insts.len();
|
||||
let Patch { hole, entry } = self.c(&exprs[exprs.len() - 1])?;
|
||||
if prev_entry == self.insts.len() {
|
||||
// TODO(burntsushi): See TODO above.
|
||||
return Err(Error::Syntax(
|
||||
"alternations cannot currently contain \
|
||||
empty sub-expressions"
|
||||
.to_string(),
|
||||
));
|
||||
}
|
||||
holes.push(hole);
|
||||
self.fill(prev_hole, entry);
|
||||
Ok(Patch { hole: Hole::Many(holes), entry: first_split_entry })
|
||||
Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry }))
|
||||
}
|
||||
|
||||
fn c_repeat(&mut self, rep: &hir::Repetition) -> Result {
|
||||
use syntax::hir::RepetitionKind::*;
|
||||
fn c_repeat(&mut self, rep: &hir::Repetition) -> ResultOrEmpty {
|
||||
use regex_syntax::hir::RepetitionKind::*;
|
||||
match rep.kind {
|
||||
ZeroOrOne => self.c_repeat_zero_or_one(&rep.hir, rep.greedy),
|
||||
ZeroOrMore => self.c_repeat_zero_or_more(&rep.hir, rep.greedy),
|
||||
|
@ -533,24 +580,37 @@ impl Compiler {
|
|||
}
|
||||
}
|
||||
|
||||
fn c_repeat_zero_or_one(&mut self, expr: &Hir, greedy: bool) -> Result {
|
||||
fn c_repeat_zero_or_one(
|
||||
&mut self,
|
||||
expr: &Hir,
|
||||
greedy: bool,
|
||||
) -> ResultOrEmpty {
|
||||
let split_entry = self.insts.len();
|
||||
let split = self.push_split_hole();
|
||||
let Patch { hole: hole_rep, entry: entry_rep } = self.c(expr)?;
|
||||
|
||||
let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
|
||||
Some(p) => p,
|
||||
None => return self.pop_split_hole(),
|
||||
};
|
||||
let split_hole = if greedy {
|
||||
self.fill_split(split, Some(entry_rep), None)
|
||||
} else {
|
||||
self.fill_split(split, None, Some(entry_rep))
|
||||
};
|
||||
let holes = vec![hole_rep, split_hole];
|
||||
Ok(Patch { hole: Hole::Many(holes), entry: split_entry })
|
||||
Ok(Some(Patch { hole: Hole::Many(holes), entry: split_entry }))
|
||||
}
|
||||
|
||||
fn c_repeat_zero_or_more(&mut self, expr: &Hir, greedy: bool) -> Result {
|
||||
fn c_repeat_zero_or_more(
|
||||
&mut self,
|
||||
expr: &Hir,
|
||||
greedy: bool,
|
||||
) -> ResultOrEmpty {
|
||||
let split_entry = self.insts.len();
|
||||
let split = self.push_split_hole();
|
||||
let Patch { hole: hole_rep, entry: entry_rep } = self.c(expr)?;
|
||||
let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
|
||||
Some(p) => p,
|
||||
None => return self.pop_split_hole(),
|
||||
};
|
||||
|
||||
self.fill(hole_rep, split_entry);
|
||||
let split_hole = if greedy {
|
||||
|
@ -558,11 +618,18 @@ impl Compiler {
|
|||
} else {
|
||||
self.fill_split(split, None, Some(entry_rep))
|
||||
};
|
||||
Ok(Patch { hole: split_hole, entry: split_entry })
|
||||
Ok(Some(Patch { hole: split_hole, entry: split_entry }))
|
||||
}
|
||||
|
||||
fn c_repeat_one_or_more(&mut self, expr: &Hir, greedy: bool) -> Result {
|
||||
let Patch { hole: hole_rep, entry: entry_rep } = self.c(expr)?;
|
||||
fn c_repeat_one_or_more(
|
||||
&mut self,
|
||||
expr: &Hir,
|
||||
greedy: bool,
|
||||
) -> ResultOrEmpty {
|
||||
let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
|
||||
Some(p) => p,
|
||||
None => return Ok(None),
|
||||
};
|
||||
self.fill_to_next(hole_rep);
|
||||
let split = self.push_split_hole();
|
||||
|
||||
|
@ -571,7 +638,7 @@ impl Compiler {
|
|||
} else {
|
||||
self.fill_split(split, None, Some(entry_rep))
|
||||
};
|
||||
Ok(Patch { hole: split_hole, entry: entry_rep })
|
||||
Ok(Some(Patch { hole: split_hole, entry: entry_rep }))
|
||||
}
|
||||
|
||||
fn c_repeat_range_min_or_more(
|
||||
|
@ -579,12 +646,20 @@ impl Compiler {
|
|||
expr: &Hir,
|
||||
greedy: bool,
|
||||
min: u32,
|
||||
) -> Result {
|
||||
) -> ResultOrEmpty {
|
||||
let min = u32_to_usize(min);
|
||||
let patch_concat = self.c_concat(iter::repeat(expr).take(min))?;
|
||||
let patch_rep = self.c_repeat_zero_or_more(expr, greedy)?;
|
||||
self.fill(patch_concat.hole, patch_rep.entry);
|
||||
Ok(Patch { hole: patch_rep.hole, entry: patch_concat.entry })
|
||||
// Using next_inst() is ok, because we can't return it (concat would
|
||||
// have to return Some(_) while c_repeat_range_min_or_more returns
|
||||
// None).
|
||||
let patch_concat = self
|
||||
.c_concat(iter::repeat(expr).take(min))?
|
||||
.unwrap_or(self.next_inst());
|
||||
if let Some(patch_rep) = self.c_repeat_zero_or_more(expr, greedy)? {
|
||||
self.fill(patch_concat.hole, patch_rep.entry);
|
||||
Ok(Some(Patch { hole: patch_rep.hole, entry: patch_concat.entry }))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
fn c_repeat_range(
|
||||
|
@ -593,13 +668,17 @@ impl Compiler {
|
|||
greedy: bool,
|
||||
min: u32,
|
||||
max: u32,
|
||||
) -> Result {
|
||||
) -> ResultOrEmpty {
|
||||
let (min, max) = (u32_to_usize(min), u32_to_usize(max));
|
||||
debug_assert!(min <= max);
|
||||
let patch_concat = self.c_concat(iter::repeat(expr).take(min))?;
|
||||
let initial_entry = patch_concat.entry;
|
||||
if min == max {
|
||||
return Ok(patch_concat);
|
||||
}
|
||||
// Same reasoning as in c_repeat_range_min_or_more (we know that min <
|
||||
// max at this point).
|
||||
let patch_concat = patch_concat.unwrap_or(self.next_inst());
|
||||
let initial_entry = patch_concat.entry;
|
||||
// It is much simpler to compile, e.g., `a{2,5}` as:
|
||||
//
|
||||
// aaa?a?a?
|
||||
|
@ -624,7 +703,10 @@ impl Compiler {
|
|||
for _ in min..max {
|
||||
self.fill_to_next(prev_hole);
|
||||
let split = self.push_split_hole();
|
||||
let Patch { hole, entry } = self.c(expr)?;
|
||||
let Patch { hole, entry } = match self.c(expr)? {
|
||||
Some(p) => p,
|
||||
None => return self.pop_split_hole(),
|
||||
};
|
||||
prev_hole = hole;
|
||||
if greedy {
|
||||
holes.push(self.fill_split(split, Some(entry), None));
|
||||
|
@ -633,7 +715,14 @@ impl Compiler {
|
|||
}
|
||||
}
|
||||
holes.push(prev_hole);
|
||||
Ok(Patch { hole: Hole::Many(holes), entry: initial_entry })
|
||||
Ok(Some(Patch { hole: Hole::Many(holes), entry: initial_entry }))
|
||||
}
|
||||
|
||||
/// Can be used as a default value for the c_* functions when the call to
|
||||
/// c_function is followed by inserting at least one instruction that is
|
||||
/// always executed after the ones written by the c* function.
|
||||
fn next_inst(&self) -> Patch {
|
||||
Patch { hole: Hole::None, entry: self.insts.len() }
|
||||
}
|
||||
|
||||
fn fill(&mut self, hole: Hole, goto: InstPtr) {
|
||||
|
@ -713,10 +802,17 @@ impl Compiler {
|
|||
Hole::One(hole)
|
||||
}
|
||||
|
||||
fn pop_split_hole(&mut self) -> ResultOrEmpty {
|
||||
self.insts.pop();
|
||||
Ok(None)
|
||||
}
|
||||
|
||||
fn check_size(&self) -> result::Result<(), Error> {
|
||||
use std::mem::size_of;
|
||||
|
||||
if self.insts.len() * size_of::<Inst>() > self.size_limit {
|
||||
let size =
|
||||
self.extra_inst_bytes + (self.insts.len() * size_of::<Inst>());
|
||||
if size > self.size_limit {
|
||||
Err(Error::CompiledTooBig(self.size_limit))
|
||||
} else {
|
||||
Ok(())
|
||||
|
@ -731,6 +827,17 @@ enum Hole {
|
|||
Many(Vec<Hole>),
|
||||
}
|
||||
|
||||
impl Hole {
|
||||
fn dup_one(self) -> (Self, Self) {
|
||||
match self {
|
||||
Hole::One(pc) => (Hole::One(pc), Hole::One(pc)),
|
||||
Hole::None | Hole::Many(_) => {
|
||||
unreachable!("must be called on single hole")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
enum MaybeInst {
|
||||
Compiled(Inst),
|
||||
|
@ -742,13 +849,22 @@ enum MaybeInst {
|
|||
|
||||
impl MaybeInst {
|
||||
fn fill(&mut self, goto: InstPtr) {
|
||||
let filled = match *self {
|
||||
MaybeInst::Uncompiled(ref inst) => inst.fill(goto),
|
||||
let maybeinst = match *self {
|
||||
MaybeInst::Split => MaybeInst::Split1(goto),
|
||||
MaybeInst::Uncompiled(ref inst) => {
|
||||
MaybeInst::Compiled(inst.fill(goto))
|
||||
}
|
||||
MaybeInst::Split1(goto1) => {
|
||||
Inst::Split(InstSplit { goto1: goto1, goto2: goto })
|
||||
MaybeInst::Compiled(Inst::Split(InstSplit {
|
||||
goto1: goto1,
|
||||
goto2: goto,
|
||||
}))
|
||||
}
|
||||
MaybeInst::Split2(goto2) => {
|
||||
Inst::Split(InstSplit { goto1: goto, goto2: goto2 })
|
||||
MaybeInst::Compiled(Inst::Split(InstSplit {
|
||||
goto1: goto,
|
||||
goto2: goto2,
|
||||
}))
|
||||
}
|
||||
_ => unreachable!(
|
||||
"not all instructions were compiled! \
|
||||
|
@ -756,7 +872,7 @@ impl MaybeInst {
|
|||
self
|
||||
),
|
||||
};
|
||||
*self = MaybeInst::Compiled(filled);
|
||||
*self = maybeinst;
|
||||
}
|
||||
|
||||
fn fill_split(&mut self, goto1: InstPtr, goto2: InstPtr) {
|
||||
|
@ -828,9 +944,10 @@ impl InstHole {
|
|||
Inst::EmptyLook(InstEmptyLook { goto: goto, look: look })
|
||||
}
|
||||
InstHole::Char { c } => Inst::Char(InstChar { goto: goto, c: c }),
|
||||
InstHole::Ranges { ref ranges } => {
|
||||
Inst::Ranges(InstRanges { goto: goto, ranges: ranges.clone() })
|
||||
}
|
||||
InstHole::Ranges { ref ranges } => Inst::Ranges(InstRanges {
|
||||
goto: goto,
|
||||
ranges: ranges.clone().into_boxed_slice(),
|
||||
}),
|
||||
InstHole::Bytes { start, end } => {
|
||||
Inst::Bytes(InstBytes { goto: goto, start: start, end: end })
|
||||
}
|
||||
|
@ -956,6 +1073,7 @@ impl<'a, 'b> CompileClass<'a, 'b> {
|
|||
/// This uses similar idea to [`SparseSet`](../sparse/struct.SparseSet.html),
|
||||
/// except it uses hashes as original indices and then compares full keys for
|
||||
/// validation against `dense` array.
|
||||
#[derive(Debug)]
|
||||
struct SuffixCache {
|
||||
sparse: Box<[usize]>,
|
||||
dense: Vec<SuffixCacheEntry>,
|
||||
|
@ -1064,6 +1182,12 @@ impl ByteClassSet {
|
|||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for ByteClassSet {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
f.debug_tuple("ByteClassSet").field(&&self.0[..]).finish()
|
||||
}
|
||||
}
|
||||
|
||||
fn u32_to_usize(n: u32) -> usize {
|
||||
// In case usize is less than 32 bits, we need to guard against overflow.
|
||||
// On most platforms this compiles to nothing.
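As a hedged illustration of the guard being described (not the crate's actual body, which is truncated in this hunk), an explicit checked conversion looks like this:

```rust
use std::convert::TryFrom;

// Illustrative only: fail loudly if a u32 cannot be represented as usize,
// which can only happen on targets where usize is narrower than 32 bits.
fn u32_to_usize_checked(n: u32) -> usize {
    usize::try_from(n).expect("usize is at least 32 bits on supported targets")
}

fn main() {
    assert_eq!(u32_to_usize_checked(1 << 20), 1usize << 20);
}
```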
@ -42,9 +42,9 @@ use std::iter::repeat;
|
|||
use std::mem;
|
||||
use std::sync::Arc;
|
||||
|
||||
use exec::ProgramCache;
|
||||
use prog::{Inst, Program};
|
||||
use sparse::SparseSet;
|
||||
use crate::exec::ProgramCache;
|
||||
use crate::prog::{Inst, Program};
|
||||
use crate::sparse::SparseSet;
|
||||
|
||||
/// Return true if and only if the given program can be executed by a DFA.
|
||||
///
|
||||
|
@ -55,7 +55,7 @@ use sparse::SparseSet;
|
|||
/// This function will also return false if the given program has any Unicode
|
||||
/// instructions (Char or Ranges) since the DFA operates on bytes only.
|
||||
pub fn can_exec(insts: &Program) -> bool {
|
||||
use prog::Inst::*;
|
||||
use crate::prog::Inst::*;
|
||||
// If for some reason we manage to allocate a regex program with more
|
||||
// than i32::MAX instructions, then we can't execute the DFA because we
|
||||
// use 32 bit instruction pointer deltas for memory savings.
|
||||
|
@ -306,7 +306,7 @@ impl State {
|
|||
StateFlags(self.data[0])
|
||||
}
|
||||
|
||||
fn inst_ptrs(&self) -> InstPtrs {
|
||||
fn inst_ptrs(&self) -> InstPtrs<'_> {
|
||||
InstPtrs { base: 0, data: &self.data[1..] }
|
||||
}
|
||||
}
|
||||
|
@ -679,7 +679,7 @@ impl<'a> Fsm<'a> {
|
|||
}
|
||||
} else if next_si & STATE_START > 0 {
|
||||
// A start state isn't in the common case because we may
|
||||
// what to do quick prefix scanning. If the program doesn't
|
||||
// want to do quick prefix scanning. If the program doesn't
|
||||
// have a detected prefix, then start states are actually
|
||||
// considered common and this case is never reached.
|
||||
debug_assert!(self.has_prefix());
|
||||
|
@ -725,7 +725,7 @@ impl<'a> Fsm<'a> {
|
|||
}
|
||||
}
|
||||
|
||||
// Run the DFA once more on the special EOF senitnel value.
|
||||
// Run the DFA once more on the special EOF sentinel value.
|
||||
// We don't care about the special bits in the state pointer any more,
|
||||
// so get rid of them.
|
||||
prev_si &= STATE_MAX;
|
||||
|
@ -830,7 +830,7 @@ impl<'a> Fsm<'a> {
|
|||
}
|
||||
}
|
||||
|
||||
// Run the DFA once more on the special EOF senitnel value.
|
||||
// Run the DFA once more on the special EOF sentinel value.
|
||||
prev_si = match self.next_state(qcur, qnext, prev_si, Byte::eof()) {
|
||||
None => return Result::Quit,
|
||||
Some(STATE_DEAD) => return result.set_non_match(0),
|
||||
|
@ -848,7 +848,7 @@ impl<'a> Fsm<'a> {
|
|||
/// next_si transitions to the next state, where the transition input
|
||||
/// corresponds to text[i].
|
||||
///
|
||||
/// This elides bounds checks, and is therefore unsafe.
|
||||
/// This elides bounds checks, and is therefore not safe.
|
||||
#[cfg_attr(feature = "perf-inline", inline(always))]
|
||||
unsafe fn next_si(&self, si: StatePtr, text: &[u8], i: usize) -> StatePtr {
|
||||
// What is the argument for safety here?
|
||||
|
@ -894,7 +894,7 @@ impl<'a> Fsm<'a> {
|
|||
mut si: StatePtr,
|
||||
b: Byte,
|
||||
) -> Option<StatePtr> {
|
||||
use prog::Inst::*;
|
||||
use crate::prog::Inst::*;
|
||||
|
||||
// Initialize a queue with the current DFA state's NFA states.
|
||||
qcur.clear();
|
||||
|
@ -913,8 +913,8 @@ impl<'a> Fsm<'a> {
|
|||
if self.state(si).flags().has_empty() {
|
||||
// Compute the flags immediately preceding the current byte.
|
||||
// This means we only care about the "end" or "end line" flags.
|
||||
// (The "start" flags are computed immediately proceding the
|
||||
// current byte and is handled below.)
|
||||
// (The "start" flags are computed immediately following the
|
||||
// current byte and are handled below.)
|
||||
let mut flags = EmptyFlags::default();
|
||||
if b.is_eof() {
|
||||
flags.end = true;
|
||||
|
@ -1048,7 +1048,7 @@ impl<'a> Fsm<'a> {
|
|||
///
|
||||
/// If matching starts after the beginning of the input, then only start
|
||||
/// line should be set if the preceding byte is `\n`. End line should never
|
||||
/// be set in this case. (Even if the proceding byte is a `\n`, it will
|
||||
/// be set in this case. (Even if the following byte is a `\n`, it will
|
||||
/// be handled in a subsequent DFA state.)
|
||||
fn follow_epsilons(
|
||||
&mut self,
|
||||
|
@ -1056,8 +1056,8 @@ impl<'a> Fsm<'a> {
|
|||
q: &mut SparseSet,
|
||||
flags: EmptyFlags,
|
||||
) {
|
||||
use prog::EmptyLook::*;
|
||||
use prog::Inst::*;
|
||||
use crate::prog::EmptyLook::*;
|
||||
use crate::prog::Inst::*;
|
||||
|
||||
// We need to traverse the NFA to follow epsilon transitions, so avoid
|
||||
// recursion with an explicit stack.
|
||||
|
@ -1190,7 +1190,7 @@ impl<'a> Fsm<'a> {
|
|||
q: &SparseSet,
|
||||
state_flags: &mut StateFlags,
|
||||
) -> Option<State> {
|
||||
use prog::Inst::*;
|
||||
use crate::prog::Inst::*;
|
||||
|
||||
// We need to build up enough information to recognize pre-built states
|
||||
// in the DFA. Generally speaking, this includes every instruction
|
||||
|
@ -1688,7 +1688,7 @@ impl Transitions {
|
|||
self.num_byte_classes * mem::size_of::<StatePtr>()
|
||||
}
|
||||
|
||||
/// Like `next`, but uses unchecked access and is therefore unsafe.
|
||||
/// Like `next`, but uses unchecked access and is therefore not safe.
|
||||
unsafe fn next_unchecked(&self, si: StatePtr, cls: usize) -> StatePtr {
|
||||
debug_assert!((si as usize) < self.table.len());
|
||||
debug_assert!(cls < self.num_byte_classes);
|
||||
|
@ -1754,7 +1754,7 @@ impl Byte {
|
|||
}
|
||||
|
||||
impl fmt::Debug for State {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
let ips: Vec<usize> = self.inst_ptrs().collect();
|
||||
f.debug_struct("State")
|
||||
.field("flags", &self.flags())
|
||||
|
@ -1764,7 +1764,7 @@ impl fmt::Debug for State {
|
|||
}
|
||||
|
||||
impl fmt::Debug for Transitions {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
let mut fmtd = f.debug_map();
|
||||
for si in 0..self.num_states() {
|
||||
let s = si * self.num_byte_classes;
|
||||
|
@ -1778,7 +1778,7 @@ impl fmt::Debug for Transitions {
|
|||
struct TransitionsRow<'a>(&'a [StatePtr]);
|
||||
|
||||
impl<'a> fmt::Debug for TransitionsRow<'a> {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
let mut fmtd = f.debug_map();
|
||||
for (b, si) in self.0.iter().enumerate() {
|
||||
match *si {
|
||||
|
@ -1796,7 +1796,7 @@ impl<'a> fmt::Debug for TransitionsRow<'a> {
|
|||
}
|
||||
|
||||
impl fmt::Debug for StateFlags {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
f.debug_struct("StateFlags")
|
||||
.field("is_match", &self.is_match())
|
||||
.field("is_word", &self.is_word())
|
||||
|
@ -1889,18 +1889,27 @@ fn read_varu32(data: &[u8]) -> (u32, usize) {
|
|||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
extern crate rand;
|
||||
|
||||
use super::{
|
||||
push_inst_ptr, read_vari32, read_varu32, write_vari32, write_varu32,
|
||||
State, StateFlags,
|
||||
};
|
||||
use quickcheck::{quickcheck, QuickCheck, StdGen};
|
||||
use quickcheck::{quickcheck, Gen, QuickCheck};
|
||||
use std::sync::Arc;
|
||||
|
||||
#[test]
|
||||
fn prop_state_encode_decode() {
|
||||
fn p(ips: Vec<u32>, flags: u8) -> bool {
|
||||
fn p(mut ips: Vec<u32>, flags: u8) -> bool {
|
||||
// It looks like our encoding scheme can't handle instruction
|
||||
// pointers at or above 2**31. We should fix that, but it seems
|
||||
// unlikely to occur in real code due to the amount of memory
|
||||
// required for such a state machine. So for now, we just clamp
|
||||
// our test data.
|
||||
for ip in &mut ips {
|
||||
if *ip >= 1 << 31 {
|
||||
*ip = (1 << 31) - 1;
|
||||
}
|
||||
}
|
||||
let mut data = vec![flags];
|
||||
let mut prev = 0;
|
||||
for &ip in ips.iter() {
|
||||
|
@ -1914,7 +1923,7 @@ mod tests {
|
|||
expected == got && state.flags() == StateFlags(flags)
|
||||
}
|
||||
QuickCheck::new()
|
||||
.gen(StdGen::new(self::rand::thread_rng(), 10_000))
|
||||
.gen(Gen::new(10_000))
|
||||
.quickcheck(p as fn(Vec<u32>, u8) -> bool);
|
||||
}
|
||||
|
||||
|
|
|
@ -31,7 +31,7 @@ impl ::std::error::Error for Error {
|
|||
}
|
||||
|
||||
impl fmt::Display for Error {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match *self {
|
||||
Error::Syntax(ref err) => err.fmt(f),
|
||||
Error::CompiledTooBig(limit) => write!(
|
||||
|
@ -49,7 +49,7 @@ impl fmt::Display for Error {
|
|||
// but the `Syntax` variant is already storing a `String` anyway, so we might
|
||||
// as well format it nicely.
|
||||
impl fmt::Debug for Error {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match *self {
|
||||
Error::Syntax(ref err) => {
|
||||
let hr: String = repeat('~').take(79).collect();
|
||||
|
|
|
@ -1,40 +1,49 @@
|
|||
use std::cell::RefCell;
|
||||
use std::collections::HashMap;
|
||||
use std::panic::AssertUnwindSafe;
|
||||
use std::sync::Arc;
|
||||
|
||||
#[cfg(feature = "perf-literal")]
|
||||
use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
|
||||
use syntax::hir::literal::Literals;
|
||||
use syntax::hir::Hir;
|
||||
use syntax::ParserBuilder;
|
||||
use regex_syntax::hir::literal::Literals;
|
||||
use regex_syntax::hir::Hir;
|
||||
use regex_syntax::ParserBuilder;
|
||||
|
||||
use backtrack;
|
||||
use cache::{Cached, CachedGuard};
|
||||
use compile::Compiler;
|
||||
use crate::backtrack;
|
||||
use crate::compile::Compiler;
|
||||
#[cfg(feature = "perf-dfa")]
|
||||
use dfa;
|
||||
use error::Error;
|
||||
use input::{ByteInput, CharInput};
|
||||
use literal::LiteralSearcher;
|
||||
use pikevm;
|
||||
use prog::Program;
|
||||
use re_builder::RegexOptions;
|
||||
use re_bytes;
|
||||
use re_set;
|
||||
use re_trait::{Locations, RegularExpression, Slot};
|
||||
use re_unicode;
|
||||
use utf8::next_utf8;
|
||||
use crate::dfa;
|
||||
use crate::error::Error;
|
||||
use crate::input::{ByteInput, CharInput};
|
||||
use crate::literal::LiteralSearcher;
|
||||
use crate::pikevm;
|
||||
use crate::pool::{Pool, PoolGuard};
|
||||
use crate::prog::Program;
|
||||
use crate::re_builder::RegexOptions;
|
||||
use crate::re_bytes;
|
||||
use crate::re_set;
|
||||
use crate::re_trait::{Locations, RegularExpression, Slot};
|
||||
use crate::re_unicode;
|
||||
use crate::utf8::next_utf8;
|
||||
|
||||
/// `Exec` manages the execution of a regular expression.
|
||||
///
|
||||
/// In particular, this manages the various compiled forms of a single regular
|
||||
/// expression and the choice of which matching engine to use to execute a
|
||||
/// regular expression.
|
||||
#[derive(Debug)]
|
||||
pub struct Exec {
|
||||
/// All read only state.
|
||||
ro: Arc<ExecReadOnly>,
|
||||
/// Caches for the various matching engines.
|
||||
cache: Cached<ProgramCache>,
|
||||
/// A pool of reusable values for the various matching engines.
|
||||
///
|
||||
/// Note that boxing this value is not strictly necessary, but it is an
|
||||
/// easy way to ensure that T does not bloat the stack sized used by a pool
|
||||
/// in the case where T is big. And this turns out to be the case at the
|
||||
/// time of writing for regex's use of this pool. At the time of writing,
|
||||
/// the size of a Regex on the stack is 856 bytes. Boxing this value
|
||||
/// reduces that size to 16 bytes.
|
||||
pool: Box<Pool<ProgramCache>>,
|
||||
}
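The size argument in the comment above can be checked with a small, self-contained sketch (the types below are hypothetical stand-ins, not `Exec` or `ProgramCache`): boxing the large field leaves only a pointer on the stack.

```rust
use std::mem::size_of;

// Hypothetical stand-ins for a large cache value and the types that hold it.
#[allow(dead_code)]
struct BigCache {
    slots: [u64; 100], // 800 bytes stored inline
}

#[allow(dead_code)]
struct Inline {
    cache: BigCache,
}

#[allow(dead_code)]
struct Boxed {
    cache: Box<BigCache>,
}

fn main() {
    // The boxed variant is pointer-sized regardless of how big the cache is.
    assert!(size_of::<Inline>() >= 800);
    assert_eq!(size_of::<Boxed>(), size_of::<usize>());
}
```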
|
||||
/// `ExecNoSync` is like `Exec`, except it embeds a reference to a cache. This
|
||||
|
@ -45,10 +54,11 @@ pub struct ExecNoSync<'c> {
|
|||
/// All read only state.
|
||||
ro: &'c Arc<ExecReadOnly>,
|
||||
/// Caches for the various matching engines.
|
||||
cache: CachedGuard<'c, ProgramCache>,
|
||||
cache: PoolGuard<'c, ProgramCache>,
|
||||
}
|
||||
|
||||
/// `ExecNoSyncStr` is like `ExecNoSync`, but matches on &str instead of &[u8].
|
||||
#[derive(Debug)]
|
||||
pub struct ExecNoSyncStr<'c>(ExecNoSync<'c>);
|
||||
|
||||
/// `ExecReadOnly` comprises all read only state for a regex. Namely, all such
|
||||
|
@ -97,6 +107,9 @@ struct ExecReadOnly {
|
|||
/// Facilitates the construction of an executor by exposing various knobs
|
||||
/// to control how a regex is executed and what kinds of resources it's
|
||||
/// permitted to use.
|
||||
// `ExecBuilder` is only public via the `internal` module, so avoid deriving
|
||||
// `Debug`.
|
||||
#[allow(missing_debug_implementations)]
|
||||
pub struct ExecBuilder {
|
||||
options: RegexOptions,
|
||||
match_type: Option<MatchType>,
|
||||
|
@ -127,7 +140,7 @@ impl ExecBuilder {
|
|||
///
|
||||
/// Note that when compiling 2 or more regular expressions, capture groups
|
||||
/// are completely unsupported. (This means both `find` and `captures`
|
||||
/// wont work.)
|
||||
/// won't work.)
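The public-facing way to run several patterns at once is `RegexSet`, which, consistent with the limitation noted above, reports which patterns matched rather than capture groups. A brief example of that usage:

```rust
use regex::RegexSet;

fn main() {
    let set = RegexSet::new(&[r"\d{4}-\d{2}-\d{2}", r"[a-z]+@[a-z]+\.com"]).unwrap();
    // Only the indices of the matching patterns are reported, no captures.
    let matched: Vec<usize> = set.matches("contact me at foo@bar.com").into_iter().collect();
    assert_eq!(matched, vec![1]);
}
```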
pub fn new_many<I, S>(res: I) -> Self
|
||||
where
|
||||
S: AsRef<str>,
|
||||
|
@ -297,7 +310,8 @@ impl ExecBuilder {
|
|||
ac: None,
|
||||
match_type: MatchType::Nothing,
|
||||
});
|
||||
return Ok(Exec { ro: ro, cache: Cached::new() });
|
||||
let pool = ExecReadOnly::new_pool(&ro);
|
||||
return Ok(Exec { ro: ro, pool });
|
||||
}
|
||||
let parsed = self.parse()?;
|
||||
let mut nfa = Compiler::new()
|
||||
|
@ -337,7 +351,8 @@ impl ExecBuilder {
|
|||
ro.match_type = ro.choose_match_type(self.match_type);
|
||||
|
||||
let ro = Arc::new(ro);
|
||||
Ok(Exec { ro: ro, cache: Cached::new() })
|
||||
let pool = ExecReadOnly::new_pool(&ro);
|
||||
Ok(Exec { ro, pool })
|
||||
}
|
||||
|
||||
#[cfg(feature = "perf-literal")]
|
||||
|
@ -358,9 +373,6 @@ impl ExecBuilder {
|
|||
AhoCorasickBuilder::new()
|
||||
.match_kind(MatchKind::LeftmostFirst)
|
||||
.auto_configure(&lits)
|
||||
// We always want this to reduce size, regardless
|
||||
// of what auto-configure does.
|
||||
.byte_classes(true)
|
||||
.build_with_size::<u32, _, _>(&lits)
|
||||
// This should never happen because we'd long exceed the
|
||||
// compilation limit for regexes first.
|
||||
|
@ -724,7 +736,7 @@ impl<'c> ExecNoSync<'c> {
|
|||
text: &[u8],
|
||||
start: usize,
|
||||
) -> dfa::Result<(usize, usize)> {
|
||||
use dfa::Result::*;
|
||||
use crate::dfa::Result::*;
|
||||
let end = match dfa::Fsm::forward(
|
||||
&self.ro.dfa,
|
||||
self.cache.value(),
|
||||
|
@ -764,7 +776,7 @@ impl<'c> ExecNoSync<'c> {
|
|||
text: &[u8],
|
||||
start: usize,
|
||||
) -> dfa::Result<(usize, usize)> {
|
||||
use dfa::Result::*;
|
||||
use crate::dfa::Result::*;
|
||||
match dfa::Fsm::reverse(
|
||||
&self.ro.dfa_reverse,
|
||||
self.cache.value(),
|
||||
|
@ -820,7 +832,7 @@ impl<'c> ExecNoSync<'c> {
|
|||
text: &[u8],
|
||||
original_start: usize,
|
||||
) -> Option<dfa::Result<(usize, usize)>> {
|
||||
use dfa::Result::*;
|
||||
use crate::dfa::Result::*;
|
||||
|
||||
let lcs = self.ro.suffixes.lcs();
|
||||
debug_assert!(lcs.len() >= 1);
|
||||
|
@ -865,7 +877,7 @@ impl<'c> ExecNoSync<'c> {
|
|||
text: &[u8],
|
||||
start: usize,
|
||||
) -> dfa::Result<(usize, usize)> {
|
||||
use dfa::Result::*;
|
||||
use crate::dfa::Result::*;
|
||||
|
||||
let match_start = match self.exec_dfa_reverse_suffix(text, start) {
|
||||
None => return self.find_dfa_forward(text, start),
|
||||
|
@ -1248,17 +1260,16 @@ impl<'c> ExecNoSyncStr<'c> {
|
|||
impl Exec {
|
||||
/// Get a searcher that isn't Sync.
|
||||
#[cfg_attr(feature = "perf-inline", inline(always))]
|
||||
pub fn searcher(&self) -> ExecNoSync {
|
||||
let create = || RefCell::new(ProgramCacheInner::new(&self.ro));
|
||||
pub fn searcher(&self) -> ExecNoSync<'_> {
|
||||
ExecNoSync {
|
||||
ro: &self.ro, // a clone is too expensive here! (and not needed)
|
||||
cache: self.cache.get_or(create),
|
||||
cache: self.pool.get(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Get a searcher that isn't Sync and can match on &str.
|
||||
#[cfg_attr(feature = "perf-inline", inline(always))]
|
||||
pub fn searcher_str(&self) -> ExecNoSyncStr {
|
||||
pub fn searcher_str(&self) -> ExecNoSyncStr<'_> {
|
||||
ExecNoSyncStr(self.searcher())
|
||||
}
|
||||
|
||||
|
@ -1304,7 +1315,8 @@ impl Exec {
|
|||
|
||||
impl Clone for Exec {
|
||||
fn clone(&self) -> Exec {
|
||||
Exec { ro: self.ro.clone(), cache: Cached::new() }
|
||||
let pool = ExecReadOnly::new_pool(&self.ro);
|
||||
Exec { ro: self.ro.clone(), pool }
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1437,6 +1449,13 @@ impl ExecReadOnly {
|
|||
let lcs_len = self.suffixes.lcs().char_len();
|
||||
lcs_len >= 3 && lcs_len > self.dfa.prefixes.lcp().char_len()
|
||||
}
|
||||
|
||||
fn new_pool(ro: &Arc<ExecReadOnly>) -> Box<Pool<ProgramCache>> {
|
||||
let ro = ro.clone();
|
||||
Box::new(Pool::new(Box::new(move || {
|
||||
AssertUnwindSafe(RefCell::new(ProgramCacheInner::new(&ro)))
|
||||
})))
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
|
@ -1495,7 +1514,11 @@ enum MatchNfaType {
|
|||
|
||||
/// `ProgramCache` maintains reusable allocations for each matching engine
|
||||
/// available to a particular program.
|
||||
pub type ProgramCache = RefCell<ProgramCacheInner>;
|
||||
///
|
||||
/// We declare this as unwind safe since it's a cache that's only used for
|
||||
/// performance purposes. If a panic occurs, it is (or should be) always safe
|
||||
/// to continue using the same regex object.
|
||||
pub type ProgramCache = AssertUnwindSafe<RefCell<ProgramCacheInner>>;
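A minimal sketch of the `AssertUnwindSafe` pattern this alias relies on (generic `std` usage, not the crate's internals): a `RefCell` is not `UnwindSafe`, so it has to be explicitly asserted as such before crossing a `catch_unwind` boundary.

```rust
use std::cell::RefCell;
use std::panic::{catch_unwind, AssertUnwindSafe};

fn main() {
    // RefCell is not UnwindSafe on its own; the wrapper asserts that using it
    // after a panic is acceptable (here it is just a performance cache).
    let cache = AssertUnwindSafe(RefCell::new(vec![1, 2, 3]));
    let result = catch_unwind(AssertUnwindSafe(|| {
        cache.borrow_mut().push(4);
    }));
    assert!(result.is_ok());
    // The cache is still usable afterwards.
    assert_eq!(cache.borrow().len(), 4);
}
```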
|
||||
#[derive(Debug)]
|
||||
pub struct ProgramCacheInner {
|
||||
|
@ -1524,7 +1547,7 @@ impl ProgramCacheInner {
|
|||
/// literals, and if so, returns them. Otherwise, this returns None.
|
||||
#[cfg(feature = "perf-literal")]
|
||||
fn alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>> {
|
||||
use syntax::hir::{HirKind, Literal};
|
||||
use regex_syntax::hir::{HirKind, Literal};
|
||||
|
||||
// This is pretty hacky, but basically, if `is_alternation_literal` is
|
||||
// true, then we can make several assumptions about the structure of our
|
||||
|
@ -1576,7 +1599,7 @@ fn alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>> {
|
|||
mod test {
|
||||
#[test]
|
||||
fn uppercut_s_backtracking_bytes_default_bytes_mismatch() {
|
||||
use internal::ExecBuilder;
|
||||
use crate::internal::ExecBuilder;
|
||||
|
||||
let backtrack_bytes_re = ExecBuilder::new("^S")
|
||||
.bounded_backtracking()
|
||||
|
@ -1604,7 +1627,7 @@ mod test {
|
|||
|
||||
#[test]
|
||||
fn unicode_lit_star_backtracking_utf8bytes_default_utf8bytes_mismatch() {
|
||||
use internal::ExecBuilder;
|
||||
use crate::internal::ExecBuilder;
|
||||
|
||||
let backtrack_bytes_re = ExecBuilder::new(r"^(?u:\*)")
|
||||
.bounded_backtracking()
|
||||
|
|
|
@ -1,12 +1,12 @@
|
|||
use std::str;
|
||||
|
||||
use find_byte::find_byte;
|
||||
use crate::find_byte::find_byte;
|
||||
|
||||
use re_bytes;
|
||||
use re_unicode;
|
||||
use crate::re_bytes;
|
||||
use crate::re_unicode;
|
||||
|
||||
pub fn expand_str(
|
||||
caps: &re_unicode::Captures,
|
||||
caps: &re_unicode::Captures<'_>,
|
||||
mut replacement: &str,
|
||||
dst: &mut String,
|
||||
) {
|
||||
|
@ -24,7 +24,7 @@ pub fn expand_str(
|
|||
continue;
|
||||
}
|
||||
debug_assert!(!replacement.is_empty());
|
||||
let cap_ref = match find_cap_ref(replacement) {
|
||||
let cap_ref = match find_cap_ref(replacement.as_bytes()) {
|
||||
Some(cap_ref) => cap_ref,
|
||||
None => {
|
||||
dst.push_str("$");
|
||||
|
@ -48,7 +48,7 @@ pub fn expand_str(
|
|||
}
|
||||
|
||||
pub fn expand_bytes(
|
||||
caps: &re_bytes::Captures,
|
||||
caps: &re_bytes::Captures<'_>,
|
||||
mut replacement: &[u8],
|
||||
dst: &mut Vec<u8>,
|
||||
) {
|
||||
|
@ -125,19 +125,15 @@ impl From<usize> for Ref<'static> {
|
|||
/// starting at the beginning of `replacement`.
|
||||
///
|
||||
/// If no such valid reference could be found, None is returned.
|
||||
fn find_cap_ref<T: ?Sized + AsRef<[u8]>>(
|
||||
replacement: &T,
|
||||
) -> Option<CaptureRef> {
|
||||
fn find_cap_ref(replacement: &[u8]) -> Option<CaptureRef<'_>> {
|
||||
let mut i = 0;
|
||||
let rep: &[u8] = replacement.as_ref();
|
||||
if rep.len() <= 1 || rep[0] != b'$' {
|
||||
return None;
|
||||
}
|
||||
let mut brace = false;
|
||||
i += 1;
|
||||
if rep[i] == b'{' {
|
||||
brace = true;
|
||||
i += 1;
|
||||
return find_cap_ref_braced(rep, i + 1);
|
||||
}
|
||||
let mut cap_end = i;
|
||||
while rep.get(cap_end).map_or(false, is_valid_cap_letter) {
|
||||
|
@ -148,15 +144,10 @@ fn find_cap_ref<T: ?Sized + AsRef<[u8]>>(
|
|||
}
|
||||
// We just verified that the range 0..cap_end is valid ASCII, so it must
|
||||
// therefore be valid UTF-8. If we really cared, we could avoid this UTF-8
|
||||
// check with either unsafe or by parsing the number straight from &[u8].
|
||||
// check via an unchecked conversion or by parsing the number straight from
|
||||
// &[u8].
|
||||
let cap =
|
||||
str::from_utf8(&rep[i..cap_end]).expect("valid UTF-8 capture name");
|
||||
if brace {
|
||||
if !rep.get(cap_end).map_or(false, |&b| b == b'}') {
|
||||
return None;
|
||||
}
|
||||
cap_end += 1;
|
||||
}
|
||||
Some(CaptureRef {
|
||||
cap: match cap.parse::<u32>() {
|
||||
Ok(i) => Ref::Number(i as usize),
|
||||
|
@ -166,6 +157,31 @@ fn find_cap_ref<T: ?Sized + AsRef<[u8]>>(
|
|||
})
|
||||
}
|
||||
|
||||
fn find_cap_ref_braced(rep: &[u8], mut i: usize) -> Option<CaptureRef<'_>> {
|
||||
let start = i;
|
||||
while rep.get(i).map_or(false, |&b| b != b'}') {
|
||||
i += 1;
|
||||
}
|
||||
if !rep.get(i).map_or(false, |&b| b == b'}') {
|
||||
return None;
|
||||
}
|
||||
// When looking at braced names, we don't put any restrictions on the name,
|
||||
// so it's possible it could be invalid UTF-8. But a capture group name
|
||||
// can never be invalid UTF-8, so if we have invalid UTF-8, then we can
|
||||
// safely return None.
|
||||
let cap = match str::from_utf8(&rep[start..i]) {
|
||||
Err(_) => return None,
|
||||
Ok(cap) => cap,
|
||||
};
|
||||
Some(CaptureRef {
|
||||
cap: match cap.parse::<u32>() {
|
||||
Ok(i) => Ref::Number(i as usize),
|
||||
Err(_) => Ref::Named(cap),
|
||||
},
|
||||
end: i + 1,
|
||||
})
|
||||
}
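The two parsing paths above are what give `$name` and `${name}` their user-visible behavior; a short illustration through the public `Regex::replace` API (standard regex crate usage, nothing new):

```rust
use regex::Regex;

fn main() {
    let re = Regex::new(r"(?P<first>\w+)\s+(?P<last>\w+)").unwrap();
    // `${first}` is delimited explicitly, so it stays unambiguous even when
    // followed by more word characters.
    let out = re.replace("Ada Lovelace", "${last}_${first}");
    assert_eq!(out, "Lovelace_Ada");
    // `$last_` is read as a reference to a (nonexistent) group named `last_`,
    // which expands to the empty string.
    let out = re.replace("Ada Lovelace", "$last_$first");
    assert_eq!(out, "Ada");
}
```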
|
||||
/// Returns true if and only if the given byte is allowed in a capture name.
|
||||
fn is_valid_cap_letter(b: &u8) -> bool {
|
||||
match *b {
|
||||
|
@ -182,13 +198,13 @@ mod tests {
|
|||
($name:ident, $text:expr) => {
|
||||
#[test]
|
||||
fn $name() {
|
||||
assert_eq!(None, find_cap_ref($text));
|
||||
assert_eq!(None, find_cap_ref($text.as_bytes()));
|
||||
}
|
||||
};
|
||||
($name:ident, $text:expr, $capref:expr) => {
|
||||
#[test]
|
||||
fn $name() {
|
||||
assert_eq!(Some($capref), find_cap_ref($text));
|
||||
assert_eq!(Some($capref), find_cap_ref($text.as_bytes()));
|
||||
}
|
||||
};
|
||||
}
|
||||
|
@ -204,7 +220,8 @@ mod tests {
|
|||
find!(find_cap_ref3, "$0", c!(0, 2));
|
||||
find!(find_cap_ref4, "$5", c!(5, 2));
|
||||
find!(find_cap_ref5, "$10", c!(10, 3));
|
||||
// see https://github.com/rust-lang/regex/pull/585 for more on characters following numbers
|
||||
// See https://github.com/rust-lang/regex/pull/585
|
||||
// for more on characters following numbers
|
||||
find!(find_cap_ref6, "$42a", c!("42a", 4));
|
||||
find!(find_cap_ref7, "${42}a", c!(42, 5));
|
||||
find!(find_cap_ref8, "${42");
|
||||
|
@ -217,4 +234,6 @@ mod tests {
|
|||
find!(find_cap_ref15, "$1_$2", c!("1_", 3));
|
||||
find!(find_cap_ref16, "$x-$y", c!("x", 2));
|
||||
find!(find_cap_ref17, "$x_$y", c!("x_", 3));
|
||||
find!(find_cap_ref18, "${#}", c!("#", 4));
|
||||
find!(find_cap_ref19, "${Z[}", c!("Z[", 5));
|
||||
}
|
||||
|
|
|
@ -4,11 +4,9 @@ use std::fmt;
|
|||
use std::ops;
|
||||
use std::u32;
|
||||
|
||||
use syntax;
|
||||
|
||||
use literal::LiteralSearcher;
|
||||
use prog::InstEmptyLook;
|
||||
use utf8::{decode_last_utf8, decode_utf8};
|
||||
use crate::literal::LiteralSearcher;
|
||||
use crate::prog::InstEmptyLook;
|
||||
use crate::utf8::{decode_last_utf8, decode_utf8};
|
||||
|
||||
/// Represents a location in the input.
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
|
@ -175,7 +173,7 @@ impl<'t> Input for CharInput<'t> {
|
|||
}
|
||||
|
||||
fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
|
||||
use prog::EmptyLook::*;
|
||||
use crate::prog::EmptyLook::*;
|
||||
match empty.look {
|
||||
StartLine => {
|
||||
let c = self.previous_char(at);
|
||||
|
@ -268,7 +266,7 @@ impl<'t> Input for ByteInput<'t> {
|
|||
}
|
||||
|
||||
fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
|
||||
use prog::EmptyLook::*;
|
||||
use crate::prog::EmptyLook::*;
|
||||
match empty.look {
|
||||
StartLine => {
|
||||
let c = self.previous_char(at);
|
||||
|
@ -348,7 +346,7 @@ impl<'t> Input for ByteInput<'t> {
|
|||
pub struct Char(u32);
|
||||
|
||||
impl fmt::Debug for Char {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match char::from_u32(self.0) {
|
||||
None => write!(f, "Empty"),
|
||||
Some(c) => write!(f, "{:?}", c),
|
||||
|
@ -379,7 +377,7 @@ impl Char {
|
|||
// available. However, our compiler ensures that if a Unicode word
|
||||
// boundary is used, then the data must also be available. If it isn't,
|
||||
// then the compiler returns an error.
|
||||
char::from_u32(self.0).map_or(false, syntax::is_word_character)
|
||||
char::from_u32(self.0).map_or(false, regex_syntax::is_word_character)
|
||||
}
|
||||
|
||||
/// Returns true iff the byte is a word byte.
|
||||
|
@ -387,7 +385,7 @@ impl Char {
|
|||
/// If the byte is absent, then false is returned.
|
||||
pub fn is_word_byte(self) -> bool {
|
||||
match char::from_u32(self.0) {
|
||||
Some(c) if c <= '\u{7F}' => syntax::is_word_byte(c as u8),
|
||||
Some(c) if c <= '\u{7F}' => regex_syntax::is_word_byte(c as u8),
|
||||
None | Some(_) => false,
|
||||
}
|
||||
}
|
||||
|
|
|
@ -22,12 +22,6 @@ used by adding `regex` to your dependencies in your project's `Cargo.toml`.
|
|||
regex = "1"
|
||||
```
|
||||
|
||||
If you're using Rust 2015, then you'll also need to add it to your crate root:
|
||||
|
||||
```rust
|
||||
extern crate regex;
|
||||
```
|
||||
|
||||
# Example: find a date
|
||||
|
||||
General use of regular expressions in this package involves compiling an
|
||||
|
@ -68,9 +62,7 @@ regular expressions are compiled exactly once.
|
|||
For example:
|
||||
|
||||
```rust
|
||||
#[macro_use] extern crate lazy_static;
|
||||
extern crate regex;
|
||||
|
||||
use lazy_static::lazy_static;
|
||||
use regex::Regex;
|
||||
|
||||
fn some_helper_function(text: &str) -> bool {
|
||||
|
@ -94,7 +86,7 @@ matches. For example, to find all dates in a string and be able to access
|
|||
them by their component pieces:
|
||||
|
||||
```rust
|
||||
# extern crate regex; use regex::Regex;
|
||||
# use regex::Regex;
|
||||
# fn main() {
|
||||
let re = Regex::new(r"(\d{4})-(\d{2})-(\d{2})").unwrap();
|
||||
let text = "2012-03-14, 2013-01-01 and 2014-07-05";
|
||||
|
@ -119,7 +111,7 @@ clearer, we can *name* our capture groups and use those names as variables
|
|||
in our replacement text:
|
||||
|
||||
```rust
|
||||
# extern crate regex; use regex::Regex;
|
||||
# use regex::Regex;
|
||||
# fn main() {
|
||||
let re = Regex::new(r"(?P<y>\d{4})-(?P<m>\d{2})-(?P<d>\d{2})").unwrap();
|
||||
let before = "2012-03-14, 2013-01-01 and 2014-07-05";
|
||||
|
@ -136,7 +128,7 @@ Note that if your regex gets complicated, you can use the `x` flag to
|
|||
enable insignificant whitespace mode, which also lets you write comments:
|
||||
|
||||
```rust
|
||||
# extern crate regex; use regex::Regex;
|
||||
# use regex::Regex;
|
||||
# fn main() {
|
||||
let re = Regex::new(r"(?x)
|
||||
(?P<y>\d{4}) # the year
|
||||
|
@ -152,8 +144,9 @@ assert_eq!(after, "03/14/2012, 01/01/2013 and 07/05/2014");
|
|||
```
|
||||
|
||||
If you wish to match against whitespace in this mode, you can still use `\s`,
|
||||
`\n`, `\t`, etc. For escaping a single space character, you can use its hex
|
||||
character code `\x20` or temporarily disable the `x` flag, e.g., `(?-x: )`.
|
||||
`\n`, `\t`, etc. For escaping a single space character, you can escape it
|
||||
directly with `\ `, use its hex character code `\x20` or temporarily disable
|
||||
the `x` flag, e.g., `(?-x: )`.
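For instance, a short illustrative snippet that matches a literal space via its `\x20` escape while `x` mode is active:

```rust
use regex::Regex;

fn main() {
    let re = Regex::new(r"(?x)
        \d{4} \x20 \d{2}   # a year, one literal space, then a month
    ").unwrap();
    assert!(re.is_match("2021 06"));
}
```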
|
||||
# Example: match multiple regular expressions simultaneously
|
||||
|
||||
|
@ -216,7 +209,7 @@ Unicode scalar values. This means you can use Unicode characters directly
|
|||
in your expression:
|
||||
|
||||
```rust
|
||||
# extern crate regex; use regex::Regex;
|
||||
# use regex::Regex;
|
||||
# fn main() {
|
||||
let re = Regex::new(r"(?i)Δ+").unwrap();
|
||||
let mat = re.find("ΔδΔ").unwrap();
|
||||
|
@ -243,7 +236,7 @@ of boolean properties are available as character classes. For example, you can
|
|||
match a sequence of numerals, Greek or Cherokee letters:
|
||||
|
||||
```rust
|
||||
# extern crate regex; use regex::Regex;
|
||||
# use regex::Regex;
|
||||
# fn main() {
|
||||
let re = Regex::new(r"[\pN\p{Greek}\p{Cherokee}]+").unwrap();
|
||||
let mat = re.find("abcΔᎠβⅠᏴγδⅡxyz").unwrap();
|
||||
|
@ -252,7 +245,7 @@ assert_eq!((mat.start(), mat.end()), (3, 23));
|
|||
```
|
||||
|
||||
For a more detailed breakdown of Unicode support with respect to
|
||||
[UTS#18](http://unicode.org/reports/tr18/),
|
||||
[UTS#18](https://unicode.org/reports/tr18/),
|
||||
please see the
|
||||
[UNICODE](https://github.com/rust-lang/regex/blob/master/UNICODE.md)
|
||||
document in the root of the regex repository.
|
||||
|
@ -364,7 +357,7 @@ $ the end of text (or end-of-line with multi-line mode)
|
|||
|
||||
<pre class="rust">
|
||||
(exp) numbered capture group (indexed by opening parenthesis)
|
||||
(?P<name>exp) named (also numbered) capture group (allowed chars: [_0-9a-zA-Z])
|
||||
(?P<name>exp) named (also numbered) capture group (allowed chars: [_0-9a-zA-Z.\[\]])
|
||||
(?:exp) non-capturing group
|
||||
(?flags) set flags within current group
|
||||
(?flags:exp) set flags for exp (non-capturing)
|
||||
|
@ -390,7 +383,7 @@ Flags can be toggled within a pattern. Here's an example that matches
|
|||
case-insensitively for the first part but case-sensitively for the second part:
|
||||
|
||||
```rust
|
||||
# extern crate regex; use regex::Regex;
|
||||
# use regex::Regex;
|
||||
# fn main() {
|
||||
let re = Regex::new(r"(?i)a+(?-i)b+").unwrap();
|
||||
let cap = re.captures("AaAaAbbBBBb").unwrap();
|
||||
|
@ -424,7 +417,7 @@ Here is an example that uses an ASCII word boundary instead of a Unicode
|
|||
word boundary:
|
||||
|
||||
```rust
|
||||
# extern crate regex; use regex::Regex;
|
||||
# use regex::Regex;
|
||||
# fn main() {
|
||||
let re = Regex::new(r"(?-u:\b).+(?-u:\b)").unwrap();
|
||||
let cap = re.captures("$$abc$$").unwrap();
|
||||
|
@ -454,7 +447,7 @@ assert_eq!(&cap[0], "abc");
|
|||
## Perl character classes (Unicode friendly)
|
||||
|
||||
These classes are based on the definitions provided in
|
||||
[UTS#18](http://www.unicode.org/reports/tr18/#Compatibility_Properties):
|
||||
[UTS#18](https://www.unicode.org/reports/tr18/#Compatibility_Properties):
|
||||
|
||||
<pre class="rust">
|
||||
\d digit (\p{Nd})
|
||||
|
@ -522,11 +515,6 @@ All features below are enabled by default.
|
|||
Enables all performance related features. This feature is enabled by default
|
||||
and will always cover all features that improve performance, even if more
|
||||
are added in the future.
|
||||
* **perf-cache** -
|
||||
Enables the use of very fast thread safe caching for internal match state.
|
||||
When this is disabled, caching is still used, but with a slower and simpler
|
||||
implementation. Disabling this drops the `thread_local` and `lazy_static`
|
||||
dependencies.
|
||||
* **perf-dfa** -
|
||||
Enables the use of a lazy DFA for matching. The lazy DFA is used to compile
|
||||
portions of a regex to a very fast DFA on an as-needed basis. This can
|
||||
|
@ -541,6 +529,11 @@ All features below are enabled by default.
|
|||
Enables the use of literal optimizations for speeding up matches. In some
|
||||
cases, literal optimizations can result in speedups of _several_ orders of
|
||||
magnitude. Disabling this drops the `aho-corasick` and `memchr` dependencies.
|
||||
* **perf-cache** -
|
||||
This feature used to enable a faster internal cache at the cost of using
|
||||
additional dependencies, but this is no longer an option. A fast internal
|
||||
cache is now used unconditionally with no additional dependencies. This may
|
||||
change in the future.
|
||||
|
||||
### Unicode features
|
||||
|
||||
|
@ -561,7 +554,7 @@ All features below are enabled by default.
|
|||
[Unicode's "simple loose matches" specification](https://www.unicode.org/reports/tr18/#Simple_Loose_Matches).
|
||||
* **unicode-gencat** -
|
||||
Provide the data for
|
||||
[Uncode general categories](https://www.unicode.org/reports/tr44/tr44-24.html#General_Category_Values).
|
||||
[Unicode general categories](https://www.unicode.org/reports/tr44/tr44-24.html#General_Category_Values).
|
||||
This includes, but is not limited to, `Decimal_Number`, `Letter`,
|
||||
`Math_Symbol`, `Number` and `Punctuation`.
|
||||
* **unicode-perl** -
|
||||
|
@ -613,39 +606,30 @@ another matching engine with fixed memory requirements.
|
|||
*/
|
||||
|
||||
#![deny(missing_docs)]
|
||||
#![cfg_attr(test, deny(warnings))]
|
||||
#![cfg_attr(feature = "pattern", feature(pattern))]
|
||||
#![warn(missing_debug_implementations)]
|
||||
|
||||
#[cfg(not(feature = "std"))]
|
||||
compile_error!("`std` feature is currently required to build this crate");
|
||||
|
||||
#[cfg(feature = "perf-literal")]
|
||||
extern crate aho_corasick;
|
||||
#[cfg(test)]
|
||||
extern crate doc_comment;
|
||||
#[cfg(feature = "perf-literal")]
|
||||
extern crate memchr;
|
||||
#[cfg(test)]
|
||||
#[cfg_attr(feature = "perf-literal", macro_use)]
|
||||
extern crate quickcheck;
|
||||
extern crate regex_syntax as syntax;
|
||||
#[cfg(feature = "perf-cache")]
|
||||
extern crate thread_local;
|
||||
|
||||
#[cfg(test)]
|
||||
doc_comment::doctest!("../README.md");
|
||||
// To check README's example
|
||||
// TODO: Re-enable this once the MSRV is 1.43 or greater.
|
||||
// See: https://github.com/rust-lang/regex/issues/684
|
||||
// See: https://github.com/rust-lang/regex/issues/685
|
||||
// #[cfg(doctest)]
|
||||
// doc_comment::doctest!("../README.md");
|
||||
|
||||
#[cfg(feature = "std")]
|
||||
pub use error::Error;
|
||||
pub use crate::error::Error;
|
||||
#[cfg(feature = "std")]
|
||||
pub use re_builder::set_unicode::*;
|
||||
pub use crate::re_builder::set_unicode::*;
|
||||
#[cfg(feature = "std")]
|
||||
pub use re_builder::unicode::*;
|
||||
pub use crate::re_builder::unicode::*;
|
||||
#[cfg(feature = "std")]
|
||||
pub use re_set::unicode::*;
|
||||
pub use crate::re_set::unicode::*;
|
||||
#[cfg(feature = "std")]
|
||||
#[cfg(feature = "std")]
|
||||
pub use re_unicode::{
|
||||
pub use crate::re_unicode::{
|
||||
escape, CaptureLocations, CaptureMatches, CaptureNames, Captures,
|
||||
Locations, Match, Matches, NoExpand, Regex, Replacer, ReplacerRef, Split,
|
||||
SplitN, SubCaptureMatches,
|
||||
|
@ -730,8 +714,8 @@ Unicode codepoints. For example, in ASCII compatible mode, `\xFF` matches the
|
|||
literal byte `\xFF`, while in Unicode mode, `\xFF` is a Unicode codepoint that
|
||||
matches its UTF-8 encoding of `\xC3\xBF`. Similarly for octal notation when
|
||||
enabled.
|
||||
6. `.` matches any *byte* except for `\n` instead of any Unicode scalar value.
|
||||
When the `s` flag is enabled, `.` matches any byte.
|
||||
6. In ASCII compatible mode, `.` matches any *byte* except for `\n`. When the
|
||||
`s` flag is additionally enabled, `.` matches any byte.
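For example, an illustrative snippet using the `bytes` API on a non-UTF-8 haystack:

```rust
use regex::bytes::Regex;

fn main() {
    // With Unicode disabled, `\xFF` is the literal byte 0xFF and `.` matches
    // any byte except `\n`, so invalid UTF-8 is fine to search.
    let re = Regex::new(r"(?-u)\xFF.").unwrap();
    let m = re.find(b"ab\xFF\xFEcd").unwrap();
    assert_eq!((m.start(), m.end()), (2, 4));
}
```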
|
||||
# Performance
|
||||
|
||||
|
@ -740,14 +724,13 @@ performance on `&str`.
|
|||
*/
|
||||
#[cfg(feature = "std")]
|
||||
pub mod bytes {
|
||||
pub use re_builder::bytes::*;
|
||||
pub use re_builder::set_bytes::*;
|
||||
pub use re_bytes::*;
|
||||
pub use re_set::bytes::*;
|
||||
pub use crate::re_builder::bytes::*;
|
||||
pub use crate::re_builder::set_bytes::*;
|
||||
pub use crate::re_bytes::*;
|
||||
pub use crate::re_set::bytes::*;
|
||||
}
|
||||
|
||||
mod backtrack;
|
||||
mod cache;
|
||||
mod compile;
|
||||
#[cfg(feature = "perf-dfa")]
|
||||
mod dfa;
|
||||
|
@ -755,13 +738,12 @@ mod error;
|
|||
mod exec;
|
||||
mod expand;
|
||||
mod find_byte;
|
||||
#[cfg(feature = "perf-literal")]
|
||||
mod freqs;
|
||||
mod input;
|
||||
mod literal;
|
||||
#[cfg(feature = "pattern")]
|
||||
mod pattern;
|
||||
mod pikevm;
|
||||
mod pool;
|
||||
mod prog;
|
||||
mod re_builder;
|
||||
mod re_bytes;
|
||||
|
@ -777,9 +759,9 @@ mod utf8;
|
|||
#[doc(hidden)]
|
||||
#[cfg(feature = "std")]
|
||||
pub mod internal {
|
||||
pub use compile::Compiler;
|
||||
pub use exec::{Exec, ExecBuilder};
|
||||
pub use input::{Char, CharInput, Input, InputAt};
|
||||
pub use literal::LiteralSearcher;
|
||||
pub use prog::{EmptyLook, Inst, InstRanges, Program};
|
||||
pub use crate::compile::Compiler;
|
||||
pub use crate::exec::{Exec, ExecBuilder};
|
||||
pub use crate::input::{Char, CharInput, Input, InputAt};
|
||||
pub use crate::literal::LiteralSearcher;
|
||||
pub use crate::prog::{EmptyLook, Inst, InstRanges, Program};
|
||||
}
|
||||
|
|
|
@ -1,11 +1,8 @@
|
|||
use std::cmp;
|
||||
use std::mem;
|
||||
|
||||
use aho_corasick::{self, packed, AhoCorasick, AhoCorasickBuilder};
|
||||
use memchr::{memchr, memchr2, memchr3};
|
||||
use syntax::hir::literal::{Literal, Literals};
|
||||
|
||||
use freqs::BYTE_FREQUENCIES;
|
||||
use memchr::{memchr, memchr2, memchr3, memmem};
|
||||
use regex_syntax::hir::literal::{Literal, Literals};
|
||||
|
||||
/// A prefix extracted from a compiled regular expression.
|
||||
///
|
||||
|
@ -15,8 +12,8 @@ use freqs::BYTE_FREQUENCIES;
|
|||
#[derive(Clone, Debug)]
|
||||
pub struct LiteralSearcher {
|
||||
complete: bool,
|
||||
lcp: FreqyPacked,
|
||||
lcs: FreqyPacked,
|
||||
lcp: Memmem,
|
||||
lcs: Memmem,
|
||||
matcher: Matcher,
|
||||
}
|
||||
|
||||
|
@ -26,10 +23,8 @@ enum Matcher {
|
|||
Empty,
|
||||
/// A set of four or more single byte literals.
|
||||
Bytes(SingleByteSet),
|
||||
/// A single substring, find using memchr and frequency analysis.
|
||||
FreqyPacked(FreqyPacked),
|
||||
/// A single substring, find using Boyer-Moore.
|
||||
BoyerMoore(BoyerMooreSearch),
|
||||
/// A single substring, using vector accelerated routines when available.
|
||||
Memmem(Memmem),
|
||||
/// An Aho-Corasick automaton.
|
||||
AC { ac: AhoCorasick<u32>, lits: Vec<Literal> },
|
||||
/// A packed multiple substring searcher, using SIMD.
|
||||
|
@ -63,8 +58,8 @@ impl LiteralSearcher {
|
|||
let complete = lits.all_complete();
|
||||
LiteralSearcher {
|
||||
complete: complete,
|
||||
lcp: FreqyPacked::new(lits.longest_common_prefix().to_vec()),
|
||||
lcs: FreqyPacked::new(lits.longest_common_suffix().to_vec()),
|
||||
lcp: Memmem::new(lits.longest_common_prefix()),
|
||||
lcs: Memmem::new(lits.longest_common_suffix()),
|
||||
matcher: matcher,
|
||||
}
|
||||
}
|
||||
|
@ -72,7 +67,7 @@ impl LiteralSearcher {
|
|||
/// Returns true if all matches comprise the entire regular expression.
|
||||
///
|
||||
/// This does not necessarily mean that a literal match implies a match
|
||||
/// of the regular expression. For example, the regular expresison `^a`
|
||||
/// of the regular expression. For example, the regular expression `^a`
|
||||
/// is comprised of a single complete literal `a`, but the regular
|
||||
/// expression demands that it only match at the beginning of a string.
|
||||
pub fn complete(&self) -> bool {
|
||||
|
@ -86,8 +81,7 @@ impl LiteralSearcher {
|
|||
match self.matcher {
|
||||
Empty => Some((0, 0)),
|
||||
Bytes(ref sset) => sset.find(haystack).map(|i| (i, i + 1)),
|
||||
FreqyPacked(ref s) => s.find(haystack).map(|i| (i, i + s.len())),
|
||||
BoyerMoore(ref s) => s.find(haystack).map(|i| (i, i + s.len())),
|
||||
Memmem(ref s) => s.find(haystack).map(|i| (i, i + s.len())),
|
||||
AC { ref ac, .. } => {
|
||||
ac.find(haystack).map(|m| (m.start(), m.end()))
|
||||
}
|
||||
|
@ -124,24 +118,23 @@ impl LiteralSearcher {
|
|||
}
|
||||
|
||||
/// Returns an iterator over all literals to be matched.
|
||||
pub fn iter(&self) -> LiteralIter {
|
||||
pub fn iter(&self) -> LiteralIter<'_> {
|
||||
match self.matcher {
|
||||
Matcher::Empty => LiteralIter::Empty,
|
||||
Matcher::Bytes(ref sset) => LiteralIter::Bytes(&sset.dense),
|
||||
Matcher::FreqyPacked(ref s) => LiteralIter::Single(&s.pat),
|
||||
Matcher::BoyerMoore(ref s) => LiteralIter::Single(&s.pattern),
|
||||
Matcher::Memmem(ref s) => LiteralIter::Single(&s.finder.needle()),
|
||||
Matcher::AC { ref lits, .. } => LiteralIter::AC(lits),
|
||||
Matcher::Packed { ref lits, .. } => LiteralIter::Packed(lits),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns a matcher for the longest common prefix of this matcher.
|
||||
pub fn lcp(&self) -> &FreqyPacked {
|
||||
pub fn lcp(&self) -> &Memmem {
|
||||
&self.lcp
|
||||
}
|
||||
|
||||
/// Returns a matcher for the longest common suffix of this matcher.
|
||||
pub fn lcs(&self) -> &FreqyPacked {
|
||||
pub fn lcs(&self) -> &Memmem {
|
||||
&self.lcs
|
||||
}
|
||||
|
||||
|
@ -156,8 +149,7 @@ impl LiteralSearcher {
|
|||
match self.matcher {
|
||||
Empty => 0,
|
||||
Bytes(ref sset) => sset.dense.len(),
|
||||
FreqyPacked(_) => 1,
|
||||
BoyerMoore(_) => 1,
|
||||
Memmem(_) => 1,
|
||||
AC { ref ac, .. } => ac.pattern_count(),
|
||||
Packed { ref lits, .. } => lits.len(),
|
||||
}
|
||||
|
@ -169,8 +161,7 @@ impl LiteralSearcher {
|
|||
match self.matcher {
|
||||
Empty => 0,
|
||||
Bytes(ref sset) => sset.approximate_size(),
|
||||
FreqyPacked(ref single) => single.approximate_size(),
|
||||
BoyerMoore(ref single) => single.approximate_size(),
|
||||
Memmem(ref single) => single.approximate_size(),
|
||||
AC { ref ac, .. } => ac.heap_bytes(),
|
||||
Packed { ref s, .. } => s.heap_bytes(),
|
||||
}
|
||||
|
@ -205,12 +196,7 @@ impl Matcher {
|
|||
return Matcher::Bytes(sset);
|
||||
}
|
||||
if lits.literals().len() == 1 {
|
||||
let lit = lits.literals()[0].to_vec();
|
||||
if BoyerMooreSearch::should_use(lit.as_slice()) {
|
||||
return Matcher::BoyerMoore(BoyerMooreSearch::new(lit));
|
||||
} else {
|
||||
return Matcher::FreqyPacked(FreqyPacked::new(lit));
|
||||
}
|
||||
return Matcher::Memmem(Memmem::new(&lits.literals()[0]));
|
||||
}
|
||||
|
||||
let pats = lits.literals().to_owned();
|
||||
|
@ -232,6 +218,7 @@ impl Matcher {
|
|||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum LiteralIter<'a> {
|
||||
Empty,
|
||||
Bytes(&'a [u8]),
|
||||
|
@ -366,116 +353,27 @@ impl SingleByteSet {
|
|||
}
|
||||
}
|
||||
|
||||
/// Provides an implementation of fast subtring search using frequency
|
||||
/// analysis.
|
||||
/// A simple wrapper around the memchr crate's memmem implementation.
|
||||
///
|
||||
/// memchr is so fast that we do everything we can to keep the loop in memchr
|
||||
/// for as long as possible. The easiest way to do this is to intelligently
|
||||
/// pick the byte to send to memchr. The best byte is the byte that occurs
|
||||
/// least frequently in the haystack. Since doing frequency analysis on the
|
||||
/// haystack is far too expensive, we compute a set of fixed frequencies up
|
||||
/// front and hard code them in src/freqs.rs. Frequency analysis is done via
|
||||
/// scripts/frequencies.py.
|
||||
/// The API this exposes mirrors the API of previous substring searchers that
|
||||
/// this supplanted.
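As a point of reference, the underlying `memchr::memmem` API that this wrapper stores and delegates to can be used directly like so (standalone usage sketch):

```rust
use memchr::memmem;

fn main() {
    // `into_owned()` yields a `Finder<'static>` that owns its needle, which is
    // what allows it to be stored inside a long-lived searcher.
    let finder = memmem::Finder::new("needle").into_owned();
    assert_eq!(finder.find(b"haystack with a needle in it"), Some(16));
    assert_eq!(finder.needle(), &b"needle"[..]);
}
```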
#[derive(Clone, Debug)]
|
||||
pub struct FreqyPacked {
|
||||
/// The pattern.
|
||||
pat: Vec<u8>,
|
||||
/// The number of Unicode characters in the pattern. This is useful for
|
||||
/// determining the effective length of a pattern when deciding which
|
||||
/// optimizations to perform. A trailing incomplete UTF-8 sequence counts
|
||||
/// as one character.
|
||||
pub struct Memmem {
|
||||
finder: memmem::Finder<'static>,
|
||||
char_len: usize,
|
||||
/// The rarest byte in the pattern, according to pre-computed frequency
|
||||
/// analysis.
|
||||
rare1: u8,
|
||||
/// The offset of the rarest byte in `pat`.
|
||||
rare1i: usize,
|
||||
/// The second rarest byte in the pattern, according to pre-computed
|
||||
/// frequency analysis. (This may be equivalent to the rarest byte.)
|
||||
///
|
||||
/// The second rarest byte is used as a type of guard for quickly detecting
|
||||
/// a mismatch after memchr locates an instance of the rarest byte. This
|
||||
/// is a hedge against pathological cases where the pre-computed frequency
|
||||
/// analysis may be off. (But of course, does not prevent *all*
|
||||
/// pathological cases.)
|
||||
rare2: u8,
|
||||
/// The offset of the second rarest byte in `pat`.
|
||||
rare2i: usize,
|
||||
}
|
||||
|
||||
impl FreqyPacked {
|
||||
fn new(pat: Vec<u8>) -> FreqyPacked {
|
||||
if pat.is_empty() {
|
||||
return FreqyPacked::empty();
|
||||
}
|
||||
|
||||
// Find the rarest two bytes. Try to make them distinct (but it's not
|
||||
// required).
|
||||
let mut rare1 = pat[0];
|
||||
let mut rare2 = pat[0];
|
||||
for b in pat[1..].iter().cloned() {
|
||||
if freq_rank(b) < freq_rank(rare1) {
|
||||
rare1 = b;
|
||||
}
|
||||
}
|
||||
for &b in &pat {
|
||||
if rare1 == rare2 {
|
||||
rare2 = b
|
||||
} else if b != rare1 && freq_rank(b) < freq_rank(rare2) {
|
||||
rare2 = b;
|
||||
}
|
||||
}
|
||||
|
||||
// And find the offsets of their last occurrences.
|
||||
let rare1i = pat.iter().rposition(|&b| b == rare1).unwrap();
|
||||
let rare2i = pat.iter().rposition(|&b| b == rare2).unwrap();
|
||||
|
||||
let char_len = char_len_lossy(&pat);
|
||||
FreqyPacked {
|
||||
pat: pat,
|
||||
char_len: char_len,
|
||||
rare1: rare1,
|
||||
rare1i: rare1i,
|
||||
rare2: rare2,
|
||||
rare2i: rare2i,
|
||||
}
|
||||
}
|
||||
|
||||
fn empty() -> FreqyPacked {
|
||||
FreqyPacked {
|
||||
pat: vec![],
|
||||
char_len: 0,
|
||||
rare1: 0,
|
||||
rare1i: 0,
|
||||
rare2: 0,
|
||||
rare2i: 0,
|
||||
impl Memmem {
|
||||
fn new(pat: &[u8]) -> Memmem {
|
||||
Memmem {
|
||||
finder: memmem::Finder::new(pat).into_owned(),
|
||||
char_len: char_len_lossy(pat),
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg_attr(feature = "perf-inline", inline(always))]
|
||||
pub fn find(&self, haystack: &[u8]) -> Option<usize> {
|
||||
let pat = &*self.pat;
|
||||
if haystack.len() < pat.len() || pat.is_empty() {
|
||||
return None;
|
||||
}
|
||||
let mut i = self.rare1i;
|
||||
while i < haystack.len() {
|
||||
i += match memchr(self.rare1, &haystack[i..]) {
|
||||
None => return None,
|
||||
Some(i) => i,
|
||||
};
|
||||
let start = i - self.rare1i;
|
||||
let end = start + pat.len();
|
||||
if end > haystack.len() {
|
||||
return None;
|
||||
}
|
||||
let aligned = &haystack[start..end];
|
||||
if aligned[self.rare2i] == self.rare2 && aligned == &*self.pat {
|
||||
return Some(start);
|
||||
}
|
||||
i += 1;
|
||||
}
|
||||
None
|
||||
self.finder.find(haystack)
|
||||
}
|
||||
|
||||
#[cfg_attr(feature = "perf-inline", inline(always))]
|
||||
|
@ -483,11 +381,11 @@ impl FreqyPacked {
|
|||
if text.len() < self.len() {
|
||||
return false;
|
||||
}
|
||||
text[text.len() - self.len()..] == *self.pat
|
||||
&text[text.len() - self.len()..] == self.finder.needle()
|
||||
}
|
||||
|
||||
pub fn len(&self) -> usize {
|
||||
self.pat.len()
|
||||
self.finder.needle().len()
|
||||
}
|
||||
|
||||
pub fn char_len(&self) -> usize {
|
||||
|
@ -495,627 +393,10 @@ impl FreqyPacked {
|
|||
}
|
||||
|
||||
fn approximate_size(&self) -> usize {
|
||||
self.pat.len() * mem::size_of::<u8>()
|
||||
self.finder.needle().len() * mem::size_of::<u8>()
|
||||
}
|
||||
}
|
||||
|
||||
fn char_len_lossy(bytes: &[u8]) -> usize {
|
||||
String::from_utf8_lossy(bytes).chars().count()
|
||||
}
|
||||
|
||||
/// An implementation of Tuned Boyer-Moore as laid out by
|
||||
/// Andrew Hume and Daniel Sunday in "Fast String Searching".
|
||||
/// O(n) in the size of the input.
|
||||
///
|
||||
/// Fast string searching algorithms come in many variations,
|
||||
/// but they can generally be described in terms of three main
|
||||
/// components.
|
||||
///
|
||||
/// The skip loop is where the string searcher wants to spend
|
||||
/// as much time as possible. Exactly which character in the
|
||||
/// pattern the skip loop examines varies from algorithm to
|
||||
/// algorithm, but in the simplest case this loop repeated
|
||||
/// looks at the last character in the pattern and jumps
|
||||
/// forward in the input if it is not in the pattern.
|
||||
/// Robert Boyer and J Moore called this the "fast" loop in
|
||||
/// their original paper.
|
||||
///
|
||||
/// The match loop is responsible for actually examining the
|
||||
/// whole potentially matching substring. In order to fail
|
||||
/// faster, the match loop sometimes has a guard test attached.
|
||||
/// The guard test uses frequency analysis of the different
|
||||
/// characters in the pattern to choose the least frequency
|
||||
/// occurring character and use it to find match failures
|
||||
/// as quickly as possible.
|
||||
///
|
||||
/// The shift rule governs how the algorithm will shuffle its
|
||||
/// test window in the event of a failure during the match loop.
|
||||
/// Certain shift rules allow the worst-case run time of the
|
||||
/// algorithm to be shown to be O(n) in the size of the input
|
||||
/// rather than O(nm) in the size of the input and the size
|
||||
/// of the pattern (as naive Boyer-Moore is).
|
||||
///
|
||||
/// "Fast String Searching", in addition to presenting a tuned
|
||||
/// algorithm, provides a comprehensive taxonomy of the many
|
||||
/// different flavors of string searchers. Under that taxonomy
|
||||
/// TBM, the algorithm implemented here, uses an unrolled fast
|
||||
/// skip loop with memchr fallback, a forward match loop with guard,
|
||||
/// and the mini Sunday's delta shift rule. To unpack that you'll have to
|
||||
/// read the paper.
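For orientation, here is a deliberately simplified Horspool-style search that uses only the bad-character skip table; it omits the guard test, the unrolled skip loop, and the memchr fallback described above, so it is a sketch of the general family rather than the tuned algorithm this struct implements.

```rust
// Simplified Horspool search: shift the window by how far the window's last
// byte is from the end of the needle (or the whole needle length if absent).
fn horspool_find(haystack: &[u8], needle: &[u8]) -> Option<usize> {
    if needle.is_empty() || haystack.len() < needle.len() {
        return None;
    }
    // Skip distances indexed by byte value; bytes not in the needle (or only
    // at its last position) allow a full-length shift.
    let mut skip = [needle.len(); 256];
    for (i, &b) in needle[..needle.len() - 1].iter().enumerate() {
        skip[b as usize] = needle.len() - 1 - i;
    }
    let mut pos = 0;
    while pos + needle.len() <= haystack.len() {
        if &haystack[pos..pos + needle.len()] == needle {
            return Some(pos);
        }
        let last = haystack[pos + needle.len() - 1];
        pos += skip[last as usize];
    }
    None
}

fn main() {
    assert_eq!(horspool_find(b"look for the needle here", b"needle"), Some(13));
    assert_eq!(horspool_find(b"no match here", b"needle"), None);
}
```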
#[derive(Clone, Debug)]
|
||||
pub struct BoyerMooreSearch {
|
||||
/// The pattern we are going to look for in the haystack.
|
||||
pattern: Vec<u8>,
|
||||
|
||||
/// The skip table for the skip loop.
|
||||
///
|
||||
/// Maps the character at the end of the input
|
||||
/// to a shift.
|
||||
skip_table: Vec<usize>,
|
||||
|
||||
/// The guard character (least frequently occurring char).
|
||||
guard: u8,
|
||||
/// The reverse-index of the guard character in the pattern.
|
||||
guard_reverse_idx: usize,
|
||||
|
||||
/// Daniel Sunday's mini generalized delta2 shift table.
|
||||
///
|
||||
/// We use a skip loop, so we only have to provide a shift
|
||||
/// for the skip char (last char). This is why it is a mini
|
||||
/// shift rule.
|
||||
md2_shift: usize,
|
||||
}
|
||||
|
||||
impl BoyerMooreSearch {
|
||||
/// Create a new string searcher, performing whatever
|
||||
/// compilation steps are required.
|
||||
fn new(pattern: Vec<u8>) -> Self {
|
||||
debug_assert!(!pattern.is_empty());
|
||||
|
||||
let (g, gi) = Self::select_guard(pattern.as_slice());
|
||||
let skip_table = Self::compile_skip_table(pattern.as_slice());
|
||||
let md2_shift = Self::compile_md2_shift(pattern.as_slice());
|
||||
BoyerMooreSearch {
|
||||
pattern: pattern,
|
||||
skip_table: skip_table,
|
||||
guard: g,
|
||||
guard_reverse_idx: gi,
|
||||
md2_shift: md2_shift,
|
||||
}
|
||||
}
|
||||
|
||||
/// Find the pattern in `haystack`, returning the offset
|
||||
/// of the start of the first occurrence of the pattern
|
||||
/// in `haystack`.
|
||||
#[inline]
|
||||
fn find(&self, haystack: &[u8]) -> Option<usize> {
|
||||
if haystack.len() < self.pattern.len() {
|
||||
return None;
|
||||
}
|
||||
|
||||
let mut window_end = self.pattern.len() - 1;
|
||||
|
||||
// Inspired by the grep source. It is a way
|
||||
// to do correct loop unrolling without having to place
|
||||
// a crashpad of terminating charicters at the end in
|
||||
// the way described in the Fast String Searching paper.
|
||||
const NUM_UNROLL: usize = 10;
|
||||
// 1 for the initial position, and 1 for the md2 shift
|
||||
let short_circut = (NUM_UNROLL + 2) * self.pattern.len();
|
||||
|
||||
if haystack.len() > short_circut {
|
||||
// just 1 for the md2 shift
|
||||
let backstop =
|
||||
haystack.len() - ((NUM_UNROLL + 1) * self.pattern.len());
|
||||
loop {
|
||||
window_end =
|
||||
match self.skip_loop(haystack, window_end, backstop) {
|
||||
Some(i) => i,
|
||||
None => return None,
|
||||
};
|
||||
if window_end >= backstop {
|
||||
break;
|
||||
}
|
||||
|
||||
if self.check_match(haystack, window_end) {
|
||||
return Some(window_end - (self.pattern.len() - 1));
|
||||
} else {
|
||||
let skip = self.skip_table[haystack[window_end] as usize];
|
||||
window_end +=
|
||||
if skip == 0 { self.md2_shift } else { skip };
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// now process the input after the backstop
|
||||
while window_end < haystack.len() {
|
||||
let mut skip = self.skip_table[haystack[window_end] as usize];
|
||||
if skip == 0 {
|
||||
if self.check_match(haystack, window_end) {
|
||||
return Some(window_end - (self.pattern.len() - 1));
|
||||
} else {
|
||||
skip = self.md2_shift;
|
||||
}
|
||||
}
|
||||
window_end += skip;
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
|
||||
fn len(&self) -> usize {
|
||||
return self.pattern.len();
|
||||
}
|
||||
|
||||
/// The key heuristic behind which the BoyerMooreSearch lives.
|
||||
///
|
||||
/// See `rust-lang/regex/issues/408`.
|
||||
///
|
||||
/// Tuned Boyer-Moore is actually pretty slow! It turns out a handrolled
|
||||
/// platform-specific memchr routine with a bit of frequency
|
||||
/// analysis sprinkled on top actually wins most of the time.
|
||||
/// However, there are a few cases where Tuned Boyer-Moore still
|
||||
/// wins.
|
||||
///
|
||||
/// If the haystack is random, frequency analysis doesn't help us,
|
||||
/// so Boyer-Moore will win for sufficiently large needles.
|
||||
/// Unfortunately, there is no obvious way to determine this
|
||||
/// ahead of time.
|
||||
///
|
||||
/// If the pattern itself consists of very common characters,
|
||||
/// frequency analysis won't get us anywhere. The most extreme
|
||||
/// example of this is a pattern like `eeeeeeeeeeeeeeee`. Fortunately,
|
||||
/// this case is wholly determined by the pattern, so we can actually
|
||||
/// implement the heuristic.
|
||||
///
|
||||
/// A third case is if the pattern is sufficiently long. The idea
|
||||
/// here is that once the pattern gets long enough the Tuned
|
||||
/// Boyer-Moore skip loop will start making strides long enough
|
||||
/// to beat the asm deep magic that is memchr.
|
||||
fn should_use(pattern: &[u8]) -> bool {
|
||||
// The minimum pattern length required to use TBM.
|
||||
const MIN_LEN: usize = 9;
|
||||
// The minimum frequency rank (lower is rarer) that every byte in the
|
||||
// pattern must have in order to use TBM. That is, if the pattern
|
||||
// contains _any_ byte with a lower rank, then TBM won't be used.
|
||||
const MIN_CUTOFF: usize = 150;
|
||||
// The maximum frequency rank for any byte.
|
||||
const MAX_CUTOFF: usize = 255;
|
||||
// The scaling factor used to determine the actual cutoff frequency
|
||||
// to use (keeping in mind that the minimum frequency rank is bounded
|
||||
// by MIN_CUTOFF). This scaling factor is an attempt to make TBM more
|
||||
// likely to be used as the pattern grows longer. That is, longer
|
||||
// patterns permit somewhat less frequent bytes than shorter patterns,
|
||||
// under the assumption that TBM gets better as the pattern gets
|
||||
// longer.
|
||||
const LEN_CUTOFF_PROPORTION: usize = 4;
|
||||
|
||||
let scaled_rank = pattern.len().wrapping_mul(LEN_CUTOFF_PROPORTION);
|
||||
let cutoff = cmp::max(
|
||||
MIN_CUTOFF,
|
||||
MAX_CUTOFF - cmp::min(MAX_CUTOFF, scaled_rank),
|
||||
);
|
||||
// The pattern must be long enough to be worthwhile. e.g., memchr will
|
||||
// be faster on `e` because it is short even though e is quite common.
|
||||
pattern.len() > MIN_LEN
|
||||
// all the bytes must be more common than the cutoff.
|
||||
&& pattern.iter().all(|c| freq_rank(*c) >= cutoff)
|
||||
}
|
||||
|
||||
/// Check to see if there is a match at the given position
|
||||
#[inline]
|
||||
fn check_match(&self, haystack: &[u8], window_end: usize) -> bool {
|
||||
// guard test
|
||||
if haystack[window_end - self.guard_reverse_idx] != self.guard {
|
||||
return false;
|
||||
}
|
||||
|
||||
// match loop
|
||||
let window_start = window_end - (self.pattern.len() - 1);
|
||||
for i in 0..self.pattern.len() {
|
||||
if self.pattern[i] != haystack[window_start + i] {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
true
|
||||
}
|
||||
|
||||
/// Skip forward according to the shift table.
|
||||
///
|
||||
/// Returns the offset of the next occurrence
|
||||
/// of the last char in the pattern, or `None`
|
||||
/// if it never reappears. If `skip_loop` hits the backstop
|
||||
/// it will leave early.
|
||||
#[inline]
|
||||
fn skip_loop(
|
||||
&self,
|
||||
haystack: &[u8],
|
||||
mut window_end: usize,
|
||||
backstop: usize,
|
||||
) -> Option<usize> {
|
||||
let window_end_snapshot = window_end;
|
||||
let skip_of = |we: usize| -> usize {
|
||||
// Unsafe might make this faster, but the benchmarks
|
||||
// were hard to interpret.
|
||||
self.skip_table[haystack[we] as usize]
|
||||
};
|
||||
|
||||
loop {
|
||||
let mut skip = skip_of(window_end);
|
||||
window_end += skip;
|
||||
skip = skip_of(window_end);
|
||||
window_end += skip;
|
||||
if skip != 0 {
|
||||
skip = skip_of(window_end);
|
||||
window_end += skip;
|
||||
skip = skip_of(window_end);
|
||||
window_end += skip;
|
||||
skip = skip_of(window_end);
|
||||
window_end += skip;
|
||||
if skip != 0 {
|
||||
skip = skip_of(window_end);
|
||||
window_end += skip;
|
||||
skip = skip_of(window_end);
|
||||
window_end += skip;
|
||||
skip = skip_of(window_end);
|
||||
window_end += skip;
|
||||
if skip != 0 {
|
||||
skip = skip_of(window_end);
|
||||
window_end += skip;
|
||||
skip = skip_of(window_end);
|
||||
window_end += skip;
|
||||
|
||||
// If ten iterations did not make at least 16 words
|
||||
// worth of progress, we just fall back on memchr.
|
||||
if window_end - window_end_snapshot
|
||||
> 16 * mem::size_of::<usize>()
|
||||
{
|
||||
// Returning a window_end >= backstop will
|
||||
// immediately break us out of the inner loop in
|
||||
// `find`.
|
||||
if window_end >= backstop {
|
||||
return Some(window_end);
|
||||
}
|
||||
|
||||
continue; // we made enough progress
|
||||
} else {
|
||||
// In case we are already there, and so that
|
||||
// we will catch the guard char.
|
||||
window_end = window_end
|
||||
.checked_sub(1 + self.guard_reverse_idx)
|
||||
.unwrap_or(0);
|
||||
|
||||
match memchr(self.guard, &haystack[window_end..]) {
|
||||
None => return None,
|
||||
Some(g_idx) => {
|
||||
return Some(
|
||||
window_end
|
||||
+ g_idx
|
||||
+ self.guard_reverse_idx,
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return Some(window_end);
|
||||
}
|
||||
}
|
||||
|
||||
/// Compute the ufast skip table.
|
||||
fn compile_skip_table(pattern: &[u8]) -> Vec<usize> {
|
||||
let mut tab = vec![pattern.len(); 256];
|
||||
|
||||
// For every char in the pattern, we write a skip
|
||||
// that will line us up with the rightmost occurrence.
|
||||
//
|
||||
// N.B. the sentinel (0) is written by the last
|
||||
// loop iteration.
|
||||
for (i, c) in pattern.iter().enumerate() {
|
||||
tab[*c as usize] = (pattern.len() - 1) - i;
|
||||
}
|
||||
|
||||
tab
|
||||
}
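// Illustrative sketch (not part of this change) of the table produced for a
// hypothetical pattern b"abcab": bytes not in the pattern skip a full pattern
// length, and the last byte gets the 0 sentinel.
//
//     fn main() {
//         let pattern = b"abcab";
//         let mut tab = vec![pattern.len(); 256];
//         for (i, c) in pattern.iter().enumerate() {
//             tab[*c as usize] = (pattern.len() - 1) - i;
//         }
//         assert_eq!(tab[b'a' as usize], 1); // rightmost 'a' is 1 from the end
//         assert_eq!(tab[b'b' as usize], 0); // 'b' is the last byte: sentinel
//         assert_eq!(tab[b'c' as usize], 2);
//         assert_eq!(tab[b'q' as usize], 5); // absent byte: skip whole window
//     }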
|
||||
|
||||
/// Select the guard character based off of the precomputed
|
||||
/// frequency table.
|
||||
fn select_guard(pattern: &[u8]) -> (u8, usize) {
|
||||
let mut rarest = pattern[0];
|
||||
let mut rarest_rev_idx = pattern.len() - 1;
|
||||
for (i, c) in pattern.iter().enumerate() {
|
||||
if freq_rank(*c) < freq_rank(rarest) {
|
||||
rarest = *c;
|
||||
rarest_rev_idx = (pattern.len() - 1) - i;
|
||||
}
|
||||
}
|
||||
|
||||
(rarest, rarest_rev_idx)
|
||||
}
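// Illustrative sketch (not part of this change): assuming BYTE_FREQUENCIES
// ranks b'z' as rarer than b'e' (a reasonable but unverified assumption, since
// the table reflects typical text), select_guard(b"ezez") yields
//
//     (b'z', 2)   // guard = 'z', guard_reverse_idx = 2, i.e. the 'z' at index 1
//
// because a later occurrence only replaces the current choice when it is
// strictly rarer, so the leftmost occurrence of the rarest byte wins.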
|
||||
|
||||
/// If there is another occurrence of the skip
|
||||
/// char, shift to it, otherwise just shift to
|
||||
/// the next window.
|
||||
fn compile_md2_shift(pattern: &[u8]) -> usize {
|
||||
let shiftc = *pattern.last().unwrap();
|
||||
|
||||
// For a pattern of length 1 we will never apply the
|
||||
// shift rule, so we use a poison value on the principle
|
||||
// that failing fast is a good thing.
|
||||
if pattern.len() == 1 {
|
||||
return 0xDEADBEAF;
|
||||
}
|
||||
|
||||
let mut i = pattern.len() - 2;
|
||||
while i > 0 {
|
||||
if pattern[i] == shiftc {
|
||||
return (pattern.len() - 1) - i;
|
||||
}
|
||||
i -= 1;
|
||||
}
|
||||
|
||||
// The skip char never re-occurs in the pattern, so
|
||||
// we can just shift the whole window length.
|
||||
pattern.len() - 1
|
||||
}
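// Illustrative sketch (not part of this change): for a hypothetical pattern
// b"abcab" the last byte b'b' also occurs at index 1, so the md2 shift is
// (5 - 1) - 1 = 3. For b"abc" the last byte never re-occurs, so the shift is
// the full window length minus one, i.e. 2.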
|
||||
|
||||
fn approximate_size(&self) -> usize {
|
||||
(self.pattern.len() * mem::size_of::<u8>())
|
||||
+ (256 * mem::size_of::<usize>()) // skip table
|
||||
}
|
||||
}
|
||||
|
||||
fn freq_rank(b: u8) -> usize {
|
||||
BYTE_FREQUENCIES[b as usize] as usize
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::{BoyerMooreSearch, FreqyPacked};
|
||||
|
||||
//
|
||||
// Unit Tests
|
||||
//
|
||||
|
||||
// The "hello, world" of string searching
|
||||
#[test]
|
||||
fn bm_find_subs() {
|
||||
let searcher = BoyerMooreSearch::new(Vec::from(&b"pattern"[..]));
|
||||
let haystack = b"I keep seeing patterns in this text";
|
||||
assert_eq!(14, searcher.find(haystack).unwrap());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn bm_find_no_subs() {
|
||||
let searcher = BoyerMooreSearch::new(Vec::from(&b"pattern"[..]));
|
||||
let haystack = b"I keep seeing needles in this text";
|
||||
assert_eq!(None, searcher.find(haystack));
|
||||
}
|
||||
|
||||
//
|
||||
// Regression Tests
|
||||
//
|
||||
|
||||
#[test]
|
||||
fn bm_skip_reset_bug() {
|
||||
let haystack = vec![0, 0, 0, 0, 0, 1, 1, 0];
|
||||
let needle = vec![0, 1, 1, 0];
|
||||
|
||||
let searcher = BoyerMooreSearch::new(needle);
|
||||
let offset = searcher.find(haystack.as_slice()).unwrap();
|
||||
assert_eq!(4, offset);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn bm_backstop_underflow_bug() {
|
||||
let haystack = vec![0, 0];
|
||||
let needle = vec![0, 0];
|
||||
|
||||
let searcher = BoyerMooreSearch::new(needle);
|
||||
let offset = searcher.find(haystack.as_slice()).unwrap();
|
||||
assert_eq!(0, offset);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn bm_naive_off_by_one_bug() {
|
||||
let haystack = vec![91];
|
||||
let needle = vec![91];
|
||||
|
||||
let naive_offset = naive_find(&needle, &haystack).unwrap();
|
||||
assert_eq!(0, naive_offset);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn bm_memchr_fallback_indexing_bug() {
|
||||
let mut haystack = vec![
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 87, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
];
|
||||
let needle = vec![1, 1, 1, 1, 32, 32, 87];
|
||||
let needle_start = haystack.len();
|
||||
haystack.extend(needle.clone());
|
||||
|
||||
let searcher = BoyerMooreSearch::new(needle);
|
||||
assert_eq!(needle_start, searcher.find(haystack.as_slice()).unwrap());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn bm_backstop_boundary() {
|
||||
let haystack = b"\
|
||||
// aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
|
||||
e_data.clone_created(entity_id, entity_to_add.entity_id);
|
||||
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
|
||||
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
|
||||
"
|
||||
.to_vec();
|
||||
let needle = b"clone_created".to_vec();
|
||||
|
||||
let searcher = BoyerMooreSearch::new(needle);
|
||||
let result = searcher.find(&haystack);
|
||||
assert_eq!(Some(43), result);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn bm_win_gnu_indexing_bug() {
|
||||
let haystack_raw = vec![
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
];
|
||||
let needle = vec![1, 1, 1, 1, 1, 1, 1];
|
||||
let haystack = haystack_raw.as_slice();
|
||||
|
||||
BoyerMooreSearch::new(needle.clone()).find(haystack);
|
||||
}
|
||||
|
||||
//
|
||||
// QuickCheck Properties
|
||||
//
|
||||
|
||||
use quickcheck::TestResult;
|
||||
|
||||
fn naive_find(needle: &[u8], haystack: &[u8]) -> Option<usize> {
|
||||
assert!(needle.len() <= haystack.len());
|
||||
|
||||
for i in 0..(haystack.len() - (needle.len() - 1)) {
|
||||
if haystack[i] == needle[0]
|
||||
&& &haystack[i..(i + needle.len())] == needle
|
||||
{
|
||||
return Some(i);
|
||||
}
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
|
||||
quickcheck! {
|
||||
fn qc_bm_equals_nieve_find(pile1: Vec<u8>, pile2: Vec<u8>) -> TestResult {
|
||||
if pile1.len() == 0 || pile2.len() == 0 {
|
||||
return TestResult::discard();
|
||||
}
|
||||
|
||||
let (needle, haystack) = if pile1.len() < pile2.len() {
|
||||
(pile1, pile2.as_slice())
|
||||
} else {
|
||||
(pile2, pile1.as_slice())
|
||||
};
|
||||
|
||||
let searcher = BoyerMooreSearch::new(needle.clone());
|
||||
TestResult::from_bool(
|
||||
searcher.find(haystack) == naive_find(&needle, haystack))
|
||||
}
|
||||
|
||||
fn qc_bm_equals_single(pile1: Vec<u8>, pile2: Vec<u8>) -> TestResult {
|
||||
if pile1.len() == 0 || pile2.len() == 0 {
|
||||
return TestResult::discard();
|
||||
}
|
||||
|
||||
let (needle, haystack) = if pile1.len() < pile2.len() {
|
||||
(pile1, pile2.as_slice())
|
||||
} else {
|
||||
(pile2, pile1.as_slice())
|
||||
};
|
||||
|
||||
let bm_searcher = BoyerMooreSearch::new(needle.clone());
|
||||
let freqy_memchr = FreqyPacked::new(needle);
|
||||
TestResult::from_bool(
|
||||
bm_searcher.find(haystack) == freqy_memchr.find(haystack))
|
||||
}
|
||||
|
||||
fn qc_bm_finds_trailing_needle(
|
||||
haystack_pre: Vec<u8>,
|
||||
needle: Vec<u8>
|
||||
) -> TestResult {
|
||||
if needle.len() == 0 {
|
||||
return TestResult::discard();
|
||||
}
|
||||
|
||||
let mut haystack = haystack_pre.clone();
|
||||
let searcher = BoyerMooreSearch::new(needle.clone());
|
||||
|
||||
if haystack.len() >= needle.len() &&
|
||||
searcher.find(haystack.as_slice()).is_some() {
|
||||
return TestResult::discard();
|
||||
}
|
||||
|
||||
haystack.extend(needle.clone());
|
||||
|
||||
// What if the tail of the haystack can start the
|
||||
// needle?
|
||||
let start = haystack_pre.len()
|
||||
.checked_sub(needle.len())
|
||||
.unwrap_or(0);
|
||||
for i in 0..(needle.len() - 1) {
|
||||
if searcher.find(&haystack[(i + start)..]).is_some() {
|
||||
return TestResult::discard();
|
||||
}
|
||||
}
|
||||
|
||||
TestResult::from_bool(
|
||||
searcher.find(haystack.as_slice())
|
||||
.map(|x| x == haystack_pre.len())
|
||||
.unwrap_or(false))
|
||||
}
|
||||
|
||||
// qc_equals_* is only testing the negative case as @burntsushi
|
||||
// pointed out in https://github.com/rust-lang/regex/issues/446.
|
||||
// This quickcheck prop represents an effort to force testing of
|
||||
// the positive case. qc_bm_finds_first and qc_bm_finds_trailing_needle
|
||||
// already check some of the positive cases, but they don't cover
|
||||
// cases where the needle is in the middle of haystack. This prop
|
||||
// fills that hole.
|
||||
fn qc_bm_finds_subslice(
|
||||
haystack: Vec<u8>,
|
||||
needle_start: usize,
|
||||
needle_length: usize
|
||||
) -> TestResult {
|
||||
if haystack.len() == 0 {
|
||||
return TestResult::discard();
|
||||
}
|
||||
|
||||
let needle_start = needle_start % haystack.len();
|
||||
let needle_length = needle_length % (haystack.len() - needle_start);
|
||||
|
||||
if needle_length == 0 {
|
||||
return TestResult::discard();
|
||||
}
|
||||
|
||||
let needle = &haystack[needle_start..(needle_start + needle_length)];
|
||||
|
||||
let bm_searcher = BoyerMooreSearch::new(needle.to_vec());
|
||||
|
||||
let start = naive_find(&needle, &haystack);
|
||||
match start {
|
||||
None => TestResult::from_bool(false),
|
||||
Some(nf_start) =>
|
||||
TestResult::from_bool(
|
||||
nf_start <= needle_start
|
||||
&& bm_searcher.find(&haystack) == start
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
fn qc_bm_finds_first(needle: Vec<u8>) -> TestResult {
|
||||
if needle.len() == 0 {
|
||||
return TestResult::discard();
|
||||
}
|
||||
|
||||
let mut haystack = needle.clone();
|
||||
let searcher = BoyerMooreSearch::new(needle.clone());
|
||||
haystack.extend(needle);
|
||||
|
||||
TestResult::from_bool(
|
||||
searcher.find(haystack.as_slice())
|
||||
.map(|x| x == 0)
|
||||
.unwrap_or(false))
|
||||
}
|
||||
}
|
||||
}
@ -6,7 +6,7 @@ mod imp;
|
|||
#[allow(missing_docs)]
|
||||
#[cfg(not(feature = "perf-literal"))]
|
||||
mod imp {
|
||||
use syntax::hir::literal::Literals;
|
||||
use regex_syntax::hir::literal::Literals;
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct LiteralSearcher(());
@ -1,7 +1,8 @@
|
|||
use std::str::pattern::{Pattern, SearchStep, Searcher};
|
||||
|
||||
use re_unicode::{Matches, Regex};
|
||||
use crate::re_unicode::{Matches, Regex};
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct RegexSearcher<'r, 't> {
|
||||
haystack: &'t str,
|
||||
it: Matches<'r, 't>,
@ -8,7 +8,7 @@
|
|||
//
|
||||
// It can do more than the DFA can (specifically, record capture locations
|
||||
// and execute Unicode word boundary assertions), but at a slower speed.
|
||||
// Specifically, the Pike VM exectues a DFA implicitly by repeatedly expanding
|
||||
// Specifically, the Pike VM executes a DFA implicitly by repeatedly expanding
|
||||
// epsilon transitions. That is, the Pike VM engine can be in multiple states
|
||||
// at once, whereas the DFA is only ever in one state at a time.
|
||||
//
|
||||
|
@ -17,11 +17,11 @@
|
|||
|
||||
use std::mem;
|
||||
|
||||
use exec::ProgramCache;
|
||||
use input::{Input, InputAt};
|
||||
use prog::{InstPtr, Program};
|
||||
use re_trait::Slot;
|
||||
use sparse::SparseSet;
|
||||
use crate::exec::ProgramCache;
|
||||
use crate::input::{Input, InputAt};
|
||||
use crate::prog::{InstPtr, Program};
|
||||
use crate::re_trait::Slot;
|
||||
use crate::sparse::SparseSet;
|
||||
|
||||
/// An NFA simulation matching engine.
|
||||
#[derive(Debug)]
|
||||
|
@ -231,7 +231,7 @@ impl<'r, I: Input> Fsm<'r, I> {
|
|||
at: InputAt,
|
||||
at_next: InputAt,
|
||||
) -> bool {
|
||||
use prog::Inst::*;
|
||||
use crate::prog::Inst::*;
|
||||
match self.prog[ip] {
|
||||
Match(match_slot) => {
|
||||
if match_slot < matches.len() {
|
||||
|
@ -300,7 +300,7 @@ impl<'r, I: Input> Fsm<'r, I> {
|
|||
// traverse the set of states. We only push to the stack when we
|
||||
// absolutely need recursion (restoring captures or following a
|
||||
// branch).
|
||||
use prog::Inst::*;
|
||||
use crate::prog::Inst::*;
|
||||
loop {
|
||||
// Don't visit states we've already added.
|
||||
if nlist.set.contains(ip) {
@ -0,0 +1,333 @@
|
|||
// This module provides a relatively simple thread-safe pool of reusable
|
||||
// objects. For the most part, it's implemented by a stack represented by a
|
||||
// Mutex<Vec<T>>. It has one small trick: because unlocking a mutex is somewhat
|
||||
// costly, in the case where a pool is accessed by the first thread that tried
|
||||
// to get a value, we bypass the mutex. Here are some benchmarks showing the
|
||||
// difference.
|
||||
//
|
||||
// 1) misc::anchored_literal_long_non_match 21 (18571 MB/s)
|
||||
// 2) misc::anchored_literal_long_non_match 107 (3644 MB/s)
|
||||
// 3) misc::anchored_literal_long_non_match 45 (8666 MB/s)
|
||||
// 4) misc::anchored_literal_long_non_match 19 (20526 MB/s)
|
||||
//
|
||||
// (1) represents our baseline: the master branch at the time of writing when
|
||||
// using the 'thread_local' crate to implement the pool below.
|
||||
//
|
||||
// (2) represents a naive pool implemented completely via Mutex<Vec<T>>. There
|
||||
// is no special trick for bypassing the mutex.
|
||||
//
|
||||
// (3) is the same as (2), except it uses Mutex<Vec<Box<T>>>. It is twice as
|
||||
// fast because a Box<T> is much smaller than the T we use with a Pool in this
|
||||
// crate. So pushing and popping a Box<T> from a Vec is quite a bit faster
|
||||
// than for T.
|
||||
//
|
||||
// (4) is the same as (3), but with the trick for bypassing the mutex in the
|
||||
// case of the first-to-get thread.
|
||||
//
|
||||
// Why move off of thread_local? Even though (4) is a hair faster than (1)
|
||||
// above, this was not the main goal. The main goal was to move off of
|
||||
// thread_local and find a way to *simply* re-capture some of its speed for
|
||||
// regex's specific case. So again, why move off of it? The *primary* reason is
|
||||
// because of memory leaks. See https://github.com/rust-lang/regex/issues/362
|
||||
// for example. (Why do I want it to be simple? Well, I suppose what I mean is,
|
||||
// "use as much safe code as possible to minimize risk and be as sure as I can
|
||||
// be that it is correct.")
|
||||
//
|
||||
// My guess is that the thread_local design is probably not appropriate for
|
||||
// regex since its memory usage scales to the number of active threads that
|
||||
// have used a regex, whereas the pool below scales to the number of threads
|
||||
// that simultaneously use a regex. While neither case permits contraction,
|
||||
// since we own the pool data structure below, we can add contraction if a
|
||||
// clear use case pops up in the wild. More pressingly though, it seems that
|
||||
// there are at least some use case patterns where one might have many threads
|
||||
// sitting around that might have used a regex at one point. While thread_local
|
||||
// does try to reuse space previously used by a thread that has since stopped,
|
||||
// its maximal memory usage still scales with the total number of active
|
||||
// threads. In contrast, the pool below scales with the total number of threads
|
||||
// *simultaneously* using the pool. The hope is that this uses less memory
|
||||
// overall. And if it doesn't, we can hopefully tune it somehow.
|
||||
//
|
||||
// It seems that these sort of conditions happen frequently
|
||||
// in FFI inside of other more "managed" languages. This was
|
||||
// mentioned in the issue linked above, and also mentioned here:
|
||||
// https://github.com/BurntSushi/rure-go/issues/3. And in particular, users
|
||||
// confirm that disabling the use of thread_local resolves the leak.
|
||||
//
|
||||
// There were other weaker reasons for moving off of thread_local as well.
|
||||
// Namely, at the time, I was looking to reduce dependencies. And for something
|
||||
// like regex, maintenance can be simpler when we own the full dependency tree.
|
||||
|
||||
use std::panic::{RefUnwindSafe, UnwindSafe};
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
use std::sync::Mutex;
|
||||
|
||||
/// An atomic counter used to allocate thread IDs.
|
||||
static COUNTER: AtomicUsize = AtomicUsize::new(1);
|
||||
|
||||
thread_local!(
|
||||
/// A thread local used to assign an ID to a thread.
|
||||
static THREAD_ID: usize = {
|
||||
let next = COUNTER.fetch_add(1, Ordering::Relaxed);
|
||||
// SAFETY: We cannot permit the reuse of thread IDs since reusing a
|
||||
// thread ID might result in more than one thread "owning" a pool,
|
||||
// and thus, permit accessing a mutable value from multiple threads
|
||||
// simultaneously without synchronization. The intent of this panic is
|
||||
// to be a sanity check. It is not expected that the thread ID space
|
||||
// will actually be exhausted in practice.
|
||||
//
|
||||
// This checks that the counter never wraps around, since atomic
|
||||
// addition wraps around on overflow.
|
||||
if next == 0 {
|
||||
panic!("regex: thread ID allocation space exhausted");
|
||||
}
|
||||
next
|
||||
};
|
||||
);
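// Illustrative sketch (not part of this change): thread IDs are handed out
// monotonically and are never zero, so the 0 sentinel used for an unowned
// Pool below can never collide with a real thread.
//
//     fn main() {
//         let here = THREAD_ID.with(|id| *id);
//         let there = std::thread::spawn(|| THREAD_ID.with(|id| *id))
//             .join()
//             .unwrap();
//         assert!(here != 0 && there != 0 && here != there);
//     }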
|
||||
|
||||
/// The type of the function used to create values in a pool when the pool is
|
||||
/// empty and the caller requests one.
|
||||
type CreateFn<T> =
|
||||
Box<dyn Fn() -> T + Send + Sync + UnwindSafe + RefUnwindSafe + 'static>;
|
||||
|
||||
/// A simple thread safe pool for reusing values.
|
||||
///
|
||||
/// Getting a value out comes with a guard. When that guard is dropped, the
|
||||
/// value is automatically put back in the pool.
|
||||
///
|
||||
/// A Pool<T> impls Sync when T is Send (even if it's not Sync). This means
|
||||
/// that T can use interior mutability. This is possible because a pool is
|
||||
/// guaranteed to provide a value to exactly one thread at any time.
|
||||
///
|
||||
/// Currently, a pool never contracts in size. Its size is proportional to the
|
||||
/// number of simultaneous uses.
|
||||
pub struct Pool<T> {
|
||||
/// A stack of T values to hand out. These are used when a Pool is
|
||||
/// accessed by a thread that didn't create it.
|
||||
stack: Mutex<Vec<Box<T>>>,
|
||||
/// A function to create more T values when stack is empty and a caller
|
||||
/// has requested a T.
|
||||
create: CreateFn<T>,
|
||||
/// The ID of the thread that owns this pool. The owner is the thread
|
||||
/// that makes the first call to 'get'. When the owner calls 'get', it
|
||||
/// gets 'owner_val' directly instead of returning a T from 'stack'.
|
||||
/// See comments elsewhere for details, but this is intended to be an
|
||||
/// optimization for the common case that makes getting a T faster.
|
||||
///
|
||||
/// It is initialized to a value of zero (an impossible thread ID) as a
|
||||
/// sentinel to indicate that it is unowned.
|
||||
owner: AtomicUsize,
|
||||
/// A value to return when the caller is in the same thread that created
|
||||
/// the Pool.
|
||||
owner_val: T,
|
||||
}
|
||||
|
||||
// SAFETY: Since we want to use a Pool from multiple threads simultaneously
|
||||
// behind an Arc, we need for it to be Sync. In cases where T is sync, Pool<T>
|
||||
// would be Sync. However, since we use a Pool to store mutable scratch space,
|
||||
// we wind up using a T that has interior mutability and is thus itself not
|
||||
// Sync. So what we *really* want is for our Pool<T> to be Sync even when T is
|
||||
// not Sync (but is at least Send).
|
||||
//
|
||||
// The only non-sync aspect of a Pool is its 'owner_val' field, which is used
|
||||
// to implement faster access to a pool value in the common case of a pool
|
||||
// being accessed in the same thread in which it was created. The 'stack' field
|
||||
// is also shared, but a Mutex<T> where T: Send is already Sync. So we only
|
||||
// need to worry about 'owner_val'.
|
||||
//
|
||||
// The key is to guarantee that 'owner_val' can only ever be accessed from one
|
||||
// thread. In our implementation below, we guarantee this by only returning the
|
||||
// 'owner_val' when the ID of the current thread matches the ID of the thread
|
||||
// that created the Pool. Since this can only ever be one thread, it follows
|
||||
// that only one thread can access 'owner_val' at any point in time. Thus, it
|
||||
// is safe to declare that Pool<T> is Sync when T is Send.
|
||||
//
|
||||
// NOTE: It would also be possible to make the owning thread be the *first*
|
||||
// thread that tries to get a value out of a Pool. However, the current
|
||||
// implementation is a little simpler and it's not clear if making the first
|
||||
// thread (rather than the creating thread) is meaningfully better.
|
||||
//
|
||||
// If there is a way to achieve our performance goals using safe code, then
|
||||
// I would very much welcome a patch. As it stands, the implementation below
|
||||
// tries to balance safety with performance. The case where a Regex is used
|
||||
// from multiple threads simultaneously will suffer a bit since getting a cache
|
||||
// will require unlocking a mutex.
|
||||
unsafe impl<T: Send> Sync for Pool<T> {}
|
||||
|
||||
impl<T: ::std::fmt::Debug> ::std::fmt::Debug for Pool<T> {
|
||||
fn fmt(&self, f: &mut ::std::fmt::Formatter<'_>) -> ::std::fmt::Result {
|
||||
f.debug_struct("Pool")
|
||||
.field("stack", &self.stack)
|
||||
.field("owner", &self.owner)
|
||||
.field("owner_val", &self.owner_val)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
/// A guard that is returned when a caller requests a value from the pool.
|
||||
///
|
||||
/// The purpose of the guard is to use RAII to automatically put the value back
|
||||
/// in the pool once it's dropped.
|
||||
#[derive(Debug)]
|
||||
pub struct PoolGuard<'a, T: Send> {
|
||||
/// The pool that this guard is attached to.
|
||||
pool: &'a Pool<T>,
|
||||
/// This is None when the guard represents the special "owned" value. In
|
||||
/// which case, the value is retrieved from 'pool.owner_val'.
|
||||
value: Option<Box<T>>,
|
||||
}
|
||||
|
||||
impl<T: Send> Pool<T> {
|
||||
/// Create a new pool. The given closure is used to create values in the
|
||||
/// pool when necessary.
|
||||
pub fn new(create: CreateFn<T>) -> Pool<T> {
|
||||
let owner = AtomicUsize::new(0);
|
||||
let owner_val = create();
|
||||
Pool { stack: Mutex::new(vec![]), create, owner, owner_val }
|
||||
}
|
||||
|
||||
/// Get a value from the pool. The caller is guaranteed to have exclusive
|
||||
/// access to the given value.
|
||||
///
|
||||
/// Note that there is no guarantee provided about which value in the
|
||||
/// pool is returned. That is, calling get, dropping the guard (causing
|
||||
/// the value to go back into the pool) and then calling get again is NOT
|
||||
/// guaranteed to return the same value received in the first get call.
|
||||
#[cfg_attr(feature = "perf-inline", inline(always))]
|
||||
pub fn get(&self) -> PoolGuard<'_, T> {
|
||||
// Our fast path checks if the caller is the thread that "owns" this
|
||||
// pool. Or stated differently, whether it is the first thread that
|
||||
// tried to extract a value from the pool. If it is, then we can return
|
||||
// a T to the caller without going through a mutex.
|
||||
//
|
||||
// SAFETY: We must guarantee that only one thread gets access to this
|
||||
// value. Since a thread is uniquely identified by the THREAD_ID thread
|
||||
// local, it follows that if the caller's thread ID is equal to the
|
||||
// owner, then only one thread may receive this value.
|
||||
let caller = THREAD_ID.with(|id| *id);
|
||||
let owner = self.owner.load(Ordering::Relaxed);
|
||||
if caller == owner {
|
||||
return self.guard_owned();
|
||||
}
|
||||
self.get_slow(caller, owner)
|
||||
}
|
||||
|
||||
/// This is the "slow" version that goes through a mutex to pop an
|
||||
/// allocated value off a stack to return to the caller. (Or, if the stack
|
||||
/// is empty, a new value is created.)
|
||||
///
|
||||
/// If the pool has no owner, then this will set the owner.
|
||||
#[cold]
|
||||
fn get_slow(&self, caller: usize, owner: usize) -> PoolGuard<'_, T> {
|
||||
use std::sync::atomic::Ordering::Relaxed;
|
||||
|
||||
if owner == 0 {
|
||||
// The sentinel 0 value means this pool is not yet owned. We
|
||||
// try to atomically set the owner. If we do, then this thread
|
||||
// becomes the owner and we can return a guard that represents
|
||||
// the special T for the owner.
|
||||
let res = self.owner.compare_exchange(0, caller, Relaxed, Relaxed);
|
||||
if res.is_ok() {
|
||||
return self.guard_owned();
|
||||
}
|
||||
}
|
||||
let mut stack = self.stack.lock().unwrap();
|
||||
let value = match stack.pop() {
|
||||
None => Box::new((self.create)()),
|
||||
Some(value) => value,
|
||||
};
|
||||
self.guard_stack(value)
|
||||
}
|
||||
|
||||
/// Puts a value back into the pool. Callers don't need to call this. Once
|
||||
/// the guard that's returned by 'get' is dropped, it is put back into the
|
||||
/// pool automatically.
|
||||
fn put(&self, value: Box<T>) {
|
||||
let mut stack = self.stack.lock().unwrap();
|
||||
stack.push(value);
|
||||
}
|
||||
|
||||
/// Create a guard that represents the special owned T.
|
||||
fn guard_owned(&self) -> PoolGuard<'_, T> {
|
||||
PoolGuard { pool: self, value: None }
|
||||
}
|
||||
|
||||
/// Create a guard that contains a value from the pool's stack.
|
||||
fn guard_stack(&self, value: Box<T>) -> PoolGuard<'_, T> {
|
||||
PoolGuard { pool: self, value: Some(value) }
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, T: Send> PoolGuard<'a, T> {
|
||||
/// Return the underlying value.
|
||||
pub fn value(&self) -> &T {
|
||||
match self.value {
|
||||
None => &self.pool.owner_val,
|
||||
Some(ref v) => &**v,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, T: Send> Drop for PoolGuard<'a, T> {
|
||||
#[cfg_attr(feature = "perf-inline", inline(always))]
|
||||
fn drop(&mut self) {
|
||||
if let Some(value) = self.value.take() {
|
||||
self.pool.put(value);
|
||||
}
|
||||
}
|
||||
}
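// Illustrative usage sketch (not part of this change; the Vec payload is just
// an example): the guard hands the value back to the pool when it is dropped.
//
//     fn demo() {
//         let pool: Pool<Vec<u8>> =
//             Pool::new(Box::new(|| Vec::<u8>::with_capacity(1024)));
//         {
//             let guard = pool.get();
//             let _cap = guard.value().capacity(); // exclusive access here
//         } // guard dropped: the Vec returns to the pool (or stays in owner_val)
//         let again = pool.get(); // reuses a value instead of reallocating
//         assert!(again.value().capacity() >= 1024);
//     }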
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::panic::{RefUnwindSafe, UnwindSafe};
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn oibits() {
|
||||
use crate::exec::ProgramCache;
|
||||
|
||||
fn has_oibits<T: Send + Sync + UnwindSafe + RefUnwindSafe>() {}
|
||||
has_oibits::<Pool<ProgramCache>>();
|
||||
}
|
||||
|
||||
// Tests that Pool implements the "single owner" optimization. That is, the
|
||||
// thread that first accesses the pool gets its own copy, while all other
|
||||
// threads get distinct copies.
|
||||
#[test]
|
||||
fn thread_owner_optimization() {
|
||||
use std::cell::RefCell;
|
||||
use std::sync::Arc;
|
||||
|
||||
let pool: Arc<Pool<RefCell<Vec<char>>>> =
|
||||
Arc::new(Pool::new(Box::new(|| RefCell::new(vec!['a']))));
|
||||
pool.get().value().borrow_mut().push('x');
|
||||
|
||||
let pool1 = pool.clone();
|
||||
let t1 = std::thread::spawn(move || {
|
||||
let guard = pool1.get();
|
||||
let v = guard.value();
|
||||
v.borrow_mut().push('y');
|
||||
});
|
||||
|
||||
let pool2 = pool.clone();
|
||||
let t2 = std::thread::spawn(move || {
|
||||
let guard = pool2.get();
|
||||
let v = guard.value();
|
||||
v.borrow_mut().push('z');
|
||||
});
|
||||
|
||||
t1.join().unwrap();
|
||||
t2.join().unwrap();
|
||||
|
||||
// If we didn't implement the single owner optimization, then one of
|
||||
// the threads above is likely to have mutated the [a, x] vec that
|
||||
// we stuffed in the pool before spawning the threads. But since
|
||||
// neither thread was first to access the pool, and because of the
|
||||
// optimization, we should be guaranteed that neither thread mutates
|
||||
// the special owned pool value.
|
||||
//
|
||||
// (Technically this is an implementation detail and not a contract of
|
||||
// Pool's API.)
|
||||
assert_eq!(vec!['a', 'x'], *pool.get().value().borrow());
|
||||
}
|
||||
}
@ -6,8 +6,8 @@ use std::ops::Deref;
|
|||
use std::slice;
|
||||
use std::sync::Arc;
|
||||
|
||||
use input::Char;
|
||||
use literal::LiteralSearcher;
|
||||
use crate::input::Char;
|
||||
use crate::literal::LiteralSearcher;
|
||||
|
||||
/// `InstPtr` represents the index of an instruction in a regex program.
|
||||
pub type InstPtr = usize;
|
||||
|
@ -168,7 +168,7 @@ impl Deref for Program {
|
|||
}
|
||||
|
||||
impl fmt::Debug for Program {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
use self::Inst::*;
|
||||
|
||||
fn with_goto(cur: usize, goto: usize, fmtd: String) -> String {
|
||||
|
@ -259,8 +259,8 @@ impl<'a> IntoIterator for &'a Program {
|
|||
///
|
||||
/// Other than the benefit of moving invariants into the type system, another
|
||||
/// benefit is the decreased size. If we remove the `Char` and `Ranges`
|
||||
/// instructions from the `Inst` enum, then its size shrinks from 40 bytes to
|
||||
/// 24 bytes. (This is because of the removal of a `Vec` in the `Ranges`
|
||||
/// instructions from the `Inst` enum, then its size shrinks from 32 bytes to
|
||||
/// 24 bytes. (This is because of the removal of a `Box<[]>` in the `Ranges`
|
||||
/// variant.) Given that byte based machines are typically much bigger than
|
||||
/// their Unicode analogues (because they can decode UTF-8 directly), this ends
|
||||
/// up being a pretty significant savings.
|
||||
|
@ -374,7 +374,7 @@ pub struct InstRanges {
|
|||
/// succeeds.
|
||||
pub goto: InstPtr,
|
||||
/// The set of Unicode scalar value ranges to test.
|
||||
pub ranges: Vec<(char, char)>,
|
||||
pub ranges: Box<[(char, char)]>,
|
||||
}
|
||||
|
||||
impl InstRanges {
|
||||
|
@ -432,3 +432,16 @@ impl InstBytes {
|
|||
self.start <= byte && byte <= self.end
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
#[test]
|
||||
#[cfg(target_pointer_width = "64")]
|
||||
fn test_size_of_inst() {
|
||||
use std::mem::size_of;
|
||||
|
||||
use super::Inst;
|
||||
|
||||
assert_eq!(32, size_of::<Inst>());
|
||||
}
|
||||
}
@ -37,16 +37,17 @@ macro_rules! define_builder {
|
|||
($name:ident, $regex_mod:ident, $only_utf8:expr) => {
|
||||
pub mod $name {
|
||||
use super::RegexOptions;
|
||||
use error::Error;
|
||||
use exec::ExecBuilder;
|
||||
use crate::error::Error;
|
||||
use crate::exec::ExecBuilder;
|
||||
|
||||
use $regex_mod::Regex;
|
||||
use crate::$regex_mod::Regex;
|
||||
|
||||
/// A configurable builder for a regular expression.
|
||||
///
|
||||
/// A builder can be used to configure how the regex is built, for example, by
|
||||
/// setting the default flags (which can be overridden in the expression
|
||||
/// itself) or setting various limits.
|
||||
#[derive(Debug)]
|
||||
pub struct RegexBuilder(RegexOptions);
|
||||
|
||||
impl RegexBuilder {
|
||||
|
@ -234,16 +235,17 @@ macro_rules! define_set_builder {
|
|||
($name:ident, $regex_mod:ident, $only_utf8:expr) => {
|
||||
pub mod $name {
|
||||
use super::RegexOptions;
|
||||
use error::Error;
|
||||
use exec::ExecBuilder;
|
||||
use crate::error::Error;
|
||||
use crate::exec::ExecBuilder;
|
||||
|
||||
use re_set::$regex_mod::RegexSet;
|
||||
use crate::re_set::$regex_mod::RegexSet;
|
||||
|
||||
/// A configurable builder for a set of regular expressions.
|
||||
///
|
||||
/// A builder can be used to configure how the regexes are built, for example,
|
||||
/// by setting the default flags (which can be overridden in the expression
|
||||
/// itself) or setting various limits.
|
||||
#[derive(Debug)]
|
||||
pub struct RegexSetBuilder(RegexOptions);
|
||||
|
||||
impl RegexSetBuilder {
|
@ -1,17 +1,18 @@
|
|||
use std::borrow::Cow;
|
||||
use std::collections::HashMap;
|
||||
use std::fmt;
|
||||
use std::iter::FusedIterator;
|
||||
use std::ops::{Index, Range};
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
|
||||
use find_byte::find_byte;
|
||||
use crate::find_byte::find_byte;
|
||||
|
||||
use error::Error;
|
||||
use exec::{Exec, ExecNoSync};
|
||||
use expand::expand_bytes;
|
||||
use re_builder::bytes::RegexBuilder;
|
||||
use re_trait::{self, RegularExpression, SubCapturesPosIter};
|
||||
use crate::error::Error;
|
||||
use crate::exec::{Exec, ExecNoSync};
|
||||
use crate::expand::expand_bytes;
|
||||
use crate::re_builder::bytes::RegexBuilder;
|
||||
use crate::re_trait::{self, RegularExpression, SubCapturesPosIter};
|
||||
|
||||
/// Match represents a single match of a regex in a haystack.
|
||||
///
|
||||
|
@ -78,14 +79,14 @@ pub struct Regex(Exec);
|
|||
|
||||
impl fmt::Display for Regex {
|
||||
/// Shows the original regular expression.
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
write!(f, "{}", self.as_str())
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for Regex {
|
||||
/// Shows the original regular expression.
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
fmt::Display::fmt(self, f)
|
||||
}
|
||||
}
|
||||
|
@ -119,7 +120,8 @@ impl Regex {
|
|||
RegexBuilder::new(re).build()
|
||||
}
|
||||
|
||||
/// Returns true if and only if the regex matches the string given.
|
||||
/// Returns true if and only if there is a match for the regex in the
|
||||
/// string given.
|
||||
///
|
||||
/// It is recommended to use this method if all you need to do is test
|
||||
/// a match, since the underlying matching engine may be able to do less
|
||||
|
@ -131,7 +133,7 @@ impl Regex {
|
|||
/// bytes:
|
||||
///
|
||||
/// ```rust
|
||||
/// # extern crate regex; use regex::bytes::Regex;
|
||||
/// # use regex::bytes::Regex;
|
||||
/// # fn main() {
|
||||
/// let text = b"I categorically deny having triskaidekaphobia.";
|
||||
/// assert!(Regex::new(r"\b\w{13}\b").unwrap().is_match(text));
|
||||
|
@ -154,7 +156,7 @@ impl Regex {
|
|||
/// ASCII word bytes:
|
||||
///
|
||||
/// ```rust
|
||||
/// # extern crate regex; use regex::bytes::Regex;
|
||||
/// # use regex::bytes::Regex;
|
||||
/// # fn main() {
|
||||
/// let text = b"I categorically deny having triskaidekaphobia.";
|
||||
/// let mat = Regex::new(r"\b\w{13}\b").unwrap().find(text).unwrap();
|
||||
|
@ -175,7 +177,7 @@ impl Regex {
|
|||
/// word bytes:
|
||||
///
|
||||
/// ```rust
|
||||
/// # extern crate regex; use regex::bytes::Regex;
|
||||
/// # use regex::bytes::Regex;
|
||||
/// # fn main() {
|
||||
/// let text = b"Retroactively relinquishing remunerations is reprehensible.";
|
||||
/// for mat in Regex::new(r"\b\w{13}\b").unwrap().find_iter(text) {
|
||||
|
@ -203,7 +205,7 @@ impl Regex {
|
|||
/// year separately.
|
||||
///
|
||||
/// ```rust
|
||||
/// # extern crate regex; use regex::bytes::Regex;
|
||||
/// # use regex::bytes::Regex;
|
||||
/// # fn main() {
|
||||
/// let re = Regex::new(r"'([^']+)'\s+\((\d{4})\)").unwrap();
|
||||
/// let text = b"Not my favorite movie: 'Citizen Kane' (1941).";
|
||||
|
@ -225,7 +227,7 @@ impl Regex {
|
|||
/// We can make this example a bit clearer by using *named* capture groups:
|
||||
///
|
||||
/// ```rust
|
||||
/// # extern crate regex; use regex::bytes::Regex;
|
||||
/// # use regex::bytes::Regex;
|
||||
/// # fn main() {
|
||||
/// let re = Regex::new(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)")
|
||||
/// .unwrap();
|
||||
|
@ -269,7 +271,7 @@ impl Regex {
|
|||
/// some text, where the movie is formatted like "'Title' (xxxx)":
|
||||
///
|
||||
/// ```rust
|
||||
/// # extern crate regex; use std::str; use regex::bytes::Regex;
|
||||
/// # use std::str; use regex::bytes::Regex;
|
||||
/// # fn main() {
|
||||
/// let re = Regex::new(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)")
|
||||
/// .unwrap();
|
||||
|
@ -303,7 +305,7 @@ impl Regex {
|
|||
/// To split a string delimited by arbitrary amounts of spaces or tabs:
|
||||
///
|
||||
/// ```rust
|
||||
/// # extern crate regex; use regex::bytes::Regex;
|
||||
/// # use regex::bytes::Regex;
|
||||
/// # fn main() {
|
||||
/// let re = Regex::new(r"[ \t]+").unwrap();
|
||||
/// let fields: Vec<&[u8]> = re.split(b"a b \t c\td e").collect();
|
||||
|
@ -329,7 +331,7 @@ impl Regex {
|
|||
/// Get the first two words in some text:
|
||||
///
|
||||
/// ```rust
|
||||
/// # extern crate regex; use regex::bytes::Regex;
|
||||
/// # use regex::bytes::Regex;
|
||||
/// # fn main() {
|
||||
/// let re = Regex::new(r"\W+").unwrap();
|
||||
/// let fields: Vec<&[u8]> = re.splitn(b"Hey! How are you?", 3).collect();
|
||||
|
@ -377,7 +379,7 @@ impl Regex {
|
|||
/// In typical usage, this can just be a normal byte string:
|
||||
///
|
||||
/// ```rust
|
||||
/// # extern crate regex; use regex::bytes::Regex;
|
||||
/// # use regex::bytes::Regex;
|
||||
/// # fn main() {
|
||||
/// let re = Regex::new("[^01]+").unwrap();
|
||||
/// assert_eq!(re.replace(b"1078910", &b""[..]), &b"1010"[..]);
|
||||
|
@ -390,7 +392,7 @@ impl Regex {
|
|||
/// group matches easily:
|
||||
///
|
||||
/// ```rust
|
||||
/// # extern crate regex; use regex::bytes::Regex;
|
||||
/// # use regex::bytes::Regex;
|
||||
/// # use regex::bytes::Captures; fn main() {
|
||||
/// let re = Regex::new(r"([^,\s]+),\s+(\S+)").unwrap();
|
||||
/// let result = re.replace(b"Springsteen, Bruce", |caps: &Captures| {
|
||||
|
@ -409,7 +411,7 @@ impl Regex {
|
|||
/// with named capture groups:
|
||||
///
|
||||
/// ```rust
|
||||
/// # extern crate regex; use regex::bytes::Regex;
|
||||
/// # use regex::bytes::Regex;
|
||||
/// # fn main() {
|
||||
/// let re = Regex::new(r"(?P<last>[^,\s]+),\s+(?P<first>\S+)").unwrap();
|
||||
/// let result = re.replace(b"Springsteen, Bruce", &b"$first $last"[..]);
|
||||
|
@ -426,7 +428,7 @@ impl Regex {
|
|||
/// underscore:
|
||||
///
|
||||
/// ```rust
|
||||
/// # extern crate regex; use regex::bytes::Regex;
|
||||
/// # use regex::bytes::Regex;
|
||||
/// # fn main() {
|
||||
/// let re = Regex::new(r"(?P<first>\w+)\s+(?P<second>\w+)").unwrap();
|
||||
/// let result = re.replace(b"deep fried", &b"${first}_$second"[..]);
|
||||
|
@ -443,7 +445,7 @@ impl Regex {
|
|||
/// byte string with `NoExpand`:
|
||||
///
|
||||
/// ```rust
|
||||
/// # extern crate regex; use regex::bytes::Regex;
|
||||
/// # use regex::bytes::Regex;
|
||||
/// # fn main() {
|
||||
/// use regex::bytes::NoExpand;
|
||||
///
|
||||
|
@ -544,7 +546,7 @@ impl Regex {
|
|||
/// `a`.
|
||||
///
|
||||
/// ```rust
|
||||
/// # extern crate regex; use regex::bytes::Regex;
|
||||
/// # use regex::bytes::Regex;
|
||||
/// # fn main() {
|
||||
/// let text = b"aaaaa";
|
||||
/// let pos = Regex::new(r"a+").unwrap().shortest_match(text);
|
||||
|
@ -656,7 +658,7 @@ impl Regex {
|
|||
}
|
||||
|
||||
/// Returns an iterator over the capture names.
|
||||
pub fn capture_names(&self) -> CaptureNames {
|
||||
pub fn capture_names(&self) -> CaptureNames<'_> {
|
||||
CaptureNames(self.0.capture_names().iter())
|
||||
}
|
||||
|
||||
|
@ -689,6 +691,7 @@ impl Regex {
|
|||
///
|
||||
/// `'r` is the lifetime of the compiled regular expression and `'t` is the
|
||||
/// lifetime of the matched byte string.
|
||||
#[derive(Debug)]
|
||||
pub struct Matches<'r, 't>(re_trait::Matches<'t, ExecNoSync<'r>>);
|
||||
|
||||
impl<'r, 't> Iterator for Matches<'r, 't> {
|
||||
|
@ -700,6 +703,8 @@ impl<'r, 't> Iterator for Matches<'r, 't> {
|
|||
}
|
||||
}
|
||||
|
||||
impl<'r, 't> FusedIterator for Matches<'r, 't> {}
|
||||
|
||||
/// An iterator that yields all non-overlapping capture groups matching a
|
||||
/// particular regular expression.
|
||||
///
|
||||
|
@ -707,6 +712,7 @@ impl<'r, 't> Iterator for Matches<'r, 't> {
|
|||
///
|
||||
/// `'r` is the lifetime of the compiled regular expression and `'t` is the
|
||||
/// lifetime of the matched byte string.
|
||||
#[derive(Debug)]
|
||||
pub struct CaptureMatches<'r, 't>(
|
||||
re_trait::CaptureMatches<'t, ExecNoSync<'r>>,
|
||||
);
|
||||
|
@ -723,10 +729,13 @@ impl<'r, 't> Iterator for CaptureMatches<'r, 't> {
|
|||
}
|
||||
}
|
||||
|
||||
impl<'r, 't> FusedIterator for CaptureMatches<'r, 't> {}
|
||||
|
||||
/// Yields all substrings delimited by a regular expression match.
|
||||
///
|
||||
/// `'r` is the lifetime of the compiled regular expression and `'t` is the
|
||||
/// lifetime of the byte string being split.
|
||||
#[derive(Debug)]
|
||||
pub struct Split<'r, 't> {
|
||||
finder: Matches<'r, 't>,
|
||||
last: usize,
|
||||
|
@ -756,12 +765,15 @@ impl<'r, 't> Iterator for Split<'r, 't> {
|
|||
}
|
||||
}
|
||||
|
||||
impl<'r, 't> FusedIterator for Split<'r, 't> {}
|
||||
|
||||
/// Yields at most `N` substrings delimited by a regular expression match.
|
||||
///
|
||||
/// The last substring will be whatever remains after splitting.
|
||||
///
|
||||
/// `'r` is the lifetime of the compiled regular expression and `'t` is the
|
||||
/// lifetime of the byte string being split.
|
||||
#[derive(Debug)]
|
||||
pub struct SplitN<'r, 't> {
|
||||
splits: Split<'r, 't>,
|
||||
n: usize,
|
||||
|
@ -789,14 +801,21 @@ impl<'r, 't> Iterator for SplitN<'r, 't> {
|
|||
Some(&text[self.splits.last..])
|
||||
}
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> (usize, Option<usize>) {
|
||||
(0, Some(self.n))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'r, 't> FusedIterator for SplitN<'r, 't> {}
|
||||
|
||||
/// An iterator over the names of all possible captures.
|
||||
///
|
||||
/// `None` indicates an unnamed capture; the first element (capture 0, the
|
||||
/// whole matched region) is always unnamed.
|
||||
///
|
||||
/// `'r` is the lifetime of the compiled regular expression.
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct CaptureNames<'r>(::std::slice::Iter<'r, Option<String>>);
|
||||
|
||||
impl<'r> Iterator for CaptureNames<'r> {
|
||||
|
@ -812,8 +831,16 @@ impl<'r> Iterator for CaptureNames<'r> {
|
|||
fn size_hint(&self) -> (usize, Option<usize>) {
|
||||
self.0.size_hint()
|
||||
}
|
||||
|
||||
fn count(self) -> usize {
|
||||
self.0.count()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'r> ExactSizeIterator for CaptureNames<'r> {}
|
||||
|
||||
impl<'r> FusedIterator for CaptureNames<'r> {}
|
||||
|
||||
/// CaptureLocations is a low level representation of the raw offsets of each
|
||||
/// submatch.
|
||||
///
|
||||
|
@ -930,17 +957,22 @@ impl<'t> Captures<'t> {
|
|||
/// Expands all instances of `$name` in `replacement` to the corresponding
|
||||
/// capture group `name`, and writes them to the `dst` buffer given.
|
||||
///
|
||||
/// `name` may be an integer corresponding to the index of the
|
||||
/// capture group (counted by order of opening parenthesis where `0` is the
|
||||
/// `name` may be an integer corresponding to the index of the capture
|
||||
/// group (counted by order of opening parenthesis where `0` is the
|
||||
/// entire match) or it can be a name (consisting of letters, digits or
|
||||
/// underscores) corresponding to a named capture group.
|
||||
///
|
||||
/// If `name` isn't a valid capture group (whether the name doesn't exist
|
||||
/// or isn't a valid index), then it is replaced with the empty string.
|
||||
///
|
||||
/// The longest possible name is used. e.g., `$1a` looks up the capture
|
||||
/// group named `1a` and not the capture group at index `1`. To exert more
|
||||
/// precise control over the name, use braces, e.g., `${1}a`.
|
||||
/// The longest possible name consisting of the characters `[_0-9A-Za-z]`
|
||||
/// is used. e.g., `$1a` looks up the capture group named `1a` and not the
|
||||
/// capture group at index `1`. To exert more precise control over the
|
||||
/// name, or to refer to a capture group name that uses characters outside
|
||||
/// of `[_0-9A-Za-z]`, use braces, e.g., `${1}a` or `${foo[bar].baz}`. When
|
||||
/// using braces, any sequence of valid UTF-8 bytes is permitted. If the
|
||||
/// sequence does not refer to a capture group name in the corresponding
|
||||
/// regex, then it is replaced with an empty string.
|
||||
///
|
||||
/// To write a literal `$` use `$$`.
|
||||
pub fn expand(&self, replacement: &[u8], dst: &mut Vec<u8>) {
|
||||
|
@ -958,15 +990,15 @@ impl<'t> Captures<'t> {
|
|||
}
|
||||
|
||||
impl<'t> fmt::Debug for Captures<'t> {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
f.debug_tuple("Captures").field(&CapturesDebug(self)).finish()
|
||||
}
|
||||
}
|
||||
|
||||
struct CapturesDebug<'c, 't: 'c>(&'c Captures<'t>);
|
||||
struct CapturesDebug<'c, 't>(&'c Captures<'t>);
|
||||
|
||||
impl<'c, 't> fmt::Debug for CapturesDebug<'c, 't> {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
fn escape_bytes(bytes: &[u8]) -> String {
|
||||
let mut s = String::new();
|
||||
for &b in bytes {
|
||||
|
@ -1051,7 +1083,8 @@ impl<'t, 'i> Index<&'i str> for Captures<'t> {
|
|||
///
|
||||
/// The lifetime `'c` corresponds to the lifetime of the `Captures` value, and
|
||||
/// the lifetime `'t` corresponds to the originally matched text.
|
||||
pub struct SubCaptureMatches<'c, 't: 'c> {
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct SubCaptureMatches<'c, 't> {
|
||||
caps: &'c Captures<'t>,
|
||||
it: SubCapturesPosIter<'c>,
|
||||
}
|
||||
|
@ -1066,13 +1099,15 @@ impl<'c, 't> Iterator for SubCaptureMatches<'c, 't> {
|
|||
}
|
||||
}
|
||||
|
||||
impl<'c, 't> FusedIterator for SubCaptureMatches<'c, 't> {}
|
||||
|
||||
/// Replacer describes types that can be used to replace matches in a byte
|
||||
/// string.
|
||||
///
|
||||
/// In general, users of this crate shouldn't need to implement this trait,
|
||||
/// since implementations are already provided for `&[u8]` and
|
||||
/// `FnMut(&Captures) -> Vec<u8>` (or any `FnMut(&Captures) -> T`
|
||||
/// where `T: AsRef<[u8]>`), which covers most use cases.
|
||||
/// since implementations are already provided for `&[u8]` along with other
|
||||
/// variants of bytes types and `FnMut(&Captures) -> Vec<u8>` (or any
|
||||
/// `FnMut(&Captures) -> T` where `T: AsRef<[u8]>`), which covers most use cases.
|
||||
pub trait Replacer {
|
||||
/// Appends text to `dst` to replace the current match.
|
||||
///
|
||||
|
@ -1081,7 +1116,7 @@ pub trait Replacer {
|
|||
///
|
||||
/// For example, a no-op replacement would be
|
||||
/// `dst.extend(&caps[0])`.
|
||||
fn replace_append(&mut self, caps: &Captures, dst: &mut Vec<u8>);
|
||||
fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>);
|
||||
|
||||
/// Return a fixed unchanging replacement byte string.
|
||||
///
|
||||
|
@ -1124,10 +1159,10 @@ pub trait Replacer {
|
|||
///
|
||||
/// Returned by [`Replacer::by_ref`](trait.Replacer.html#method.by_ref).
|
||||
#[derive(Debug)]
|
||||
pub struct ReplacerRef<'a, R: ?Sized + 'a>(&'a mut R);
|
||||
pub struct ReplacerRef<'a, R: ?Sized>(&'a mut R);
|
||||
|
||||
impl<'a, R: Replacer + ?Sized + 'a> Replacer for ReplacerRef<'a, R> {
|
||||
fn replace_append(&mut self, caps: &Captures, dst: &mut Vec<u8>) {
|
||||
fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) {
|
||||
self.0.replace_append(caps, dst)
|
||||
}
|
||||
fn no_expansion<'r>(&'r mut self) -> Option<Cow<'r, [u8]>> {
|
||||
|
@ -1136,24 +1171,69 @@ impl<'a, R: Replacer + ?Sized + 'a> Replacer for ReplacerRef<'a, R> {
|
|||
}
|
||||
|
||||
impl<'a> Replacer for &'a [u8] {
|
||||
fn replace_append(&mut self, caps: &Captures, dst: &mut Vec<u8>) {
|
||||
fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) {
|
||||
caps.expand(*self, dst);
|
||||
}
|
||||
|
||||
fn no_expansion(&mut self) -> Option<Cow<[u8]>> {
|
||||
match find_byte(b'$', *self) {
|
||||
Some(_) => None,
|
||||
None => Some(Cow::Borrowed(*self)),
|
||||
}
|
||||
fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> {
|
||||
no_expansion(self)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Replacer for &'a Vec<u8> {
|
||||
fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) {
|
||||
caps.expand(*self, dst);
|
||||
}
|
||||
|
||||
fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> {
|
||||
no_expansion(self)
|
||||
}
|
||||
}
|
||||
|
||||
impl Replacer for Vec<u8> {
|
||||
fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) {
|
||||
caps.expand(self, dst);
|
||||
}
|
||||
|
||||
fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> {
|
||||
no_expansion(self)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Replacer for Cow<'a, [u8]> {
|
||||
fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) {
|
||||
caps.expand(self.as_ref(), dst);
|
||||
}
|
||||
|
||||
fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> {
|
||||
no_expansion(self)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Replacer for &'a Cow<'a, [u8]> {
|
||||
fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) {
|
||||
caps.expand(self.as_ref(), dst);
|
||||
}
|
||||
|
||||
fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> {
|
||||
no_expansion(self)
|
||||
}
|
||||
}
|
||||
|
||||
fn no_expansion<T: AsRef<[u8]>>(t: &T) -> Option<Cow<'_, [u8]>> {
|
||||
let s = t.as_ref();
|
||||
match find_byte(b'$', s) {
|
||||
Some(_) => None,
|
||||
None => Some(Cow::Borrowed(s)),
|
||||
}
|
||||
}
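// Illustrative sketch (not part of this change): the shared helper lets every
// byte-oriented replacer borrow its input verbatim when it cannot possibly
// contain a capture reference.
//
//     let plain: &[u8] = b"world";
//     assert!(no_expansion(&plain).is_some()); // no '$': borrowed as-is
//
//     let templated: &[u8] = b"$first $last";
//     assert!(no_expansion(&templated).is_none()); // must go through expand()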
|
||||
|
||||
impl<F, T> Replacer for F
|
||||
where
|
||||
F: FnMut(&Captures) -> T,
|
||||
F: FnMut(&Captures<'_>) -> T,
|
||||
T: AsRef<[u8]>,
|
||||
{
|
||||
fn replace_append(&mut self, caps: &Captures, dst: &mut Vec<u8>) {
|
||||
fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) {
|
||||
dst.extend_from_slice((*self)(caps).as_ref());
|
||||
}
|
||||
}
|
||||
|
@ -1166,14 +1246,15 @@ where
|
|||
/// and performant (since capture groups don't need to be found).
|
||||
///
|
||||
/// `'t` is the lifetime of the literal text.
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct NoExpand<'t>(pub &'t [u8]);
|
||||
|
||||
impl<'t> Replacer for NoExpand<'t> {
|
||||
fn replace_append(&mut self, _: &Captures, dst: &mut Vec<u8>) {
|
||||
fn replace_append(&mut self, _: &Captures<'_>, dst: &mut Vec<u8>) {
|
||||
dst.extend_from_slice(self.0);
|
||||
}
|
||||
|
||||
fn no_expansion(&mut self) -> Option<Cow<[u8]>> {
|
||||
fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> {
|
||||
Some(Cow::Borrowed(self.0))
|
||||
}
|
||||
}
|
@ -7,10 +7,10 @@ macro_rules! define_set {
|
|||
use std::slice;
|
||||
use std::vec;
|
||||
|
||||
use error::Error;
|
||||
use exec::Exec;
|
||||
use re_builder::$builder_mod::RegexSetBuilder;
|
||||
use re_trait::RegularExpression;
|
||||
use crate::error::Error;
|
||||
use crate::exec::Exec;
|
||||
use crate::re_builder::$builder_mod::RegexSetBuilder;
|
||||
use crate::re_trait::RegularExpression;
|
||||
|
||||
/// Match multiple (possibly overlapping) regular expressions in a single scan.
|
||||
///
|
||||
|
@ -43,7 +43,7 @@ $(#[$doc_regexset_example])*
|
|||
/// Note that it would be possible to adapt the above example to using `Regex`
|
||||
/// with an expression like:
|
||||
///
|
||||
/// ```ignore
|
||||
/// ```text
|
||||
/// (?P<email>[a-z]+@(?P<email_domain>[a-z]+[.](com|org|net)))|(?P<domain>[a-z]+[.](com|org|net))
|
||||
/// ```
|
||||
///
|
||||
|
@ -96,6 +96,19 @@ impl RegexSet {
|
|||
RegexSetBuilder::new(exprs).build()
|
||||
}
|
||||
|
||||
/// Create a new empty regex set.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```rust
|
||||
/// # use regex::RegexSet;
|
||||
/// let set = RegexSet::empty();
|
||||
/// assert!(set.is_empty());
|
||||
/// ```
|
||||
pub fn empty() -> RegexSet {
|
||||
RegexSetBuilder::new(&[""; 0]).build().unwrap()
|
||||
}
|
||||
|
||||
/// Returns true if and only if one of the regexes in this set matches
|
||||
/// the text given.
|
||||
///
|
||||
|
@ -207,6 +220,11 @@ impl RegexSet {
|
|||
self.0.regex_strings().len()
|
||||
}
|
||||
|
||||
/// Returns `true` if this set contains no regular expressions.
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.0.regex_strings().is_empty()
|
||||
}
|
||||
|
||||
/// Returns the patterns that this set will match on.
|
||||
///
|
||||
/// This function can be used to determine the pattern for a match. The
|
||||
|
@ -274,7 +292,7 @@ impl SetMatches {
|
|||
/// This will always produce matches in ascending order of index, where
|
||||
/// the index corresponds to the index of the regex that matched with
|
||||
/// respect to its position when initially building the set.
|
||||
pub fn iter(&self) -> SetMatchesIter {
|
||||
pub fn iter(&self) -> SetMatchesIter<'_> {
|
||||
SetMatchesIter((&*self.matches).into_iter().enumerate())
|
||||
}
|
||||
}
|
||||
|
@ -302,6 +320,7 @@ impl<'a> IntoIterator for &'a SetMatches {
|
|||
/// This will always produce matches in ascending order of index, where the
|
||||
/// index corresponds to the index of the regex that matched with respect to
|
||||
/// its position when initially building the set.
|
||||
#[derive(Debug)]
|
||||
pub struct SetMatchesIntoIter(iter::Enumerate<vec::IntoIter<bool>>);
|
||||
|
||||
impl Iterator for SetMatchesIntoIter {
|
||||
|
@ -334,6 +353,8 @@ impl DoubleEndedIterator for SetMatchesIntoIter {
|
|||
}
|
||||
}
|
||||
|
||||
impl iter::FusedIterator for SetMatchesIntoIter {}
|
||||
|
||||
/// A borrowed iterator over the set of matches from a regex set.
|
||||
///
|
||||
/// The lifetime `'a` refers to the lifetime of a `SetMatches` value.
|
||||
|
@ -341,7 +362,7 @@ impl DoubleEndedIterator for SetMatchesIntoIter {
|
|||
/// This will always produce matches in ascending order of index, where the
|
||||
/// index corresponds to the index of the regex that matched with respect to
|
||||
/// its position when initially building the set.
|
||||
#[derive(Clone)]
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct SetMatchesIter<'a>(iter::Enumerate<slice::Iter<'a, bool>>);
|
||||
|
||||
impl<'a> Iterator for SetMatchesIter<'a> {
|
||||
|
@ -374,6 +395,8 @@ impl<'a> DoubleEndedIterator for SetMatchesIter<'a> {
|
|||
}
|
||||
}
|
||||
|
||||
impl<'a> iter::FusedIterator for SetMatchesIter<'a> {}
|
||||
|
||||
#[doc(hidden)]
|
||||
impl From<Exec> for RegexSet {
|
||||
fn from(exec: Exec) -> Self {
|
||||
|
@ -382,7 +405,7 @@ impl From<Exec> for RegexSet {
|
|||
}
|
||||
|
||||
impl fmt::Debug for RegexSet {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
write!(f, "RegexSet({:?})", self.0.regex_strings())
|
||||
}
|
||||
}
@ -1,3 +1,6 @@
|
|||
use std::fmt;
|
||||
use std::iter::FusedIterator;
|
||||
|
||||
/// Slot is a single saved capture location. Note that there are two slots for
|
||||
/// every capture in a regular expression (one slot each for the start and end
|
||||
/// of the capture).
|
||||
|
@ -27,7 +30,7 @@ impl Locations {
|
|||
/// Creates an iterator of all the capture group positions in order of
|
||||
/// appearance in the regular expression. Positions are byte indices
|
||||
/// in terms of the original string matched.
|
||||
pub fn iter(&self) -> SubCapturesPosIter {
|
||||
pub fn iter(&self) -> SubCapturesPosIter<'_> {
|
||||
SubCapturesPosIter { idx: 0, locs: self }
|
||||
}
|
||||
|
||||
|
@ -51,6 +54,7 @@ impl Locations {
|
|||
/// Positions are byte indices in terms of the original string matched.
|
||||
///
|
||||
/// `'c` is the lifetime of the captures.
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct SubCapturesPosIter<'c> {
|
||||
idx: usize,
|
||||
locs: &'c Locations,
|
||||
|
@ -72,6 +76,8 @@ impl<'c> Iterator for SubCapturesPosIter<'c> {
|
|||
}
|
||||
}
|
||||
|
||||
impl<'c> FusedIterator for SubCapturesPosIter<'c> {}
|
||||
|
||||
/// `RegularExpression` describes types that can implement regex searching.
|
||||
///
|
||||
/// This trait is my attempt at reducing code duplication and standardizing
|
||||
|
@ -84,9 +90,9 @@ impl<'c> Iterator for SubCapturesPosIter<'c> {
|
|||
/// somewhat reasonable. One particular thing this trait would expose would be
|
||||
/// the ability to start the search of a regex anywhere in a haystack, which
|
||||
/// isn't possible in the current public API.
|
||||
pub trait RegularExpression: Sized {
|
||||
pub trait RegularExpression: Sized + fmt::Debug {
|
||||
/// The type of the haystack.
|
||||
type Text: ?Sized;
|
||||
type Text: ?Sized + fmt::Debug;
|
||||
|
||||
/// The number of capture slots in the compiled regular expression. This is
|
||||
/// always two times the number of capture groups (two slots per group).
|
||||
|
@ -132,18 +138,19 @@ pub trait RegularExpression: Sized {
|
|||
|
||||
/// Returns an iterator over all non-overlapping successive leftmost-first
|
||||
/// matches.
|
||||
fn find_iter(self, text: &Self::Text) -> Matches<Self> {
|
||||
fn find_iter(self, text: &Self::Text) -> Matches<'_, Self> {
|
||||
Matches { re: self, text: text, last_end: 0, last_match: None }
|
||||
}
|
||||
|
||||
/// Returns an iterator over all non-overlapping successive leftmost-first
|
||||
/// matches with captures.
|
||||
fn captures_iter(self, text: &Self::Text) -> CaptureMatches<Self> {
|
||||
fn captures_iter(self, text: &Self::Text) -> CaptureMatches<'_, Self> {
|
||||
CaptureMatches(self.find_iter(text))
|
||||
}
|
||||
}
|
||||
|
||||
/// An iterator over all non-overlapping successive leftmost-first matches.
|
||||
#[derive(Debug)]
|
||||
pub struct Matches<'t, R>
|
||||
where
|
||||
R: RegularExpression,
|
||||
|
@ -204,8 +211,16 @@ where
|
|||
}
|
||||
}
|
||||
|
||||
impl<'t, R> FusedIterator for Matches<'t, R>
|
||||
where
|
||||
R: RegularExpression,
|
||||
R::Text: 't + AsRef<[u8]>,
|
||||
{
|
||||
}
|
||||
|
||||
/// An iterator over all non-overlapping successive leftmost-first matches with
|
||||
/// captures.
|
||||
#[derive(Debug)]
|
||||
pub struct CaptureMatches<'t, R>(Matches<'t, R>)
|
||||
where
|
||||
R: RegularExpression,
|
||||
|
@ -259,3 +274,10 @@ where
|
|||
Some(locs)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t, R> FusedIterator for CaptureMatches<'t, R>
|
||||
where
|
||||
R: RegularExpression,
|
||||
R::Text: 't + AsRef<[u8]>,
|
||||
{
|
||||
}
@ -1,25 +1,25 @@
|
|||
use std::borrow::Cow;
|
||||
use std::collections::HashMap;
|
||||
use std::fmt;
|
||||
use std::iter::FusedIterator;
|
||||
use std::ops::{Index, Range};
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
|
||||
use find_byte::find_byte;
|
||||
use syntax;
|
||||
use crate::find_byte::find_byte;
|
||||
|
||||
use error::Error;
|
||||
use exec::{Exec, ExecNoSyncStr};
|
||||
use expand::expand_str;
|
||||
use re_builder::unicode::RegexBuilder;
|
||||
use re_trait::{self, RegularExpression, SubCapturesPosIter};
|
||||
use crate::error::Error;
|
||||
use crate::exec::{Exec, ExecNoSyncStr};
|
||||
use crate::expand::expand_str;
|
||||
use crate::re_builder::unicode::RegexBuilder;
|
||||
use crate::re_trait::{self, RegularExpression, SubCapturesPosIter};
|
||||
|
||||
/// Escapes all regular expression meta characters in `text`.
|
||||
///
|
||||
/// The string returned may be safely used as a literal in a regular
|
||||
/// expression.
|
||||
pub fn escape(text: &str) -> String {
|
||||
syntax::escape(text)
|
||||
regex_syntax::escape(text)
|
||||
}
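A small illustration of the `escape` helper documented above (the input string is invented; meta characters come back backslash-escaped):

```rust
fn main() {
    assert_eq!(regex::escape("1.5.4+beta?"), r"1\.5\.4\+beta\?");
}
```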
|
||||
|
||||
/// Match represents a single match of a regex in a haystack.
|
||||
|
@ -137,14 +137,14 @@ pub struct Regex(Exec);
|
|||
|
||||
impl fmt::Display for Regex {
|
||||
/// Shows the original regular expression.
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
write!(f, "{}", self.as_str())
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for Regex {
|
||||
/// Shows the original regular expression.
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
fmt::Display::fmt(self, f)
|
||||
}
|
||||
}
|
||||
|
@ -175,7 +175,8 @@ impl Regex {
|
|||
RegexBuilder::new(re).build()
|
||||
}
|
||||
|
||||
/// Returns true if and only if the regex matches the string given.
|
||||
/// Returns true if and only if there is a match for the regex in the
|
||||
/// string given.
|
||||
///
|
||||
/// It is recommended to use this method if all you need to do is test
|
||||
/// a match, since the underlying matching engine may be able to do less
|
||||
|
@ -187,7 +188,7 @@ impl Regex {
|
|||
/// Unicode word characters:
|
||||
///
|
||||
/// ```rust
|
||||
/// # extern crate regex; use regex::Regex;
|
||||
/// # use regex::Regex;
|
||||
/// # fn main() {
|
||||
/// let text = "I categorically deny having triskaidekaphobia.";
|
||||
/// assert!(Regex::new(r"\b\w{13}\b").unwrap().is_match(text));
|
||||
|
@ -210,7 +211,7 @@ impl Regex {
|
|||
/// Unicode word characters:
|
||||
///
|
||||
/// ```rust
|
||||
/// # extern crate regex; use regex::Regex;
|
||||
/// # use regex::Regex;
|
||||
/// # fn main() {
|
||||
/// let text = "I categorically deny having triskaidekaphobia.";
|
||||
/// let mat = Regex::new(r"\b\w{13}\b").unwrap().find(text).unwrap();
|
||||
|
@ -232,7 +233,7 @@ impl Regex {
|
|||
/// word characters:
|
||||
///
|
||||
/// ```rust
|
||||
/// # extern crate regex; use regex::Regex;
|
||||
/// # use regex::Regex;
|
||||
/// # fn main() {
|
||||
/// let text = "Retroactively relinquishing remunerations is reprehensible.";
|
||||
/// for mat in Regex::new(r"\b\w{13}\b").unwrap().find_iter(text) {
|
||||
|
@ -260,7 +261,7 @@ impl Regex {
|
|||
/// year separately.
|
||||
///
|
||||
/// ```rust
|
||||
/// # extern crate regex; use regex::Regex;
|
||||
/// # use regex::Regex;
|
||||
/// # fn main() {
|
||||
/// let re = Regex::new(r"'([^']+)'\s+\((\d{4})\)").unwrap();
|
||||
/// let text = "Not my favorite movie: 'Citizen Kane' (1941).";
|
||||
|
@ -282,7 +283,7 @@ impl Regex {
|
|||
/// We can make this example a bit clearer by using *named* capture groups:
|
||||
///
|
||||
/// ```rust
|
||||
/// # extern crate regex; use regex::Regex;
|
||||
/// # use regex::Regex;
|
||||
/// # fn main() {
|
||||
/// let re = Regex::new(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)")
|
||||
/// .unwrap();
|
||||
|
@ -326,7 +327,7 @@ impl Regex {
|
|||
/// some text, where the movie is formatted like "'Title' (xxxx)":
|
||||
///
|
||||
/// ```rust
|
||||
/// # extern crate regex; use regex::Regex;
|
||||
/// # use regex::Regex;
|
||||
/// # fn main() {
|
||||
/// let re = Regex::new(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)")
|
||||
/// .unwrap();
|
||||
|
@ -359,7 +360,7 @@ impl Regex {
|
|||
/// To split a string delimited by arbitrary amounts of spaces or tabs:
|
||||
///
|
||||
/// ```rust
|
||||
/// # extern crate regex; use regex::Regex;
|
||||
/// # use regex::Regex;
|
||||
/// # fn main() {
|
||||
/// let re = Regex::new(r"[ \t]+").unwrap();
|
||||
/// let fields: Vec<&str> = re.split("a b \t c\td e").collect();
|
||||
|
@ -383,7 +384,7 @@ impl Regex {
|
|||
/// Get the first two words in some text:
|
||||
///
|
||||
/// ```rust
|
||||
/// # extern crate regex; use regex::Regex;
|
||||
/// # use regex::Regex;
|
||||
/// # fn main() {
|
||||
/// let re = Regex::new(r"\W+").unwrap();
|
||||
/// let fields: Vec<&str> = re.splitn("Hey! How are you?", 3).collect();
|
||||
|
@ -430,7 +431,7 @@ impl Regex {
|
|||
/// In typical usage, this can just be a normal string:
|
||||
///
|
||||
/// ```rust
|
||||
/// # extern crate regex; use regex::Regex;
|
||||
/// # use regex::Regex;
|
||||
/// # fn main() {
|
||||
/// let re = Regex::new("[^01]+").unwrap();
|
||||
/// assert_eq!(re.replace("1078910", ""), "1010");
|
||||
|
@ -443,7 +444,7 @@ impl Regex {
|
|||
/// capturing group matches easily:
|
||||
///
|
||||
/// ```rust
|
||||
/// # extern crate regex; use regex::Regex;
|
||||
/// # use regex::Regex;
|
||||
/// # use regex::Captures; fn main() {
|
||||
/// let re = Regex::new(r"([^,\s]+),\s+(\S+)").unwrap();
|
||||
/// let result = re.replace("Springsteen, Bruce", |caps: &Captures| {
|
||||
|
@ -459,7 +460,7 @@ impl Regex {
|
|||
/// with named capture groups:
|
||||
///
|
||||
/// ```rust
|
||||
/// # extern crate regex; use regex::Regex;
|
||||
/// # use regex::Regex;
|
||||
/// # fn main() {
|
||||
/// let re = Regex::new(r"(?P<last>[^,\s]+),\s+(?P<first>\S+)").unwrap();
|
||||
/// let result = re.replace("Springsteen, Bruce", "$first $last");
|
||||
|
@ -476,7 +477,7 @@ impl Regex {
|
|||
/// underscore:
|
||||
///
|
||||
/// ```rust
|
||||
/// # extern crate regex; use regex::Regex;
|
||||
/// # use regex::Regex;
|
||||
/// # fn main() {
|
||||
/// let re = Regex::new(r"(?P<first>\w+)\s+(?P<second>\w+)").unwrap();
|
||||
/// let result = re.replace("deep fried", "${first}_$second");
|
||||
|
@ -493,7 +494,7 @@ impl Regex {
|
|||
/// byte string with `NoExpand`:
|
||||
///
|
||||
/// ```rust
|
||||
/// # extern crate regex; use regex::Regex;
|
||||
/// # use regex::Regex;
|
||||
/// # fn main() {
|
||||
/// use regex::NoExpand;
|
||||
///
|
||||
|
@ -603,7 +604,7 @@ impl Regex {
|
|||
/// `a`.
|
||||
///
|
||||
/// ```rust
|
||||
/// # extern crate regex; use regex::Regex;
|
||||
/// # use regex::Regex;
|
||||
/// # fn main() {
|
||||
/// let text = "aaaaa";
|
||||
/// let pos = Regex::new(r"a+").unwrap().shortest_match(text);
|
||||
|
@ -715,7 +716,7 @@ impl Regex {
|
|||
}
|
||||
|
||||
/// Returns an iterator over the capture names.
|
||||
pub fn capture_names(&self) -> CaptureNames {
|
||||
pub fn capture_names(&self) -> CaptureNames<'_> {
|
||||
CaptureNames(self.0.capture_names().iter())
|
||||
}
|
||||
|
||||
|
@ -746,6 +747,7 @@ impl Regex {
|
|||
/// whole matched region) is always unnamed.
|
||||
///
|
||||
/// `'r` is the lifetime of the compiled regular expression.
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct CaptureNames<'r>(::std::slice::Iter<'r, Option<String>>);
|
||||
|
||||
impl<'r> Iterator for CaptureNames<'r> {
|
||||
|
@ -761,12 +763,21 @@ impl<'r> Iterator for CaptureNames<'r> {
|
|||
fn size_hint(&self) -> (usize, Option<usize>) {
|
||||
self.0.size_hint()
|
||||
}
|
||||
|
||||
fn count(self) -> usize {
|
||||
self.0.count()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'r> ExactSizeIterator for CaptureNames<'r> {}
|
||||
|
||||
impl<'r> FusedIterator for CaptureNames<'r> {}
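A short sketch of `capture_names` together with the newly added `count` (the pattern is borrowed from the doc examples elsewhere in this diff):

```rust
use regex::Regex;

fn main() {
    let re = Regex::new(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)").unwrap();
    // Group 0 (the whole match) is unnamed, so it shows up as `None`.
    let names: Vec<Option<&str>> = re.capture_names().collect();
    assert_eq!(names, vec![None, Some("title"), Some("year")]);
    assert_eq!(re.capture_names().count(), 3);
}
```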
|
||||
|
||||
/// Yields all substrings delimited by a regular expression match.
|
||||
///
|
||||
/// `'r` is the lifetime of the compiled regular expression and `'t` is the
|
||||
/// lifetime of the string being split.
|
||||
#[derive(Debug)]
|
||||
pub struct Split<'r, 't> {
|
||||
finder: Matches<'r, 't>,
|
||||
last: usize,
|
||||
|
@ -796,12 +807,15 @@ impl<'r, 't> Iterator for Split<'r, 't> {
|
|||
}
|
||||
}
|
||||
|
||||
impl<'r, 't> FusedIterator for Split<'r, 't> {}
|
||||
|
||||
/// Yields at most `N` substrings delimited by a regular expression match.
|
||||
///
|
||||
/// The last substring will be whatever remains after splitting.
|
||||
///
|
||||
/// `'r` is the lifetime of the compiled regular expression and `'t` is the
|
||||
/// lifetime of the string being split.
|
||||
#[derive(Debug)]
|
||||
pub struct SplitN<'r, 't> {
|
||||
splits: Split<'r, 't>,
|
||||
n: usize,
|
||||
|
@ -829,8 +843,14 @@ impl<'r, 't> Iterator for SplitN<'r, 't> {
|
|||
Some(&text[self.splits.last..])
|
||||
}
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> (usize, Option<usize>) {
|
||||
(0, Some(self.n))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'r, 't> FusedIterator for SplitN<'r, 't> {}
|
||||
|
||||
/// CaptureLocations is a low level representation of the raw offsets of each
|
||||
/// submatch.
|
||||
///
|
||||
|
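Since these hunks touch the low-level capture plumbing, here is a rough sketch of the public `CaptureLocations` API that sits on top of it (the regex and offsets are invented for the example):

```rust
use regex::Regex;

fn main() {
    let re = Regex::new(r"(\d{4})-(\d{2})").unwrap();
    let mut locs = re.capture_locations();
    re.captures_read(&mut locs, "2021-06").unwrap();
    // Offsets are byte indices into the haystack; slot 0 is the whole match.
    assert_eq!(locs.get(0), Some((0, 7)));
    assert_eq!(locs.get(1), Some((0, 4)));
    assert_eq!(locs.get(2), Some((5, 7)));
}
```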
@ -947,17 +967,22 @@ impl<'t> Captures<'t> {
|
|||
/// Expands all instances of `$name` in `replacement` to the corresponding
|
||||
/// capture group `name`, and writes them to the `dst` buffer given.
|
||||
///
|
||||
/// `name` may be an integer corresponding to the index of the
|
||||
/// capture group (counted by order of opening parenthesis where `0` is the
|
||||
/// `name` may be an integer corresponding to the index of the capture
|
||||
/// group (counted by order of opening parenthesis where `0` is the
|
||||
/// entire match) or it can be a name (consisting of letters, digits or
|
||||
/// underscores) corresponding to a named capture group.
|
||||
///
|
||||
/// If `name` isn't a valid capture group (whether the name doesn't exist
|
||||
/// or isn't a valid index), then it is replaced with the empty string.
|
||||
///
|
||||
/// The longest possible name is used. e.g., `$1a` looks up the capture
|
||||
/// group named `1a` and not the capture group at index `1`. To exert more
|
||||
/// precise control over the name, use braces, e.g., `${1}a`.
|
||||
/// The longest possible name consisting of the characters `[_0-9A-Za-z]`
|
||||
/// is used. e.g., `$1a` looks up the capture group named `1a` and not the
|
||||
/// capture group at index `1`. To exert more precise control over the
|
||||
/// name, or to refer to a capture group name that uses characters outside
|
||||
/// of `[_0-9A-Za-z]`, use braces, e.g., `${1}a` or `${foo[bar].baz}`. When
|
||||
/// using braces, any sequence of characters is permitted. If the sequence
|
||||
/// does not refer to a capture group name in the corresponding regex, then
|
||||
/// it is replaced with an empty string.
|
||||
///
|
||||
/// To write a literal `$` use `$$`.
|
||||
pub fn expand(&self, replacement: &str, dst: &mut String) {
|
||||
|
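A minimal sketch of the brace behaviour described above (the regex and group names are invented):

```rust
use regex::Regex;

fn main() {
    let re = Regex::new(r"(?P<y>\d{4})-(?P<m>\d{2})").unwrap();
    let caps = re.captures("2021-06").unwrap();
    let mut dst = String::new();
    // `$ya` looks up a group named `ya` (absent, so it expands to ""),
    // while `${y}a` unambiguously means group `y` followed by a literal `a`.
    caps.expand("$ya versus ${y}a", &mut dst);
    assert_eq!(dst, " versus 2021a");
}
```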
@ -975,15 +1000,15 @@ impl<'t> Captures<'t> {
|
|||
}
|
||||
|
||||
impl<'t> fmt::Debug for Captures<'t> {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
f.debug_tuple("Captures").field(&CapturesDebug(self)).finish()
|
||||
}
|
||||
}
|
||||
|
||||
struct CapturesDebug<'c, 't: 'c>(&'c Captures<'t>);
|
||||
struct CapturesDebug<'c, 't>(&'c Captures<'t>);
|
||||
|
||||
impl<'c, 't> fmt::Debug for CapturesDebug<'c, 't> {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
// We'd like to show something nice here, even if it means an
|
||||
// allocation to build a reverse index.
|
||||
let slot_to_name: HashMap<&usize, &String> =
|
||||
|
@ -1053,7 +1078,8 @@ impl<'t, 'i> Index<&'i str> for Captures<'t> {
|
|||
///
|
||||
/// The lifetime `'c` corresponds to the lifetime of the `Captures` value, and
|
||||
/// the lifetime `'t` corresponds to the originally matched text.
|
||||
pub struct SubCaptureMatches<'c, 't: 'c> {
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct SubCaptureMatches<'c, 't> {
|
||||
caps: &'c Captures<'t>,
|
||||
it: SubCapturesPosIter<'c>,
|
||||
}
|
||||
|
@ -1068,6 +1094,8 @@ impl<'c, 't> Iterator for SubCaptureMatches<'c, 't> {
|
|||
}
|
||||
}
|
||||
|
||||
impl<'c, 't> FusedIterator for SubCaptureMatches<'c, 't> {}
|
||||
|
||||
/// An iterator that yields all non-overlapping capture groups matching a
|
||||
/// particular regular expression.
|
||||
///
|
||||
|
@ -1075,6 +1103,7 @@ impl<'c, 't> Iterator for SubCaptureMatches<'c, 't> {
|
|||
///
|
||||
/// `'r` is the lifetime of the compiled regular expression and `'t` is the
|
||||
/// lifetime of the matched string.
|
||||
#[derive(Debug)]
|
||||
pub struct CaptureMatches<'r, 't>(
|
||||
re_trait::CaptureMatches<'t, ExecNoSyncStr<'r>>,
|
||||
);
|
||||
|
@ -1091,6 +1120,8 @@ impl<'r, 't> Iterator for CaptureMatches<'r, 't> {
|
|||
}
|
||||
}
|
||||
|
||||
impl<'r, 't> FusedIterator for CaptureMatches<'r, 't> {}
|
||||
|
||||
/// An iterator over all non-overlapping matches for a particular string.
|
||||
///
|
||||
/// The iterator yields a `Match` value. The iterator stops when no more
|
||||
|
@ -1098,6 +1129,7 @@ impl<'r, 't> Iterator for CaptureMatches<'r, 't> {
|
|||
///
|
||||
/// `'r` is the lifetime of the compiled regular expression and `'t` is the
|
||||
/// lifetime of the matched string.
|
||||
#[derive(Debug)]
|
||||
pub struct Matches<'r, 't>(re_trait::Matches<'t, ExecNoSyncStr<'r>>);
|
||||
|
||||
impl<'r, 't> Iterator for Matches<'r, 't> {
|
||||
|
@ -1109,12 +1141,14 @@ impl<'r, 't> Iterator for Matches<'r, 't> {
|
|||
}
|
||||
}
|
||||
|
||||
impl<'r, 't> FusedIterator for Matches<'r, 't> {}
|
||||
|
||||
/// Replacer describes types that can be used to replace matches in a string.
|
||||
///
|
||||
/// In general, users of this crate shouldn't need to implement this trait,
|
||||
/// since implementations are already provided for `&str` and
|
||||
/// `FnMut(&Captures) -> String` (or any `FnMut(&Captures) -> T`
|
||||
/// where `T: AsRef<str>`), which covers most use cases.
|
||||
/// since implementations are already provided for `&str` along with other
|
||||
/// variants of string types and `FnMut(&Captures) -> String` (or any
|
||||
/// `FnMut(&Captures) -> T` where `T: AsRef<str>`), which covers most use cases.
|
||||
pub trait Replacer {
|
||||
/// Appends text to `dst` to replace the current match.
|
||||
///
|
||||
|
@ -1122,8 +1156,8 @@ pub trait Replacer {
|
|||
/// have a match at capture group `0`.
|
||||
///
|
||||
/// For example, a no-op replacement would be
|
||||
/// `dst.extend(caps.get(0).unwrap().as_str())`.
|
||||
fn replace_append(&mut self, caps: &Captures, dst: &mut String);
|
||||
/// `dst.push_str(caps.get(0).unwrap().as_str())`.
|
||||
fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String);
|
||||
|
||||
/// Return a fixed unchanging replacement string.
|
||||
///
|
||||
|
@ -1166,36 +1200,81 @@ pub trait Replacer {
|
|||
///
|
||||
/// Returned by [`Replacer::by_ref`](trait.Replacer.html#method.by_ref).
|
||||
#[derive(Debug)]
|
||||
pub struct ReplacerRef<'a, R: ?Sized + 'a>(&'a mut R);
|
||||
pub struct ReplacerRef<'a, R: ?Sized>(&'a mut R);
|
||||
|
||||
impl<'a, R: Replacer + ?Sized + 'a> Replacer for ReplacerRef<'a, R> {
|
||||
fn replace_append(&mut self, caps: &Captures, dst: &mut String) {
|
||||
fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) {
|
||||
self.0.replace_append(caps, dst)
|
||||
}
|
||||
fn no_expansion(&mut self) -> Option<Cow<str>> {
|
||||
fn no_expansion(&mut self) -> Option<Cow<'_, str>> {
|
||||
self.0.no_expansion()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Replacer for &'a str {
|
||||
fn replace_append(&mut self, caps: &Captures, dst: &mut String) {
|
||||
fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) {
|
||||
caps.expand(*self, dst);
|
||||
}
|
||||
|
||||
fn no_expansion(&mut self) -> Option<Cow<str>> {
|
||||
match find_byte(b'$', self.as_bytes()) {
|
||||
Some(_) => None,
|
||||
None => Some(Cow::Borrowed(*self)),
|
||||
}
|
||||
fn no_expansion(&mut self) -> Option<Cow<'_, str>> {
|
||||
no_expansion(self)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Replacer for &'a String {
|
||||
fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) {
|
||||
self.as_str().replace_append(caps, dst)
|
||||
}
|
||||
|
||||
fn no_expansion(&mut self) -> Option<Cow<'_, str>> {
|
||||
no_expansion(self)
|
||||
}
|
||||
}
|
||||
|
||||
impl Replacer for String {
|
||||
fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) {
|
||||
self.as_str().replace_append(caps, dst)
|
||||
}
|
||||
|
||||
fn no_expansion(&mut self) -> Option<Cow<'_, str>> {
|
||||
no_expansion(self)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Replacer for Cow<'a, str> {
|
||||
fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) {
|
||||
self.as_ref().replace_append(caps, dst)
|
||||
}
|
||||
|
||||
fn no_expansion(&mut self) -> Option<Cow<'_, str>> {
|
||||
no_expansion(self)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Replacer for &'a Cow<'a, str> {
|
||||
fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) {
|
||||
self.as_ref().replace_append(caps, dst)
|
||||
}
|
||||
|
||||
fn no_expansion(&mut self) -> Option<Cow<'_, str>> {
|
||||
no_expansion(self)
|
||||
}
|
||||
}
|
||||
|
||||
fn no_expansion<T: AsRef<str>>(t: &T) -> Option<Cow<'_, str>> {
|
||||
let s = t.as_ref();
|
||||
match find_byte(b'$', s.as_bytes()) {
|
||||
Some(_) => None,
|
||||
None => Some(Cow::Borrowed(s)),
|
||||
}
|
||||
}
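A small sketch of what the new owned-string `Replacer` impls in this hunk allow (the values are arbitrary):

```rust
use std::borrow::Cow;

use regex::Regex;

fn main() {
    let re = Regex::new(r"\d").unwrap();
    // `String` and `Cow<str>` replacements no longer need to be borrowed as `&str` first.
    assert_eq!(re.replace_all("a1b2", String::from("x")), "axbx");
    assert_eq!(re.replace_all("a1b2", Cow::Borrowed("x")), "axbx");
}
```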
|
||||
|
||||
impl<F, T> Replacer for F
|
||||
where
|
||||
F: FnMut(&Captures) -> T,
|
||||
F: FnMut(&Captures<'_>) -> T,
|
||||
T: AsRef<str>,
|
||||
{
|
||||
fn replace_append(&mut self, caps: &Captures, dst: &mut String) {
|
||||
fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) {
|
||||
dst.push_str((*self)(caps).as_ref());
|
||||
}
|
||||
}
|
||||
|
@ -1208,14 +1287,15 @@ where
|
|||
/// and performant (since capture groups don't need to be found).
|
||||
///
|
||||
/// `'t` is the lifetime of the literal text.
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct NoExpand<'t>(pub &'t str);
|
||||
|
||||
impl<'t> Replacer for NoExpand<'t> {
|
||||
fn replace_append(&mut self, _: &Captures, dst: &mut String) {
|
||||
fn replace_append(&mut self, _: &Captures<'_>, dst: &mut String) {
|
||||
dst.push_str(self.0);
|
||||
}
|
||||
|
||||
fn no_expansion(&mut self) -> Option<Cow<str>> {
|
||||
fn no_expansion(&mut self) -> Option<Cow<'_, str>> {
|
||||
Some(Cow::Borrowed(self.0))
|
||||
}
|
||||
}
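And `NoExpand`, whose impl is shown just above, takes the replacement literally with no `$` expansion (the input mirrors the crate's own replace examples):

```rust
use regex::{NoExpand, Regex};

fn main() {
    let re = Regex::new(r"(?P<last>[^,\s]+),\s+(\S+)").unwrap();
    let result = re.replace("Springsteen, Bruce", NoExpand("$2 $last"));
    assert_eq!(result, "$2 $last");
}
```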
@ -1,3 +1,4 @@
|
|||
use std::fmt;
|
||||
use std::ops::Deref;
|
||||
use std::slice;
|
||||
|
||||
|
@ -7,11 +8,11 @@ use std::slice;
|
|||
/// entire set can also be done in constant time. Iteration yields elements
|
||||
/// in the order in which they were inserted.
|
||||
///
|
||||
/// The data structure is based on: http://research.swtch.com/sparse
|
||||
/// The data structure is based on: https://research.swtch.com/sparse
|
||||
/// Note though that we don't actually use uninitialized memory. We generally
|
||||
/// reuse allocations, so the initial allocation cost is bearable. However,
|
||||
/// its other properties listed above are extremely useful.
|
||||
#[derive(Clone, Debug)]
|
||||
#[derive(Clone)]
|
||||
pub struct SparseSet {
|
||||
/// Dense contains the instruction pointers in the order in which they
|
||||
/// were inserted.
|
||||
|
@ -60,6 +61,12 @@ impl SparseSet {
|
|||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for SparseSet {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
write!(f, "SparseSet({:?})", self.dense)
|
||||
}
|
||||
}
|
||||
|
||||
impl Deref for SparseSet {
|
||||
type Target = [usize];
|
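The sparse-set comment in this hunk is terse; below is a minimal standalone sketch of the idea (illustrative only, not the crate's actual code), assuming all inserted values are below the fixed capacity:

```rust
/// Minimal sparse-set sketch (Briggs/Torczon, as described at
/// https://research.swtch.com/sparse), using zero-initialized memory.
struct SparseSet {
    dense: Vec<usize>,  // values in insertion order
    sparse: Vec<usize>, // sparse[value] = index of `value` in `dense`
}

impl SparseSet {
    fn new(capacity: usize) -> SparseSet {
        SparseSet { dense: Vec::with_capacity(capacity), sparse: vec![0; capacity] }
    }

    fn insert(&mut self, value: usize) {
        if !self.contains(value) {
            self.sparse[value] = self.dense.len();
            self.dense.push(value);
        }
    }

    fn contains(&self, value: usize) -> bool {
        // O(1): the back-pointer must round-trip through `dense`.
        self.dense.get(self.sparse[value]).map_or(false, |&v| v == value)
    }

    fn clear(&mut self) {
        // O(1): stale `sparse` entries are harmless because `contains`
        // re-checks them against `dense`.
        self.dense.clear();
    }
}

fn main() {
    let mut set = SparseSet::new(8);
    set.insert(5);
    set.insert(2);
    set.insert(5); // duplicate, ignored
    assert!(set.contains(5) && set.contains(2) && !set.contains(3));
    assert_eq!(set.dense, vec![5, 2]); // insertion order preserved
    set.clear();
    assert!(!set.contains(5));
}
```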
@ -1,5 +1,7 @@
|
|||
#!/bin/bash
|
||||
|
||||
set -e
|
||||
|
||||
# This is a convenience script for running a broad swath of tests across
|
||||
# features. We don't test the complete space, since the complete space is quite
|
||||
# large. Hopefully once we migrate the test suite to better infrastructure
|
@ -195,6 +195,18 @@ expand!(
|
|||
);
|
||||
expand!(expand10, r"(?-u)(?P<a>\w+)\s+(?P<b>\d+)", "abc 123", "$bz$az", "");
|
||||
|
||||
expand!(expand_name1, r"%(?P<Z>[a-z]+)", "%abc", "$Z%", "abc%");
|
||||
expand!(expand_name2, r"\[(?P<Z>[a-z]+)", "[abc", "$Z[", "abc[");
|
||||
expand!(expand_name3, r"\{(?P<Z>[a-z]+)", "{abc", "$Z{", "abc{");
|
||||
expand!(expand_name4, r"\}(?P<Z>[a-z]+)", "}abc", "$Z}", "abc}");
|
||||
expand!(expand_name5, r"%([a-z]+)", "%abc", "$1a%", "%");
|
||||
expand!(expand_name6, r"%([a-z]+)", "%abc", "${1}a%", "abca%");
|
||||
expand!(expand_name7, r"\[(?P<Z[>[a-z]+)", "[abc", "${Z[}[", "abc[");
|
||||
expand!(expand_name8, r"\[(?P<Z[>[a-z]+)", "[abc", "${foo}[", "[");
|
||||
expand!(expand_name9, r"\[(?P<Z[>[a-z]+)", "[abc", "${1a}[", "[");
|
||||
expand!(expand_name10, r"\[(?P<Z[>[a-z]+)", "[abc", "${#}[", "[");
|
||||
expand!(expand_name11, r"\[(?P<Z[>[a-z]+)", "[abc", "${$$}[", "[");
|
||||
|
||||
split!(
|
||||
split1,
|
||||
r"(?-u)\s+",
@ -157,10 +157,7 @@ macro_rules! checker {
|
|||
}
|
||||
|
||||
impl quickcheck::Testable for RegexEqualityTest {
|
||||
fn result<G: quickcheck::Gen>(
|
||||
&self,
|
||||
gen: &mut G,
|
||||
) -> TestResult {
|
||||
fn result(&self, gen: &mut quickcheck::Gen) -> TestResult {
|
||||
let input = $mk_input(gen);
|
||||
let input = &input;
@ -118,6 +118,18 @@ matiter!(match_empty8, r"()+|z", "abc", (0, 0), (1, 1), (2, 2), (3, 3));
|
|||
matiter!(match_empty9, r"z|()+", "abc", (0, 0), (1, 1), (2, 2), (3, 3));
|
||||
matiter!(match_empty10, r"()+|b", "abc", (0, 0), (1, 1), (2, 2), (3, 3));
|
||||
matiter!(match_empty11, r"b|()+", "abc", (0, 0), (1, 2), (3, 3));
|
||||
matiter!(match_empty12, r"|b", "abc", (0, 0), (1, 1), (2, 2), (3, 3));
|
||||
matiter!(match_empty13, r"b|", "abc", (0, 0), (1, 2), (3, 3));
|
||||
matiter!(match_empty14, r"|z", "abc", (0, 0), (1, 1), (2, 2), (3, 3));
|
||||
matiter!(match_empty15, r"z|", "abc", (0, 0), (1, 1), (2, 2), (3, 3));
|
||||
matiter!(match_empty16, r"|", "abc", (0, 0), (1, 1), (2, 2), (3, 3));
|
||||
matiter!(match_empty17, r"||", "abc", (0, 0), (1, 1), (2, 2), (3, 3));
|
||||
matiter!(match_empty18, r"||z", "abc", (0, 0), (1, 1), (2, 2), (3, 3));
|
||||
matiter!(match_empty19, r"(?:)|b", "abc", (0, 0), (1, 1), (2, 2), (3, 3));
|
||||
matiter!(match_empty20, r"b|(?:)", "abc", (0, 0), (1, 2), (3, 3));
|
||||
matiter!(match_empty21, r"(?:|)", "abc", (0, 0), (1, 1), (2, 2), (3, 3));
|
||||
matiter!(match_empty22, r"(?:|)|z", "abc", (0, 0), (1, 1), (2, 2), (3, 3));
|
||||
matiter!(match_empty23, r"a(?:)|b", "abc", (0, 1), (1, 2));
|
||||
|
||||
// Test that the DFA can handle pathological cases.
|
||||
// (This should result in the DFA's cache being flushed too frequently, which
|
||||
|
@ -125,9 +137,10 @@ matiter!(match_empty11, r"b|()+", "abc", (0, 0), (1, 2), (3, 3));
|
|||
#[test]
|
||||
fn dfa_handles_pathological_case() {
|
||||
fn ones_and_zeroes(count: usize) -> String {
|
||||
use rand::{thread_rng, Rng};
|
||||
use rand::rngs::SmallRng;
|
||||
use rand::{Rng, SeedableRng};
|
||||
|
||||
let mut rng = thread_rng();
|
||||
let mut rng = SmallRng::from_entropy();
|
||||
let mut s = String::new();
|
||||
for _ in 0..count {
|
||||
if rng.gen() {
@ -4,7 +4,6 @@ macro_rules! t { ($re:expr) => { text!($re) } }
|
|||
macro_rules! match_text { ($text:expr) => { $text.as_bytes() } }
|
||||
macro_rules! use_ { ($($path: tt)*) => { use regex::bytes::$($path)*; } }
|
||||
macro_rules! empty_vec { () => { <Vec<&[u8]>>::new() } }
|
||||
|
||||
macro_rules! bytes { ($text:expr) => { $text } }
|
||||
|
||||
macro_rules! no_expand {
@ -4,6 +4,7 @@ macro_rules! t { ($text:expr) => { text!($text) } }
|
|||
macro_rules! match_text { ($text:expr) => { $text.as_str() } }
|
||||
macro_rules! use_ { ($($path: tt)*) => { use regex::$($path)*; } }
|
||||
macro_rules! empty_vec { () => { <Vec<&str>>::new() } }
|
||||
macro_rules! bytes { ($text:expr) => { std::str::from_utf8($text.as_ref()).unwrap() } }
|
||||
|
||||
macro_rules! no_expand {
|
||||
($text:expr) => {{
@ -26,6 +26,8 @@ noparse!(fail_bad_capture_name, "(?P<na-me>)");
|
|||
noparse!(fail_bad_flag, "(?a)a");
|
||||
noparse!(fail_too_big, "a{10000000}");
|
||||
noparse!(fail_counted_no_close, "a{1001");
|
||||
noparse!(fail_counted_decreasing, "a{2,1}");
|
||||
noparse!(fail_counted_nonnegative, "a{-1,1}");
|
||||
noparse!(fail_unfinished_cap, "(?");
|
||||
noparse!(fail_unfinished_escape, "\\");
|
||||
noparse!(fail_octal_digit, r"\8");
|
||||
|
@ -41,10 +43,3 @@ noparse!(fail_range_end_no_class, "[a-[:lower:]]");
|
|||
noparse!(fail_range_end_no_begin, r"[a-\A]");
|
||||
noparse!(fail_range_end_no_end, r"[a-\z]");
|
||||
noparse!(fail_range_end_no_boundary, r"[a-\b]");
|
||||
noparse!(fail_empty_alt1, r"|z");
|
||||
noparse!(fail_empty_alt2, r"z|");
|
||||
noparse!(fail_empty_alt3, r"|");
|
||||
noparse!(fail_empty_alt4, r"||");
|
||||
noparse!(fail_empty_alt5, r"()|z");
|
||||
noparse!(fail_empty_alt6, r"z|()");
|
||||
noparse!(fail_empty_alt7, r"(|)");
@ -199,3 +199,21 @@ fn regression_nfa_stops1() {
|
|||
let re = ::regex::bytes::Regex::new(r"\bs(?:[ab])").unwrap();
|
||||
assert_eq!(0, re.find_iter(b"s\xE4").count());
|
||||
}
|
||||
|
||||
// See: https://github.com/rust-lang/regex/issues/640
|
||||
#[cfg(feature = "unicode-case")]
|
||||
matiter!(
|
||||
flags_are_unset,
|
||||
r"((?i)foo)|Bar",
|
||||
"foo Foo bar Bar",
|
||||
(0, 3),
|
||||
(4, 7),
|
||||
(12, 15)
|
||||
);
|
||||
|
||||
// See: https://github.com/rust-lang/regex/issues/659
|
||||
//
|
||||
// Note that 'Ј' is not 'j', but cyrillic Je
|
||||
// https://en.wikipedia.org/wiki/Je_(Cyrillic)
|
||||
ismatch!(empty_group_match, r"()Ј01", "zЈ01", true);
|
||||
matiter!(empty_group_find, r"()Ј01", "zЈ01", (1, 5));
@ -0,0 +1,31 @@
|
|||
// These tests are only run for the "default" test target because some of them
|
||||
// can take quite a long time. Some of them take long enough that it's not
|
||||
// practical to run them in debug mode. :-/
|
||||
|
||||
// See: https://oss-fuzz.com/testcase-detail/5673225499181056
|
||||
//
|
||||
// Ignored by default since it takes too long in debug mode (almost a minute).
|
||||
#[test]
|
||||
#[ignore]
|
||||
fn fuzz1() {
|
||||
regex!(r"1}{55}{0}*{1}{55}{55}{5}*{1}{55}+{56}|;**");
|
||||
}
|
||||
|
||||
// See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=26505
|
||||
// See: https://github.com/rust-lang/regex/issues/722
|
||||
#[test]
|
||||
fn empty_any_errors_no_panic() {
|
||||
assert!(regex_new!(r"\P{any}").is_err());
|
||||
}
|
||||
|
||||
// This tests that a very large regex errors during compilation instead of
|
||||
// using gratuitous amounts of memory. The specific problem is that the
|
||||
// compiler wasn't accounting for the memory used by Unicode character classes
|
||||
// correctly.
|
||||
//
|
||||
// See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=33579
|
||||
#[test]
|
||||
fn big_regex_fails_to_compile() {
|
||||
let pat = "[\u{0}\u{e}\u{2}\\w~~>[l\t\u{0}]p?<]{971158}";
|
||||
assert!(regex_new!(pat).is_err());
|
||||
}
@ -94,7 +94,7 @@ replace!(
|
|||
replace,
|
||||
r"([0-9]+)",
|
||||
"age: 26",
|
||||
|captures: &Captures| {
|
||||
|captures: &Captures<'_>| {
|
||||
match_text!(captures.get(1).unwrap())[0..1].to_owned()
|
||||
},
|
||||
"age: 2"
|
||||
|
@ -104,7 +104,7 @@ replace!(
|
|||
replace,
|
||||
r"[0-9]+",
|
||||
"age: 26",
|
||||
|_captures: &Captures| t!("Z").to_owned(),
|
||||
|_captures: &Captures<'_>| t!("Z").to_owned(),
|
||||
"age: Z"
|
||||
);
|
||||
|
||||
|
@ -130,3 +130,101 @@ replace!(
|
|||
t!("${1}a $1a"),
|
||||
"ba "
|
||||
);
|
||||
|
||||
replace!(
|
||||
impl_string,
|
||||
replace,
|
||||
r"[0-9]",
|
||||
"age: 26",
|
||||
t!("Z".to_string()),
|
||||
"age: Z6"
|
||||
);
|
||||
replace!(
|
||||
impl_string_ref,
|
||||
replace,
|
||||
r"[0-9]",
|
||||
"age: 26",
|
||||
t!(&"Z".to_string()),
|
||||
"age: Z6"
|
||||
);
|
||||
replace!(
|
||||
impl_cow_str_borrowed,
|
||||
replace,
|
||||
r"[0-9]",
|
||||
"age: 26",
|
||||
t!(std::borrow::Cow::<'_, str>::Borrowed("Z")),
|
||||
"age: Z6"
|
||||
);
|
||||
replace!(
|
||||
impl_cow_str_borrowed_ref,
|
||||
replace,
|
||||
r"[0-9]",
|
||||
"age: 26",
|
||||
t!(&std::borrow::Cow::<'_, str>::Borrowed("Z")),
|
||||
"age: Z6"
|
||||
);
|
||||
replace!(
|
||||
impl_cow_str_owned,
|
||||
replace,
|
||||
r"[0-9]",
|
||||
"age: 26",
|
||||
t!(std::borrow::Cow::<'_, str>::Owned("Z".to_string())),
|
||||
"age: Z6"
|
||||
);
|
||||
replace!(
|
||||
impl_cow_str_owned_ref,
|
||||
replace,
|
||||
r"[0-9]",
|
||||
"age: 26",
|
||||
t!(&std::borrow::Cow::<'_, str>::Owned("Z".to_string())),
|
||||
"age: Z6"
|
||||
);
|
||||
|
||||
replace!(
|
||||
impl_vec_u8,
|
||||
replace,
|
||||
r"[0-9]",
|
||||
"age: 26",
|
||||
bytes!(vec![b'Z']),
|
||||
"age: Z6"
|
||||
);
|
||||
replace!(
|
||||
impl_vec_u8_ref,
|
||||
replace,
|
||||
r"[0-9]",
|
||||
"age: 26",
|
||||
bytes!(&vec![b'Z']),
|
||||
"age: Z6"
|
||||
);
|
||||
replace!(
|
||||
impl_cow_slice_borrowed,
|
||||
replace,
|
||||
r"[0-9]",
|
||||
"age: 26",
|
||||
bytes!(std::borrow::Cow::<'_, [u8]>::Borrowed(&[b'Z'])),
|
||||
"age: Z6"
|
||||
);
|
||||
replace!(
|
||||
impl_cow_slice_borrowed_ref,
|
||||
replace,
|
||||
r"[0-9]",
|
||||
"age: 26",
|
||||
bytes!(&std::borrow::Cow::<'_, [u8]>::Borrowed(&[b'Z'])),
|
||||
"age: Z6"
|
||||
);
|
||||
replace!(
|
||||
impl_cow_slice_owned,
|
||||
replace,
|
||||
r"[0-9]",
|
||||
"age: 26",
|
||||
bytes!(std::borrow::Cow::<'_, [u8]>::Owned(vec![b'Z'])),
|
||||
"age: Z6"
|
||||
);
|
||||
replace!(
|
||||
impl_cow_slice_owned_ref,
|
||||
replace,
|
||||
r"[0-9]",
|
||||
"age: 26",
|
||||
bytes!(&std::borrow::Cow::<'_, [u8]>::Owned(vec![b'Z'])),
|
||||
"age: Z6"
|
||||
);
@ -17,6 +17,17 @@ matset!(set16, &["a"], "a", 0);
|
|||
matset!(set17, &[".*a"], "a", 0);
|
||||
matset!(set18, &["a", "β"], "β", 1);
|
||||
|
||||
// regexes that match the empty string
|
||||
matset!(setempty1, &["", "a"], "abc", 0, 1);
|
||||
matset!(setempty2, &["", "b"], "abc", 0, 1);
|
||||
matset!(setempty3, &["", "z"], "abc", 0);
|
||||
matset!(setempty4, &["a", ""], "abc", 0, 1);
|
||||
matset!(setempty5, &["b", ""], "abc", 0, 1);
|
||||
matset!(setempty6, &["z", ""], "abc", 1);
|
||||
matset!(setempty7, &["b", "(?:)"], "abc", 0, 1);
|
||||
matset!(setempty8, &["(?:)", "b"], "abc", 0, 1);
|
||||
matset!(setempty9, &["c(?:)", "b"], "abc", 0, 1);
|
||||
|
||||
nomatset!(nset1, &["a", "a"], "b");
|
||||
nomatset!(nset2, &["^foo", "bar$"], "bar foo");
|
||||
nomatset!(
|
||||
|
@ -43,3 +54,14 @@ fn get_set_patterns() {
|
|||
let set = regex_set!(&["a", "b"]);
|
||||
assert_eq!(vec!["a", "b"], set.patterns());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn len_and_empty() {
|
||||
let empty = regex_set!(&[""; 0]);
|
||||
assert_eq!(empty.len(), 0);
|
||||
assert!(empty.is_empty());
|
||||
|
||||
let not_empty = regex_set!(&["ab", "b"]);
|
||||
assert_eq!(not_empty.len(), 2);
|
||||
assert!(!not_empty.is_empty());
|
||||
}
@ -1,8 +1,5 @@
|
|||
#![cfg_attr(feature = "pattern", feature(pattern))]
|
||||
|
||||
extern crate rand;
|
||||
extern crate regex;
|
||||
|
||||
macro_rules! regex_new {
|
||||
($re:expr) => {{
|
||||
use regex::internal::ExecBuilder;
@ -1,6 +1,3 @@
|
|||
extern crate rand;
|
||||
extern crate regex;
|
||||
|
||||
macro_rules! regex_new {
|
||||
($re:expr) => {{
|
||||
use regex::internal::ExecBuilder;
@ -1,8 +1,5 @@
|
|||
#![cfg_attr(feature = "pattern", feature(pattern))]
|
||||
|
||||
extern crate rand;
|
||||
extern crate regex;
|
||||
|
||||
macro_rules! regex_new {
|
||||
($re:expr) => {{
|
||||
use regex::internal::ExecBuilder;
@ -1,6 +1,3 @@
|
|||
extern crate quickcheck;
|
||||
extern crate regex;
|
||||
|
||||
/*
|
||||
* This test is a minimal version of <rofl_0> and <subdiff_0>
|
||||
*
@ -1,7 +1,6 @@
|
|||
#![cfg_attr(feature = "pattern", feature(pattern))]
|
||||
|
||||
extern crate rand;
|
||||
extern crate regex;
|
||||
use regex;
|
||||
|
||||
// Due to macro scoping rules, this definition only applies for the modules
|
||||
// defined below. Effectively, it allows us to use the same tests for both
|
||||
|
@ -49,6 +48,7 @@ mod misc;
|
|||
mod multiline;
|
||||
mod noparse;
|
||||
mod regression;
|
||||
mod regression_fuzz;
|
||||
mod replace;
|
||||
mod searcher;
|
||||
mod set;
|
||||
|
@ -82,26 +82,49 @@ fn allow_octal() {
|
|||
#[test]
|
||||
fn oibits() {
|
||||
use regex::bytes;
|
||||
use regex::{Regex, RegexBuilder};
|
||||
use std::panic::UnwindSafe;
|
||||
use regex::{Regex, RegexBuilder, RegexSet, RegexSetBuilder};
|
||||
use std::panic::{RefUnwindSafe, UnwindSafe};
|
||||
|
||||
fn assert_send<T: Send>() {}
|
||||
fn assert_sync<T: Sync>() {}
|
||||
fn assert_unwind_safe<T: UnwindSafe>() {}
|
||||
fn assert_ref_unwind_safe<T: RefUnwindSafe>() {}
|
||||
|
||||
assert_send::<Regex>();
|
||||
assert_sync::<Regex>();
|
||||
assert_unwind_safe::<Regex>();
|
||||
assert_ref_unwind_safe::<Regex>();
|
||||
assert_send::<RegexBuilder>();
|
||||
assert_sync::<RegexBuilder>();
|
||||
assert_unwind_safe::<RegexBuilder>();
|
||||
assert_ref_unwind_safe::<RegexBuilder>();
|
||||
|
||||
assert_send::<bytes::Regex>();
|
||||
assert_sync::<bytes::Regex>();
|
||||
assert_unwind_safe::<bytes::Regex>();
|
||||
assert_ref_unwind_safe::<bytes::Regex>();
|
||||
assert_send::<bytes::RegexBuilder>();
|
||||
assert_sync::<bytes::RegexBuilder>();
|
||||
assert_unwind_safe::<bytes::RegexBuilder>();
|
||||
assert_ref_unwind_safe::<bytes::RegexBuilder>();
|
||||
|
||||
assert_send::<RegexSet>();
|
||||
assert_sync::<RegexSet>();
|
||||
assert_unwind_safe::<RegexSet>();
|
||||
assert_ref_unwind_safe::<RegexSet>();
|
||||
assert_send::<RegexSetBuilder>();
|
||||
assert_sync::<RegexSetBuilder>();
|
||||
assert_unwind_safe::<RegexSetBuilder>();
|
||||
assert_ref_unwind_safe::<RegexSetBuilder>();
|
||||
|
||||
assert_send::<bytes::RegexSet>();
|
||||
assert_sync::<bytes::RegexSet>();
|
||||
assert_unwind_safe::<bytes::RegexSet>();
|
||||
assert_ref_unwind_safe::<bytes::RegexSet>();
|
||||
assert_send::<bytes::RegexSetBuilder>();
|
||||
assert_sync::<bytes::RegexSetBuilder>();
|
||||
assert_unwind_safe::<bytes::RegexSetBuilder>();
|
||||
assert_ref_unwind_safe::<bytes::RegexSetBuilder>();
|
||||
}
|
||||
|
||||
// See: https://github.com/rust-lang/regex/issues/568
|
||||
|
@ -112,3 +135,18 @@ fn oibits_regression() {
|
|||
|
||||
let _ = panic::catch_unwind(|| Regex::new("a").unwrap());
|
||||
}
|
||||
|
||||
// See: https://github.com/rust-lang/regex/issues/750
|
||||
#[test]
|
||||
#[cfg(target_pointer_width = "64")]
|
||||
fn regex_is_reasonably_small() {
|
||||
use std::mem::size_of;
|
||||
|
||||
use regex::bytes;
|
||||
use regex::{Regex, RegexSet};
|
||||
|
||||
assert_eq!(16, size_of::<Regex>());
|
||||
assert_eq!(16, size_of::<RegexSet>());
|
||||
assert_eq!(16, size_of::<bytes::Regex>());
|
||||
assert_eq!(16, size_of::<bytes::RegexSet>());
|
||||
}
@ -1,6 +1,3 @@
|
|||
extern crate rand;
|
||||
extern crate regex;
|
||||
|
||||
macro_rules! regex_new {
|
||||
($re:expr) => {{
|
||||
use regex::bytes::Regex;
@ -1,8 +1,5 @@
|
|||
#![cfg_attr(feature = "pattern", feature(pattern))]
|
||||
|
||||
extern crate rand;
|
||||
extern crate regex;
|
||||
|
||||
macro_rules! regex_new {
|
||||
($re:expr) => {{
|
||||
use regex::internal::ExecBuilder;
@ -1,6 +1,3 @@
|
|||
extern crate rand;
|
||||
extern crate regex;
|
||||
|
||||
macro_rules! regex_new {
|
||||
($re:expr) => {{
|
||||
use regex::internal::ExecBuilder;
@ -1,8 +1,5 @@
|
|||
#![cfg_attr(feature = "pattern", feature(pattern))]
|
||||
|
||||
extern crate rand;
|
||||
extern crate regex;
|
||||
|
||||
macro_rules! regex_new {
|
||||
($re:expr) => {{
|
||||
use regex::internal::ExecBuilder;
@ -74,6 +74,9 @@ mat!(
|
|||
Some((0, 3))
|
||||
);
|
||||
mat!(uni_class_gencat_format, r"\p{Format}", "\u{E007F}", Some((0, 4)));
|
||||
// See: https://github.com/rust-lang/regex/issues/719
|
||||
mat!(uni_class_gencat_format_abbrev1, r"\p{cf}", "\u{E007F}", Some((0, 4)));
|
||||
mat!(uni_class_gencat_format_abbrev2, r"\p{gc=cf}", "\u{E007F}", Some((0, 4)));
|
||||
mat!(
|
||||
uni_class_gencat_initial_punctuation,
|
||||
r"\p{Initial_Punctuation}",
|
||||
@ -1 +0,0 @@
|
|||
{"files":{"Cargo.toml":"a08d3007cec7ad1a83afad57980965ece5457089404f6f5d41eacc8143386d69","LICENSE-APACHE":"a60eea817514531668d7e00765731449fe14d059d3249e0bc93b36de45f759f2","LICENSE-MIT":"c9a75f18b9ab2927829a208fc6aa2cf4e63b8420887ba29cdb265d6619ae82d5","README.md":"ab6f09e96c06e37ee3df492562a07c1c3548dd5abf73301f8215a5dcedcccc84","benches/thread_local.rs":"cc8bde81ed6206525feff209598caf1e01e89a83bf21d8b7ccc0dadc8b89d815","src/cached.rs":"089286aa7bcde7c92b1ee7381b74f8c30049c0d80a85c1babdbac69b2e210396","src/lib.rs":"a67d7bf8c7c3bd869ea297cf1d158db8c9c4bbf7ae1e23d9028cfc3a7554e235","src/thread_id.rs":"0962c130061939557aa272115e4420fbbc63b6bd306783a456a8ffcbf304a447","src/unreachable.rs":"830d44988f86f4fc6c3c4dd7e9e4e7d0f2cb9c5b024c360b5f7ceae365983367"},"package":"d40c6d1b69745a6ec6fb1ca717914848da4b44ae29d9b3080cbee91d72a69b14"}
@ -1,26 +0,0 @@
|
|||
# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
|
||||
#
|
||||
# When uploading crates to the registry Cargo will automatically
|
||||
# "normalize" Cargo.toml files for maximal compatibility
|
||||
# with all versions of Cargo and also rewrite `path` dependencies
|
||||
# to registry (e.g., crates.io) dependencies
|
||||
#
|
||||
# If you believe there's an error in this file please file an
|
||||
# issue against the rust-lang/cargo repository. If you're
|
||||
# editing this file be aware that the upstream Cargo.toml
|
||||
# will likely look very different (and much more reasonable)
|
||||
|
||||
[package]
|
||||
name = "thread_local"
|
||||
version = "1.0.1"
|
||||
authors = ["Amanieu d'Antras <amanieu@gmail.com>"]
|
||||
description = "Per-object thread-local storage"
|
||||
documentation = "https://amanieu.github.io/thread_local-rs/thread_local/index.html"
|
||||
readme = "README.md"
|
||||
keywords = ["thread_local", "concurrent", "thread"]
|
||||
license = "Apache-2.0/MIT"
|
||||
repository = "https://github.com/Amanieu/thread_local-rs"
|
||||
[dependencies.lazy_static]
|
||||
version = "1.0"
|
||||
[badges.travis-ci]
|
||||
repository = "Amanieu/thread_local-rs"
@ -1,201 +0,0 @@
|
|||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright [yyyy] [name of copyright owner]
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
@ -1,25 +0,0 @@
|
|||
Copyright (c) 2016 The Rust Project Developers
|
||||
|
||||
Permission is hereby granted, free of charge, to any
|
||||
person obtaining a copy of this software and associated
|
||||
documentation files (the "Software"), to deal in the
|
||||
Software without restriction, including without
|
||||
limitation the rights to use, copy, modify, merge,
|
||||
publish, distribute, sublicense, and/or sell copies of
|
||||
the Software, and to permit persons to whom the Software
|
||||
is furnished to do so, subject to the following
|
||||
conditions:
|
||||
|
||||
The above copyright notice and this permission notice
|
||||
shall be included in all copies or substantial portions
|
||||
of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
|
||||
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
|
||||
TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
|
||||
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
|
||||
SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
||||
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
|
||||
IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
DEALINGS IN THE SOFTWARE.
@@ -1,41 +0,0 @@
thread_local
============

[![Build Status](https://travis-ci.org/Amanieu/thread_local-rs.svg?branch=master)](https://travis-ci.org/Amanieu/thread_local-rs) [![Crates.io](https://img.shields.io/crates/v/thread_local.svg)](https://crates.io/crates/thread_local)

This library provides the `ThreadLocal` and `CachedThreadLocal` types which
allow a separate copy of an object to be used for each thread. This allows for
per-object thread-local storage, unlike the standard library's `thread_local!`
macro which only allows static thread-local storage.

[Documentation](https://amanieu.github.io/thread_local-rs/thread_local/index.html)

## Usage

Add this to your `Cargo.toml`:

```toml
[dependencies]
thread_local = "1.0"
```

and this to your crate root:

```rust
extern crate thread_local;
```

## License

Licensed under either of

 * Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
 * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)

at your option.

### Contribution

Unless you explicitly state otherwise, any contribution intentionally submitted
for inclusion in the work by you, as defined in the Apache-2.0 license, shall be dual licensed as above, without any
additional terms or conditions.
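For context, a minimal usage sketch of the `CachedThreadLocal` type named in this README, following the API it documents (the counter type and the loop are illustrative, not taken from the removed sources):

```rust
use std::cell::Cell;
use thread_local::CachedThreadLocal;

fn main() {
    // One Cell<u64> per thread that touches this value.
    let tls: CachedThreadLocal<Cell<u64>> = CachedThreadLocal::new();
    // The first access from this thread creates the value and registers the
    // thread as the owner; later accesses from the owner hit the fast path.
    for _ in 0..1000 {
        let counter = tls.get_or(|| Cell::new(0));
        counter.set(counter.get() + 1);
    }
    assert_eq!(tls.get().map(|c| c.get()), Some(1000));
}
```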
@@ -1,18 +0,0 @@
#![feature(test)]

extern crate thread_local;
extern crate test;

use thread_local::{ThreadLocal, CachedThreadLocal};

#[bench]
fn thread_local(b: &mut test::Bencher) {
    let local = ThreadLocal::new();
    b.iter(|| { let _: &i32 = local.get_or(|| Box::new(0)); });
}

#[bench]
fn cached_thread_local(b: &mut test::Bencher) {
    let local = CachedThreadLocal::new();
    b.iter(|| { let _: &i32 = local.get_or(|| Box::new(0)); });
}
@@ -1,198 +0,0 @@
use super::{IntoIter, IterMut, ThreadLocal};
use std::cell::UnsafeCell;
use std::fmt;
use std::panic::UnwindSafe;
use std::sync::atomic::{AtomicUsize, Ordering};
use thread_id;
use unreachable::{UncheckedOptionExt, UncheckedResultExt};

/// Wrapper around `ThreadLocal` which adds a fast path for a single thread.
///
/// This has the same API as `ThreadLocal`, but will register the first thread
/// that sets a value as its owner. All accesses by the owner will go through
/// a special fast path which is much faster than the normal `ThreadLocal` path.
pub struct CachedThreadLocal<T: Send> {
    owner: AtomicUsize,
    local: UnsafeCell<Option<Box<T>>>,
    global: ThreadLocal<T>,
}

// CachedThreadLocal is always Sync, even if T isn't
unsafe impl<T: Send> Sync for CachedThreadLocal<T> {}

impl<T: Send> Default for CachedThreadLocal<T> {
    fn default() -> CachedThreadLocal<T> {
        CachedThreadLocal::new()
    }
}

impl<T: Send> CachedThreadLocal<T> {
    /// Creates a new empty `CachedThreadLocal`.
    pub fn new() -> CachedThreadLocal<T> {
        CachedThreadLocal {
            owner: AtomicUsize::new(0),
            local: UnsafeCell::new(None),
            global: ThreadLocal::new(),
        }
    }

    /// Returns the element for the current thread, if it exists.
    pub fn get(&self) -> Option<&T> {
        let id = thread_id::get();
        let owner = self.owner.load(Ordering::Relaxed);
        if owner == id {
            return unsafe { Some((*self.local.get()).as_ref().unchecked_unwrap()) };
        }
        if owner == 0 {
            return None;
        }
        self.global.get_fast(id)
    }

    /// Returns the element for the current thread, or creates it if it doesn't
    /// exist.
    #[inline(always)]
    pub fn get_or<F>(&self, create: F) -> &T
    where
        F: FnOnce() -> T,
    {
        unsafe {
            self.get_or_try(|| Ok::<T, ()>(create()))
                .unchecked_unwrap_ok()
        }
    }

    /// Returns the element for the current thread, or creates it if it doesn't
    /// exist. If `create` fails, that error is returned and no element is
    /// added.
    pub fn get_or_try<F, E>(&self, create: F) -> Result<&T, E>
    where
        F: FnOnce() -> Result<T, E>,
    {
        let id = thread_id::get();
        let owner = self.owner.load(Ordering::Relaxed);
        if owner == id {
            return Ok(unsafe { (*self.local.get()).as_ref().unchecked_unwrap() });
        }
        self.get_or_try_slow(id, owner, create)
    }

    #[cold]
    #[inline(never)]
    fn get_or_try_slow<F, E>(&self, id: usize, owner: usize, create: F) -> Result<&T, E>
    where
        F: FnOnce() -> Result<T, E>,
    {
        if owner == 0 && self.owner.compare_and_swap(0, id, Ordering::Relaxed) == 0 {
            unsafe {
                (*self.local.get()) = Some(Box::new(create()?));
                return Ok((*self.local.get()).as_ref().unchecked_unwrap());
            }
        }
        match self.global.get_fast(id) {
            Some(x) => Ok(x),
            None => Ok(self.global.insert(id, Box::new(create()?), true)),
        }
    }

    /// Returns a mutable iterator over the local values of all threads.
    ///
    /// Since this call borrows the `ThreadLocal` mutably, this operation can
    /// be done safely---the mutable borrow statically guarantees no other
    /// threads are currently accessing their associated values.
    pub fn iter_mut(&mut self) -> CachedIterMut<T> {
        CachedIterMut {
            local: unsafe { (*self.local.get()).as_mut().map(|x| &mut **x) },
            global: self.global.iter_mut(),
        }
    }

    /// Removes all thread-specific values from the `ThreadLocal`, effectively
    /// resetting it to its original state.
    ///
    /// Since this call borrows the `ThreadLocal` mutably, this operation can
    /// be done safely---the mutable borrow statically guarantees no other
    /// threads are currently accessing their associated values.
    pub fn clear(&mut self) {
        *self = CachedThreadLocal::new();
    }
}

impl<T: Send> IntoIterator for CachedThreadLocal<T> {
    type Item = T;
    type IntoIter = CachedIntoIter<T>;

    fn into_iter(self) -> CachedIntoIter<T> {
        CachedIntoIter {
            local: unsafe { (*self.local.get()).take().map(|x| *x) },
            global: self.global.into_iter(),
        }
    }
}

impl<'a, T: Send + 'a> IntoIterator for &'a mut CachedThreadLocal<T> {
    type Item = &'a mut T;
    type IntoIter = CachedIterMut<'a, T>;

    fn into_iter(self) -> CachedIterMut<'a, T> {
        self.iter_mut()
    }
}

impl<T: Send + Default> CachedThreadLocal<T> {
    /// Returns the element for the current thread, or creates a default one if
    /// it doesn't exist.
    pub fn get_or_default(&self) -> &T {
        self.get_or(T::default)
    }
}

impl<T: Send + fmt::Debug> fmt::Debug for CachedThreadLocal<T> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "ThreadLocal {{ local_data: {:?} }}", self.get())
    }
}

impl<T: Send + UnwindSafe> UnwindSafe for CachedThreadLocal<T> {}

/// Mutable iterator over the contents of a `CachedThreadLocal`.
pub struct CachedIterMut<'a, T: Send + 'a> {
    local: Option<&'a mut T>,
    global: IterMut<'a, T>,
}

impl<'a, T: Send + 'a> Iterator for CachedIterMut<'a, T> {
    type Item = &'a mut T;

    fn next(&mut self) -> Option<&'a mut T> {
        self.local.take().or_else(|| self.global.next())
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        let len = self.global.size_hint().0 + self.local.is_some() as usize;
        (len, Some(len))
    }
}

impl<'a, T: Send + 'a> ExactSizeIterator for CachedIterMut<'a, T> {}

/// An iterator that moves out of a `CachedThreadLocal`.
pub struct CachedIntoIter<T: Send> {
    local: Option<T>,
    global: IntoIter<T>,
}

impl<T: Send> Iterator for CachedIntoIter<T> {
    type Item = T;

    fn next(&mut self) -> Option<T> {
        self.local.take().or_else(|| self.global.next())
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        let len = self.global.size_hint().0 + self.local.is_some() as usize;
        (len, Some(len))
    }
}

impl<T: Send> ExactSizeIterator for CachedIntoIter<T> {}
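The "first writer becomes the owner" claim in `get_or_try_slow` above uses `compare_and_swap`, which was later deprecated (Rust 1.50). A minimal standalone sketch of the same claim step written with `compare_exchange` instead; `try_claim` and the IDs in `main` are invented for illustration:

```rust
use std::sync::atomic::{AtomicUsize, Ordering};

// Succeeds only for the first thread that finds the slot unowned (0).
fn try_claim(owner: &AtomicUsize, id: usize) -> bool {
    owner
        .compare_exchange(0, id, Ordering::Relaxed, Ordering::Relaxed)
        .is_ok()
}

fn main() {
    let owner = AtomicUsize::new(0);
    assert!(try_claim(&owner, 7));  // first caller claims ownership
    assert!(!try_claim(&owner, 9)); // later callers fall back to the slow path
}
```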
@@ -1,607 +0,0 @@
// Copyright 2017 Amanieu d'Antras
//
// Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
// http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
// http://opensource.org/licenses/MIT>, at your option. This file may not be
// copied, modified, or distributed except according to those terms.

//! Per-object thread-local storage
//!
//! This library provides the `ThreadLocal` type which allows a separate copy of
//! an object to be used for each thread. This allows for per-object
//! thread-local storage, unlike the standard library's `thread_local!` macro
//! which only allows static thread-local storage.
//!
//! Per-thread objects are not destroyed when a thread exits. Instead, objects
//! are only destroyed when the `ThreadLocal` containing them is destroyed.
//!
//! You can also iterate over the thread-local values of all threads in a
//! `ThreadLocal` object using the `iter_mut` and `into_iter` methods. This can
//! only be done if you have mutable access to the `ThreadLocal` object, which
//! guarantees that you are the only thread currently accessing it.
//!
//! A `CachedThreadLocal` type is also provided which wraps a `ThreadLocal` but
//! also uses a special fast path for the first thread that writes into it. The
//! fast path has very low overhead (<1ns per access) while keeping the same
//! performance as `ThreadLocal` for other threads.
//!
//! Note that since thread IDs are recycled when a thread exits, it is possible
//! for one thread to retrieve the object of another thread. Since this can only
//! occur after a thread has exited this does not lead to any race conditions.
//!
//! # Examples
//!
//! Basic usage of `ThreadLocal`:
//!
//! ```rust
//! use thread_local::ThreadLocal;
//! let tls: ThreadLocal<u32> = ThreadLocal::new();
//! assert_eq!(tls.get(), None);
//! assert_eq!(tls.get_or(|| 5), &5);
//! assert_eq!(tls.get(), Some(&5));
//! ```
//!
//! Combining thread-local values into a single result:
//!
//! ```rust
//! use thread_local::ThreadLocal;
//! use std::sync::Arc;
//! use std::cell::Cell;
//! use std::thread;
//!
//! let tls = Arc::new(ThreadLocal::new());
//!
//! // Create a bunch of threads to do stuff
//! for _ in 0..5 {
//!     let tls2 = tls.clone();
//!     thread::spawn(move || {
//!         // Increment a counter to count some event...
//!         let cell = tls2.get_or(|| Cell::new(0));
//!         cell.set(cell.get() + 1);
//!     }).join().unwrap();
//! }
//!
//! // Once all threads are done, collect the counter values and return the
//! // sum of all thread-local counter values.
//! let tls = Arc::try_unwrap(tls).unwrap();
//! let total = tls.into_iter().fold(0, |x, y| x + y.get());
//! assert_eq!(total, 5);
//! ```

#![warn(missing_docs)]

#[macro_use]
extern crate lazy_static;

mod thread_id;
mod unreachable;
mod cached;

pub use cached::{CachedIntoIter, CachedIterMut, CachedThreadLocal};

use std::cell::UnsafeCell;
use std::fmt;
use std::marker::PhantomData;
use std::panic::UnwindSafe;
use std::sync::atomic::{AtomicPtr, AtomicUsize, Ordering};
use std::sync::Mutex;
use unreachable::{UncheckedOptionExt, UncheckedResultExt};

/// Thread-local variable wrapper
///
/// See the [module-level documentation](index.html) for more.
pub struct ThreadLocal<T: Send> {
    // Pointer to the current top-level hash table
    table: AtomicPtr<Table<T>>,

    // Lock used to guard against concurrent modifications. This is only taken
    // while writing to the table, not when reading from it. This also guards
    // the counter for the total number of values in the hash table.
    lock: Mutex<usize>,
}

struct Table<T: Send> {
    // Hash entries for the table
    entries: Box<[TableEntry<T>]>,

    // Number of bits used for the hash function
    hash_bits: usize,

    // Previous table, half the size of the current one
    prev: Option<Box<Table<T>>>,
}

struct TableEntry<T: Send> {
    // Current owner of this entry, or 0 if this is an empty entry
    owner: AtomicUsize,

    // The object associated with this entry. This is only ever accessed by the
    // owner of the entry.
    data: UnsafeCell<Option<Box<T>>>,
}

// ThreadLocal is always Sync, even if T isn't
unsafe impl<T: Send> Sync for ThreadLocal<T> {}

impl<T: Send> Default for ThreadLocal<T> {
    fn default() -> ThreadLocal<T> {
        ThreadLocal::new()
    }
}

impl<T: Send> Drop for ThreadLocal<T> {
    fn drop(&mut self) {
        unsafe {
            Box::from_raw(self.table.load(Ordering::Relaxed));
        }
    }
}

// Implementation of Clone for TableEntry, needed to make vec![] work
impl<T: Send> Clone for TableEntry<T> {
    fn clone(&self) -> TableEntry<T> {
        TableEntry {
            owner: AtomicUsize::new(0),
            data: UnsafeCell::new(None),
        }
    }
}

// Hash function for the thread id
#[cfg(target_pointer_width = "32")]
#[inline]
fn hash(id: usize, bits: usize) -> usize {
    id.wrapping_mul(0x9E3779B9) >> (32 - bits)
}
#[cfg(target_pointer_width = "64")]
#[inline]
fn hash(id: usize, bits: usize) -> usize {
    id.wrapping_mul(0x9E37_79B9_7F4A_7C15) >> (64 - bits)
}

impl<T: Send> ThreadLocal<T> {
    /// Creates a new empty `ThreadLocal`.
    pub fn new() -> ThreadLocal<T> {
        let entry = TableEntry {
            owner: AtomicUsize::new(0),
            data: UnsafeCell::new(None),
        };
        let table = Table {
            entries: vec![entry; 2].into_boxed_slice(),
            hash_bits: 1,
            prev: None,
        };
        ThreadLocal {
            table: AtomicPtr::new(Box::into_raw(Box::new(table))),
            lock: Mutex::new(0),
        }
    }

    /// Returns the element for the current thread, if it exists.
    pub fn get(&self) -> Option<&T> {
        let id = thread_id::get();
        self.get_fast(id)
    }

    /// Returns the element for the current thread, or creates it if it doesn't
    /// exist.
    pub fn get_or<F>(&self, create: F) -> &T
    where
        F: FnOnce() -> T,
    {
        unsafe {
            self.get_or_try(|| Ok::<T, ()>(create()))
                .unchecked_unwrap_ok()
        }
    }

    /// Returns the element for the current thread, or creates it if it doesn't
    /// exist. If `create` fails, that error is returned and no element is
    /// added.
    pub fn get_or_try<F, E>(&self, create: F) -> Result<&T, E>
    where
        F: FnOnce() -> Result<T, E>,
    {
        let id = thread_id::get();
        match self.get_fast(id) {
            Some(x) => Ok(x),
            None => Ok(self.insert(id, Box::new(create()?), true)),
        }
    }

    // Simple hash table lookup function
    fn lookup(id: usize, table: &Table<T>) -> Option<&UnsafeCell<Option<Box<T>>>> {
        // Because we use a Mutex to prevent concurrent modifications (but not
        // reads) of the hash table, we can avoid any memory barriers here. No
        // elements between our hash bucket and our value can have been modified
        // since we inserted our thread-local value into the table.
        for entry in table.entries.iter().cycle().skip(hash(id, table.hash_bits)) {
            let owner = entry.owner.load(Ordering::Relaxed);
            if owner == id {
                return Some(&entry.data);
            }
            if owner == 0 {
                return None;
            }
        }
        unreachable!();
    }

    // Fast path: try to find our thread in the top-level hash table
    fn get_fast(&self, id: usize) -> Option<&T> {
        let table = unsafe { &*self.table.load(Ordering::Acquire) };
        match Self::lookup(id, table) {
            Some(x) => unsafe { Some((*x.get()).as_ref().unchecked_unwrap()) },
            None => self.get_slow(id, table),
        }
    }

    // Slow path: try to find our thread in the other hash tables, and then
    // move it to the top-level hash table.
    #[cold]
    fn get_slow(&self, id: usize, table_top: &Table<T>) -> Option<&T> {
        let mut current = &table_top.prev;
        while let Some(ref table) = *current {
            if let Some(x) = Self::lookup(id, table) {
                let data = unsafe { (*x.get()).take().unchecked_unwrap() };
                return Some(self.insert(id, data, false));
            }
            current = &table.prev;
        }
        None
    }

    #[cold]
    fn insert(&self, id: usize, data: Box<T>, new: bool) -> &T {
        // Lock the Mutex to ensure only a single thread is modifying the hash
        // table at once.
        let mut count = self.lock.lock().unwrap();
        if new {
            *count += 1;
        }
        let table_raw = self.table.load(Ordering::Relaxed);
        let table = unsafe { &*table_raw };

        // If the current top-level hash table is more than 75% full, add a new
        // level with 2x the capacity. Elements will be moved up to the new top
        // level table as they are accessed.
        let table = if *count > table.entries.len() * 3 / 4 {
            let entry = TableEntry {
                owner: AtomicUsize::new(0),
                data: UnsafeCell::new(None),
            };
            let new_table = Box::into_raw(Box::new(Table {
                entries: vec![entry; table.entries.len() * 2].into_boxed_slice(),
                hash_bits: table.hash_bits + 1,
                prev: unsafe { Some(Box::from_raw(table_raw)) },
            }));
            self.table.store(new_table, Ordering::Release);
            unsafe { &*new_table }
        } else {
            table
        };

        // Insert the new element into the top-level hash table
        for entry in table.entries.iter().cycle().skip(hash(id, table.hash_bits)) {
            let owner = entry.owner.load(Ordering::Relaxed);
            if owner == 0 {
                unsafe {
                    entry.owner.store(id, Ordering::Relaxed);
                    *entry.data.get() = Some(data);
                    return (*entry.data.get()).as_ref().unchecked_unwrap();
                }
            }
            if owner == id {
                // This can happen if create() inserted a value into this
                // ThreadLocal between our calls to get_fast() and insert(). We
                // just return the existing value and drop the newly-allocated
                // Box.
                unsafe {
                    return (*entry.data.get()).as_ref().unchecked_unwrap();
                }
            }
        }
        unreachable!();
    }

    fn raw_iter(&mut self) -> RawIter<T> {
        RawIter {
            remaining: *self.lock.get_mut().unwrap(),
            index: 0,
            table: self.table.load(Ordering::Relaxed),
        }
    }

    /// Returns a mutable iterator over the local values of all threads.
    ///
    /// Since this call borrows the `ThreadLocal` mutably, this operation can
    /// be done safely---the mutable borrow statically guarantees no other
    /// threads are currently accessing their associated values.
    pub fn iter_mut(&mut self) -> IterMut<T> {
        IterMut {
            raw: self.raw_iter(),
            marker: PhantomData,
        }
    }

    /// Removes all thread-specific values from the `ThreadLocal`, effectively
    /// resetting it to its original state.
    ///
    /// Since this call borrows the `ThreadLocal` mutably, this operation can
    /// be done safely---the mutable borrow statically guarantees no other
    /// threads are currently accessing their associated values.
    pub fn clear(&mut self) {
        *self = ThreadLocal::new();
    }
}

impl<T: Send> IntoIterator for ThreadLocal<T> {
    type Item = T;
    type IntoIter = IntoIter<T>;

    fn into_iter(mut self) -> IntoIter<T> {
        IntoIter {
            raw: self.raw_iter(),
            _thread_local: self,
        }
    }
}

impl<'a, T: Send + 'a> IntoIterator for &'a mut ThreadLocal<T> {
    type Item = &'a mut T;
    type IntoIter = IterMut<'a, T>;

    fn into_iter(self) -> IterMut<'a, T> {
        self.iter_mut()
    }
}

impl<T: Send + Default> ThreadLocal<T> {
    /// Returns the element for the current thread, or creates a default one if
    /// it doesn't exist.
    pub fn get_or_default(&self) -> &T {
        self.get_or(Default::default)
    }
}

impl<T: Send + fmt::Debug> fmt::Debug for ThreadLocal<T> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "ThreadLocal {{ local_data: {:?} }}", self.get())
    }
}

impl<T: Send + UnwindSafe> UnwindSafe for ThreadLocal<T> {}

struct RawIter<T: Send> {
    remaining: usize,
    index: usize,
    table: *const Table<T>,
}

impl<T: Send> Iterator for RawIter<T> {
    type Item = *mut Option<Box<T>>;

    fn next(&mut self) -> Option<*mut Option<Box<T>>> {
        if self.remaining == 0 {
            return None;
        }

        loop {
            let entries = unsafe { &(*self.table).entries[..] };
            while self.index < entries.len() {
                let val = entries[self.index].data.get();
                self.index += 1;
                if unsafe { (*val).is_some() } {
                    self.remaining -= 1;
                    return Some(val);
                }
            }
            self.index = 0;
            self.table = unsafe { &**(*self.table).prev.as_ref().unchecked_unwrap() };
        }
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        (self.remaining, Some(self.remaining))
    }
}

/// Mutable iterator over the contents of a `ThreadLocal`.
pub struct IterMut<'a, T: Send + 'a> {
    raw: RawIter<T>,
    marker: PhantomData<&'a mut ThreadLocal<T>>,
}

impl<'a, T: Send + 'a> Iterator for IterMut<'a, T> {
    type Item = &'a mut T;

    fn next(&mut self) -> Option<&'a mut T> {
        self.raw
            .next()
            .map(|x| unsafe { &mut **(*x).as_mut().unchecked_unwrap() })
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        self.raw.size_hint()
    }
}

impl<'a, T: Send + 'a> ExactSizeIterator for IterMut<'a, T> {}

/// An iterator that moves out of a `ThreadLocal`.
pub struct IntoIter<T: Send> {
    raw: RawIter<T>,
    _thread_local: ThreadLocal<T>,
}

impl<T: Send> Iterator for IntoIter<T> {
    type Item = T;

    fn next(&mut self) -> Option<T> {
        self.raw
            .next()
            .map(|x| unsafe { *(*x).take().unchecked_unwrap() })
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        self.raw.size_hint()
    }
}

impl<T: Send> ExactSizeIterator for IntoIter<T> {}

#[cfg(test)]
mod tests {
    use super::{CachedThreadLocal, ThreadLocal};
    use std::cell::RefCell;
    use std::sync::atomic::AtomicUsize;
    use std::sync::atomic::Ordering::Relaxed;
    use std::sync::Arc;
    use std::thread;

    fn make_create() -> Arc<dyn Fn() -> usize + Send + Sync> {
        let count = AtomicUsize::new(0);
        Arc::new(move || count.fetch_add(1, Relaxed))
    }

    #[test]
    fn same_thread() {
        let create = make_create();
        let mut tls = ThreadLocal::new();
        assert_eq!(None, tls.get());
        assert_eq!("ThreadLocal { local_data: None }", format!("{:?}", &tls));
        assert_eq!(0, *tls.get_or(|| create()));
        assert_eq!(Some(&0), tls.get());
        assert_eq!(0, *tls.get_or(|| create()));
        assert_eq!(Some(&0), tls.get());
        assert_eq!(0, *tls.get_or(|| create()));
        assert_eq!(Some(&0), tls.get());
        assert_eq!("ThreadLocal { local_data: Some(0) }", format!("{:?}", &tls));
        tls.clear();
        assert_eq!(None, tls.get());
    }

    #[test]
    fn same_thread_cached() {
        let create = make_create();
        let mut tls = CachedThreadLocal::new();
        assert_eq!(None, tls.get());
        assert_eq!("ThreadLocal { local_data: None }", format!("{:?}", &tls));
        assert_eq!(0, *tls.get_or(|| create()));
        assert_eq!(Some(&0), tls.get());
        assert_eq!(0, *tls.get_or(|| create()));
        assert_eq!(Some(&0), tls.get());
        assert_eq!(0, *tls.get_or(|| create()));
        assert_eq!(Some(&0), tls.get());
        assert_eq!("ThreadLocal { local_data: Some(0) }", format!("{:?}", &tls));
        tls.clear();
        assert_eq!(None, tls.get());
    }

    #[test]
    fn different_thread() {
        let create = make_create();
        let tls = Arc::new(ThreadLocal::new());
        assert_eq!(None, tls.get());
        assert_eq!(0, *tls.get_or(|| create()));
        assert_eq!(Some(&0), tls.get());

        let tls2 = tls.clone();
        let create2 = create.clone();
        thread::spawn(move || {
            assert_eq!(None, tls2.get());
            assert_eq!(1, *tls2.get_or(|| create2()));
            assert_eq!(Some(&1), tls2.get());
        })
        .join()
        .unwrap();

        assert_eq!(Some(&0), tls.get());
        assert_eq!(0, *tls.get_or(|| create()));
    }

    #[test]
    fn different_thread_cached() {
        let create = make_create();
        let tls = Arc::new(CachedThreadLocal::new());
        assert_eq!(None, tls.get());
        assert_eq!(0, *tls.get_or(|| create()));
        assert_eq!(Some(&0), tls.get());

        let tls2 = tls.clone();
        let create2 = create.clone();
        thread::spawn(move || {
            assert_eq!(None, tls2.get());
            assert_eq!(1, *tls2.get_or(|| create2()));
            assert_eq!(Some(&1), tls2.get());
        })
        .join()
        .unwrap();

        assert_eq!(Some(&0), tls.get());
        assert_eq!(0, *tls.get_or(|| create()));
    }

    #[test]
    fn iter() {
        let tls = Arc::new(ThreadLocal::new());
        tls.get_or(|| Box::new(1));

        let tls2 = tls.clone();
        thread::spawn(move || {
            tls2.get_or(|| Box::new(2));
            let tls3 = tls2.clone();
            thread::spawn(move || {
                tls3.get_or(|| Box::new(3));
            })
            .join()
            .unwrap();
        })
        .join()
        .unwrap();

        let mut tls = Arc::try_unwrap(tls).unwrap();
        let mut v = tls.iter_mut().map(|x| **x).collect::<Vec<i32>>();
        v.sort();
        assert_eq!(vec![1, 2, 3], v);
        let mut v = tls.into_iter().map(|x| *x).collect::<Vec<i32>>();
        v.sort();
        assert_eq!(vec![1, 2, 3], v);
    }

    #[test]
    fn iter_cached() {
        let tls = Arc::new(CachedThreadLocal::new());
        tls.get_or(|| Box::new(1));

        let tls2 = tls.clone();
        thread::spawn(move || {
            tls2.get_or(|| Box::new(2));
            let tls3 = tls2.clone();
            thread::spawn(move || {
                tls3.get_or(|| Box::new(3));
            })
            .join()
            .unwrap();
        })
        .join()
        .unwrap();

        let mut tls = Arc::try_unwrap(tls).unwrap();
        let mut v = tls.iter_mut().map(|x| **x).collect::<Vec<i32>>();
        v.sort();
        assert_eq!(vec![1, 2, 3], v);
        let mut v = tls.into_iter().map(|x| *x).collect::<Vec<i32>>();
        v.sort();
        assert_eq!(vec![1, 2, 3], v);
    }

    #[test]
    fn is_sync() {
        fn foo<T: Sync>() {}
        foo::<ThreadLocal<String>>();
        foo::<ThreadLocal<RefCell<String>>>();
        foo::<CachedThreadLocal<String>>();
        foo::<CachedThreadLocal<RefCell<String>>>();
    }
}
@@ -1,61 +0,0 @@
// Copyright 2017 Amanieu d'Antras
//
// Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
// http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
// http://opensource.org/licenses/MIT>, at your option. This file may not be
// copied, modified, or distributed except according to those terms.

use std::collections::BinaryHeap;
use std::sync::Mutex;
use std::usize;

// Thread ID manager which allocates thread IDs. It attempts to aggressively
// reuse thread IDs where possible to avoid cases where a ThreadLocal grows
// indefinitely when it is used by many short-lived threads.
struct ThreadIdManager {
    limit: usize,
    free_list: BinaryHeap<usize>,
}
impl ThreadIdManager {
    fn new() -> ThreadIdManager {
        ThreadIdManager {
            limit: usize::MAX,
            free_list: BinaryHeap::new(),
        }
    }
    fn alloc(&mut self) -> usize {
        if let Some(id) = self.free_list.pop() {
            id
        } else {
            let id = self.limit;
            self.limit = self.limit.checked_sub(1).expect("Ran out of thread IDs");
            id
        }
    }
    fn free(&mut self, id: usize) {
        self.free_list.push(id);
    }
}
lazy_static! {
    static ref THREAD_ID_MANAGER: Mutex<ThreadIdManager> = Mutex::new(ThreadIdManager::new());
}

// Non-zero integer which is unique to the current thread while it is running.
// A thread ID may be reused after a thread exits.
struct ThreadId(usize);
impl ThreadId {
    fn new() -> ThreadId {
        ThreadId(THREAD_ID_MANAGER.lock().unwrap().alloc())
    }
}
impl Drop for ThreadId {
    fn drop(&mut self) {
        THREAD_ID_MANAGER.lock().unwrap().free(self.0);
    }
}
thread_local!(static THREAD_ID: ThreadId = ThreadId::new());

/// Returns a non-zero ID for the current thread
pub fn get() -> usize {
    THREAD_ID.with(|x| x.0)
}
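A minimal standalone sketch of the ID-reuse idea behind the manager above: hand out IDs from a shrinking counter, but prefer recycling freed ones so structures keyed by thread ID stay small. The `IdPool` name and the demo in `main` are invented for illustration and are not part of the removed crate:

```rust
use std::collections::BinaryHeap;

struct IdPool {
    next: usize,              // next fresh ID, counting down from usize::MAX
    free: BinaryHeap<usize>,  // freed IDs, preferred over fresh ones
}

impl IdPool {
    fn new() -> Self {
        IdPool { next: usize::MAX, free: BinaryHeap::new() }
    }
    fn alloc(&mut self) -> usize {
        if let Some(id) = self.free.pop() {
            id
        } else {
            let id = self.next;
            self.next = self.next.checked_sub(1).expect("out of IDs");
            id
        }
    }
    fn release(&mut self, id: usize) {
        self.free.push(id);
    }
}

fn main() {
    let mut pool = IdPool::new();
    let a = pool.alloc();
    pool.release(a);
    // A freed ID is reused before a fresh one is handed out.
    assert_eq!(pool.alloc(), a);
}
```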
@@ -1,74 +0,0 @@
// Copyright 2017 Amanieu d'Antras
//
// Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
// http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
// http://opensource.org/licenses/MIT>, at your option. This file may not be
// copied, modified, or distributed except according to those terms.

//! # unreachable
//! inlined from https://github.com/reem/rust-unreachable/
//!
//! An unreachable code optimization hint in stable rust, and some useful
//! extension traits for `Option` and `Result`.
//!

/// Hint to the optimizer that any code path which calls this function is
/// statically unreachable and can be removed.
///
/// Calling this function in reachable code invokes undefined behavior. Be
/// very, very sure this is what you want; often, a simple `panic!` is more
/// suitable.
#[inline]
pub unsafe fn unreachable() -> ! {
    /// The empty type for cases which can't occur.
    enum Void { }
    let x: &Void = ::std::mem::transmute(1usize);
    match *x {}
}

/// An extension trait for `Option<T>` providing unchecked unwrapping methods.
pub trait UncheckedOptionExt<T> {
    /// Get the value out of this Option without checking for None.
    unsafe fn unchecked_unwrap(self) -> T;

    /// Assert that this Option is a None to the optimizer.
    unsafe fn unchecked_unwrap_none(self);
}

/// An extension trait for `Result<T, E>` providing unchecked unwrapping methods.
pub trait UncheckedResultExt<T, E> {
    /// Get the value out of this Result without checking for Err.
    unsafe fn unchecked_unwrap_ok(self) -> T;

    /// Get the error out of this Result without checking for Ok.
    unsafe fn unchecked_unwrap_err(self) -> E;
}

impl<T> UncheckedOptionExt<T> for Option<T> {
    unsafe fn unchecked_unwrap(self) -> T {
        match self {
            Some(x) => x,
            None => unreachable()
        }
    }

    unsafe fn unchecked_unwrap_none(self) {
        if self.is_some() { unreachable() }
    }
}

impl<T, E> UncheckedResultExt<T, E> for Result<T, E> {
    unsafe fn unchecked_unwrap_ok(self) -> T {
        match self {
            Ok(x) => x,
            Err(_) => unreachable()
        }
    }

    unsafe fn unchecked_unwrap_err(self) -> E {
        match self {
            Ok(_) => unreachable(),
            Err(e) => e
        }
    }
}
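For reference, newer stable Rust exposes equivalent hints in the standard library (`std::hint::unreachable_unchecked`, and the later `Option::unwrap_unchecked` / `Result::unwrap_unchecked` methods), so a vendored helper like the one above is rarely needed today. A minimal sketch; the `first_byte` helper is invented for illustration:

```rust
// Uses the standard unchecked unwrap instead of a custom extension trait.
fn first_byte(bytes: &[u8]) -> u8 {
    if bytes.is_empty() {
        return 0;
    }
    // SAFETY: the slice was just checked to be non-empty.
    unsafe { *bytes.first().unwrap_unchecked() }
}

fn main() {
    assert_eq!(first_byte(b"abc"), b'a');
    assert_eq!(first_byte(b""), 0);
}
```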