diff --git a/doc/.document b/doc/.document index 5ef2d99651..c19a3e8909 100644 --- a/doc/.document +++ b/doc/.document @@ -6,3 +6,5 @@ NEWS syntax optparse rdoc +regexp/methods.rdoc +regexp/unicode_properties.rdoc diff --git a/doc/regexp.rdoc b/doc/regexp.rdoc index b9c89b1c86..c797c782f1 100644 --- a/doc/regexp.rdoc +++ b/doc/regexp.rdoc @@ -1,827 +1,1242 @@ -# -*- mode: rdoc; coding: utf-8; fill-column: 74; -*- +A {regular expression}[https://en.wikipedia.org/wiki/Regular_expression] +(also called a _regexp_) is a match pattern (also simply called a _pattern_). -Regular expressions (regexps) are patterns which describe the -contents of a string. They're used for testing whether a string contains a -given pattern, or extracting the portions that match. They are created -with the /pat/ and -%r{pat} literals or the Regexp.new -constructor. +A common notation for a regexp uses enclosing slash characters: -A regexp is usually delimited with forward slashes (/). For -example: + /foo/ - /hay/ =~ 'haystack' #=> 0 - /y/.match('haystack') #=> # +A regexp may be applied to a target string; +the part of the string (if any) that matches the pattern is called a _match_, +and may be said to match: -If a string contains the pattern it is said to match. A literal -string matches itself. + re = /red/ + re.match?('redirect') # => true # Match at beginning of target. + re.match?('bored') # => true # Match at end of target. + re.match?('credit') # => true # Match within target. + re.match?('foo') # => false # No match. 
-Here 'haystack' does not contain the pattern 'needle', so it doesn't match: +== \Regexp Uses - /needle/.match('haystack') #=> nil +A regexp may be used: -Here 'haystack' contains the pattern 'hay', so it matches: +- To extract substrings based on a given pattern: - /hay/.match('haystack') #=> # + re = /foo/ # => /foo/ + re.match('food') # => # + re.match('good') # => nil -Specifically, /st/ requires that the string contains the letter -_s_ followed by the letter _t_, so it matches _haystack_, also. + See sections {Method match}[rdoc-ref:regexp.rdoc@Method+match] + and {Operator =~}[rdoc-ref:regexp.rdoc@Operator+-3D~]. -Note that any Regexp matching will raise a RuntimeError if timeout is set and -exceeded. See {"Timeout"}[#label-Timeout] section in detail. +- To determine whether a string matches a given pattern: -== \Regexp Interpolation + re.match?('food') # => true + re.match?('good') # => false -A regexp may contain interpolated strings; trivially: + See section {Method match?}[rdoc-ref:regexp.rdoc@Method+match-3F]. - foo = 'bar' - /#{foo}/ # => /bar/ +- As an argument for calls to certain methods in other classes and modules; + most such methods accept an argument that may be either a string + or the (much more powerful) regexp. -== =~ and Regexp#match + See {Regexp Methods}[./Regexp/methods_rdoc.html]. -Pattern matching may be achieved by using =~ operator or Regexp#match -method. +== \Regexp Objects -=== =~ Operator +A regexp object has: -=~ is Ruby's basic pattern-matching operator. When one operand is a -regular expression and the other is a string then the regular expression is -used as a pattern to match against the string. (This operator is equivalently -defined by Regexp and String so the order of String and Regexp do not matter. -Other classes may have different implementations of =~.) If a match -is found, the operator returns index of first match in string, otherwise it -returns +nil+. +- A source; see {Sources}[rdoc-ref:regexp.rdoc@Sources]. 
- /hay/ =~ 'haystack' #=> 0 - 'haystack' =~ /hay/ #=> 0 - /a/ =~ 'haystack' #=> 1 - /u/ =~ 'haystack' #=> nil +- Several modes; see {Modes}[rdoc-ref:regexp.rdoc@Modes]. -Using =~ operator with a String and Regexp the $~ global -variable is set after a successful match. $~ holds a MatchData -object. Regexp.last_match is equivalent to $~. +- A timeout; see {Timeouts}[rdoc-ref:regexp.rdoc@Timeouts]. -=== Regexp#match Method +- An encoding; see {Encodings}[rdoc-ref:regexp.rdoc@Encodings]. -The #match method returns a MatchData object: +== Creating a \Regexp - /st/.match('haystack') #=> # +A regular expression may be created with: -== Metacharacters and Escapes +- A regexp literal using slash characters + (see {Regexp Literals}[https://docs.ruby-lang.org/en/master/syntax/literals_rdoc.html#label-Regexp+Literals]): -The following are metacharacters (, ), -[, ], {, }, ., ?, -+, *. They have a specific meaning when appearing in a -pattern. To match them literally they must be backslash-escaped. To match -a backslash literally, backslash-escape it: \\\\. + # This is a very common usage. + /foo/ # => /foo/ - /1 \+ 2 = 3\?/.match('Does 1 + 2 = 3?') #=> # - /a\\\\b/.match('a\\\\b') #=> # +- A %r regexp literal + (see {%r: Regexp Literals}[https://docs.ruby-lang.org/en/master/syntax/literals_rdoc.html#label-25r-3A+Regexp+Literals]): -Patterns behave like double-quoted strings and can contain the same -backslash escapes (the meaning of \s is different, however, -see below[#label-Character+Classes]). + # Same delimiter character at beginning and end; + # useful for avoiding escaping characters + %r/name\/value pair/ # => /name\/value pair/ + %r:name/value pair: # => /name\/value pair/ + %r|name/value pair| # => /name\/value pair/ - /\s\u{6771 4eac 90fd}/.match("Go to 東京都") - #=> # + # Certain "paired" characters can be delimiters. 
+ %r[foo] # => /foo/ + %r{foo} # => /foo/ + %r(foo) # => /foo/ + %r<foo> # => /foo/ -Arbitrary Ruby expressions can be embedded into patterns with the -#{...} construct. +- \Method Regexp.new. - place = "東京都" - /#{place}/.match("Go to 東京都") - #=> # +== \Method match -== Character Classes +Each of the methods Regexp#match, String#match, and Symbol#match +returns a MatchData object if a match was found, +nil+ otherwise; +each also sets {global variables}[rdoc-ref:regexp.rdoc@Global+Variables]: -A character class is delimited with square brackets ([, -]) and lists characters that may appear at that point in the -match. /[ab]/ means _a_ or _b_, as opposed to /ab/ which -means _a_ followed by _b_. + 'food'.match(/foo/) # => # + 'food'.match(/bar/) # => nil - /W[aeiou]rd/.match("Word") #=> # +== Operator =~ -Within a character class the hyphen (-) is a metacharacter -denoting an inclusive range of characters. [abcd] is equivalent -to [a-d]. A range can be followed by another range, so -[abcdwxyz] is equivalent to [a-dw-z]. The order in which -ranges or individual characters appear inside a character class is -irrelevant. +Each of the operators Regexp#=~, String#=~, and Symbol#=~ +returns an integer offset if a match was found, +nil+ otherwise; +each also sets {global variables}[rdoc-ref:regexp.rdoc@Global+Variables]: - /[0-9a-f]/.match('9f') #=> # - /[9f]/.match('9f') #=> # + /bar/ =~ 'foo bar' # => 4 + 'foo bar' =~ /bar/ # => 4 + /baz/ =~ 'foo bar' # => nil -If the first character of a character class is a caret (^) the -class is inverted: it matches any character _except_ those named. +== \Method match? - /[^a-eg-z]/.match('f') #=> # +Each of the methods Regexp#match?, String#match?, and Symbol#match? +returns +true+ if a match was found, +false+ otherwise; +none sets {global variables}[rdoc-ref:regexp.rdoc@Global+Variables]: -A character class may contain another character class. By itself this -isn't useful because [a-z[0-9]] describes the same set as -[a-z0-9]. 
However, character classes also support the && -operator which performs set intersection on its arguments. The two can be -combined as follows: + 'food'.match?(/foo/) # => true + 'food'.match?(/bar/) # => false - /[a-w&&[^c-g]z]/ # ([a-w] AND ([^c-g] OR z)) +== Global Variables + +Certain regexp-oriented methods assign values to global variables: + +- #match: see {Method match}[rdoc-ref:regexp.rdoc@Method+match]. +- #=~: see {Operator =~}[rdoc-ref:regexp.rdoc@Operator+-3D~]. + +The affected global variables are: + +- $~: Returns a MatchData object, or +nil+. +- $&: Returns the matched part of the string, or +nil+. +- $`: Returns the part of the string to the left of the match, or +nil+. +- $': Returns the part of the string to the right of the match, or +nil+. +- $+: Returns the last group matched, or +nil+. +- $1, $2, etc.: Returns the first, second, etc., + matched group, or +nil+. + Note that $0 is quite different; + it returns the name of the currently executing program. + +Examples: + + # Matched string, but no matched groups. + 'foo bar bar baz'.match('bar') + $~ # => # + $& # => "bar" + $` # => "foo " + $' # => " bar baz" + $+ # => nil + $1 # => nil + + # Matched groups. + /s(\w{2}).*(c)/.match('haystack') + $~ # => # + $& # => "stac" + $` # => "hay" + $' # => "k" + $+ # => "c" + $1 # => "ta" + $2 # => "c" + $3 # => nil + + # No match. + 'foo'.match('bar') + $~ # => nil + $& # => nil + $` # => nil + $' # => nil + $+ # => nil + $1 # => nil + +Note that Regexp#match?, String#match?, and Symbol#match? +do not set global variables. 
+ +== Sources + +As seen above, the simplest regexp uses a literal expression as its source: + + re = /foo/ # => /foo/ + re.match('food') # => # + re.match('good') # => nil + +A rich collection of available _subexpressions_ +gives the regexp great power and flexibility: + +- {Special characters}[rdoc-ref:regexp.rdoc@Special+Characters] +- {Source literals}[rdoc-ref:regexp.rdoc@Source+Literals] +- {Character classes}[rdoc-ref:regexp.rdoc@Character+Classes] +- {Shorthand character classes}[rdoc-ref:regexp.rdoc@Shorthand+Character+Classes] +- {Anchors}[rdoc-ref:regexp.rdoc@Anchors] +- {Alternation}[rdoc-ref:regexp.rdoc@Alternation] +- {Quantifiers}[rdoc-ref:regexp.rdoc@Quantifiers] +- {Groups and captures}[rdoc-ref:regexp.rdoc@Groups+and+Captures] +- {Unicode}[rdoc-ref:regexp.rdoc@Unicode] +- {POSIX Bracket Expressions}[rdoc-ref:regexp.rdoc@POSIX+Bracket+Expressions] +- {Comments}[rdoc-ref:regexp.rdoc@Comments] + +=== Special Characters + +\Regexp special characters, called _metacharacters_, +have special meanings in certain contexts; +depending on the context, these are sometimes metacharacters: + + . ? - + * ^ \ | $ ( ) [ ] { } + +To match a metacharacter literally, backslash-escape it: + + # Matches one or more 'o' characters. + /o+/.match('foo') # => # + # Would match 'o+'. + /o\+/.match('foo') # => nil + +To match a backslash literally, backslash-escape it: + + /\./.match('\.') # => # + /\\./.match('\.') # => # + +Method Regexp.escape returns an escaped string: + + Regexp.escape('.?-+*^\|$()[]{}') + # => "\\.\\?\\-\\+\\*\\^\\\\\\|\\$\\(\\)\\[\\]\\{\\}" + +=== Source Literals + +The source literal largely behaves like a double-quoted string; +see {String Literals}[rdoc-ref:syntax/literals.rdoc@String+Literals]. 
+ +In particular, a source literal may contain interpolated expressions: + + s = 'foo' # => "foo" + /#{s}/ # => /foo/ + /#{s.capitalize}/ # => /Foo/ + /#{2 + 2}/ # => /4/ + +There are differences between an ordinary string literal and a source literal; +see {Shorthand Character Classes}[rdoc-ref:regexp.rdoc@Shorthand+Character+Classes]. + +- \s in an ordinary string literal is equivalent to a space character; + in a source literal, it's shorthand for matching a whitespace character. +- In an ordinary string literal, these are (needlessly) escaped characters; + in a source literal, they are shorthands for various matching characters: + + \w \W \d \D \h \H \S \R + +=== Character Classes + +A character class is delimited by square brackets; +it specifies that certain characters match at a given point in the target string: + + # This character class will match any vowel. + re = /B[aeiou]rd/ + re.match('Bird') # => # + re.match('Bard') # => # + re.match('Byrd') # => nil + +A character class may contain hyphen characters to specify ranges of characters: + + # These regexps have the same effect. + /[abcdef]/.match('foo') # => # + /[a-f]/.match('foo') # => # + /[a-cd-f]/.match('foo') # => # + +When the first character of a character class is a caret (^), +the sense of the class is inverted: it matches any character _except_ those specified. + + /[^a-eg-z]/.match('f') # => # + +A character class may contain another character class. +By itself this isn't useful because [a-z[0-9]] +describes the same set as [a-z0-9]. + +However, character classes also support the && operator, +which performs set intersection on its arguments. +The two can be combined as follows: + + /[a-w&&[^c-g]z]/ # ([a-w] AND ([^c-g] OR z)) This is equivalent to: /[abh-w]/ -The following metacharacters also behave like character classes: - -* /./ - Any character except a newline. 
-* /./m - Any character (the +m+ modifier enables multiline mode) -* /\w/ - A word character ([a-zA-Z0-9_]) -* /\W/ - A non-word character ([^a-zA-Z0-9_]). - Please take a look at {Bug #4044}[https://bugs.ruby-lang.org/issues/4044] if - using /\W/ with the /i modifier. -* /\d/ - A digit character ([0-9]) -* /\D/ - A non-digit character ([^0-9]) -* /\h/ - A hexdigit character ([0-9a-fA-F]) -* /\H/ - A non-hexdigit character ([^0-9a-fA-F]) -* /\s/ - A whitespace character: /[ \t\r\n\f\v]/ -* /\S/ - A non-whitespace character: /[^ \t\r\n\f\v]/ -* /\R/ - A linebreak: \n, \v, \f, \r - \u0085 (NEXT LINE), \u2028 (LINE SEPARATOR), \u2029 (PARAGRAPH SEPARATOR) - or \r\n. - -POSIX bracket expressions are also similar to character classes. -They provide a portable alternative to the above, with the added benefit -that they encompass non-ASCII characters. For instance, /\d/ -matches only the ASCII decimal digits (0-9); whereas /[[:digit:]]/ -matches any character in the Unicode _Nd_ category. - -* /[[:alnum:]]/ - Alphabetic and numeric character -* /[[:alpha:]]/ - Alphabetic character -* /[[:blank:]]/ - Space or tab -* /[[:cntrl:]]/ - Control character -* /[[:digit:]]/ - Digit -* /[[:graph:]]/ - Non-blank character (excludes spaces, control - characters, and similar) -* /[[:lower:]]/ - Lowercase alphabetical character -* /[[:print:]]/ - Like [:graph:], but includes the space character -* /[[:punct:]]/ - Punctuation character -* /[[:space:]]/ - Whitespace character ([:blank:], newline, - carriage return, etc.) 
-* /[[:upper:]]/ - Uppercase alphabetical -* /[[:xdigit:]]/ - Digit allowed in a hexadecimal number (i.e., - 0-9a-fA-F) - -Ruby also supports the following non-POSIX character classes: - -* /[[:word:]]/ - A character in one of the following Unicode - general categories _Letter_, _Mark_, _Number_, - Connector_Punctuation -* /[[:ascii:]]/ - A character in the ASCII character set - - # U+06F2 is "EXTENDED ARABIC-INDIC DIGIT TWO" - /[[:digit:]]/.match("\u06F2") #=> # - /[[:upper:]][[:lower:]]/.match("Hello") #=> # - /[[:xdigit:]][[:xdigit:]]/.match("A6") #=> # - -== Repetition - -The constructs described so far match a single character. They can be -followed by a repetition metacharacter to specify how many times they need -to occur. Such metacharacters are called quantifiers. - -* * - Zero or more times -* + - One or more times -* ? - Zero or one times (optional) -* {n} - Exactly n times -* {n,} - n or more times -* {,m} - m or less times -* {n,m} - At least n and - at most m times - -At least one uppercase character ('H'), at least one lowercase character -('e'), two 'l' characters, then one 'o': - - "Hello".match(/[[:upper:]]+[[:lower:]]+l{2}o/) #=> # - -=== Greedy Match - -Repetition is greedy by default: as many occurrences as possible -are matched while still allowing the overall match to succeed. By -contrast, lazy matching makes the minimal amount of matches -necessary for overall success. Most greedy metacharacters can be made lazy -by following them with ?. For the {n} pattern, because -it specifies an exact number of characters to match and not a variable -number of characters, the ? metacharacter instead makes the -repeated pattern optional. - -Both patterns below match the string. The first uses a greedy quantifier so -'.+' matches ''; the second uses a lazy quantifier so '.+?' 
matches -'': - - /<.+>/.match("") #=> #"> - /<.+?>/.match("") #=> #"> - -=== Possessive Match - -A quantifier followed by + matches possessively: once it -has matched it does not backtrack. They behave like greedy quantifiers, -but having matched they refuse to "give up" their match even if this -jeopardises the overall match. - - /<.*><.+>/.match("") #=> #"> - /<.*+><.+>/.match("") #=> nil - /<.*><.++>/.match("") #=> nil - -== Capturing - -Parentheses can be used for capturing. The text enclosed by the -nth group of parentheses can be subsequently referred to -with n. Within a pattern use the backreference -\n (e.g. \1); outside of the pattern use -MatchData[n] (e.g. MatchData[1]). - -In this example, 'at' is captured by the first group of -parentheses, then referred to later with \1: - - /[csh](..) [csh]\1 in/.match("The cat sat in the hat") - #=> # +=== Shorthand Character Classes -Regexp#match returns a MatchData object which makes the captured text -available with its #[] method: +Each of the following metacharacters serves as a shorthand +for a character class: - /[csh](..) [csh]\1 in/.match("The cat sat in the hat")[1] #=> 'at' +- /./: Matches any character except a newline: -While Ruby supports an arbitrary number of numbered captured groups, -only groups 1-9 are supported using the \n backreference -syntax. + /./.match('foo') # => # + /./.match("\n") # => nil -Ruby also supports \0 as a special backreference, which -references the entire matched string. This is also available at -MatchData[0]. Note that the \0 backreference cannot -be used inside the regexp, as backreferences can only be used after the -end of the capture group, and the \0 backreference uses the -implicit capture group of the entire match. 
However, you can use -this backreference when doing substitution: +- /./m: Matches any character, including a newline; + see {Multiline Mode}[rdoc-ref:regexp.rdoc@Multiline+Mode]: - "The cat sat in the hat".gsub(/[csh]at/, '\0s') - # => "The cats sats in the hats" + /./m.match("\n") # => # -=== Named Captures +- /\w/: Matches a word character: equivalent to [a-zA-Z0-9_]: -Capture groups can be referred to by name when defined with the -(?<name>) or (?'name') -constructs. + /\w/.match(' foo') # => # + /\w/.match(' _') # => # + /\w/.match(' ') # => nil - /\$(?\d+)\.(?\d+)/.match("$3.67") - #=> # - /\$(?\d+)\.(?\d+)/.match("$3.67")[:dollars] #=> "3" +- /\W/: Matches a non-word character: equivalent to [^a-zA-Z0-9_]: -Named groups can be backreferenced with \k<name>, -where _name_ is the group name. + /\W/.match(' ') # => # + /\W/.match('_') # => nil - /(?[aeiou]).\k.\k/.match('ototomy') - #=> # +- /\d/: Matches a digit character: equivalent to [0-9]: -*Note*: A regexp can't use named backreferences and numbered -backreferences simultaneously. Also, if a named capture is used in a -regexp, then parentheses used for grouping which would otherwise result -in a unnamed capture are treated as non-capturing. + /\d/.match('THX1138') # => # + /\d/.match('foo') # => nil - /(\w)(\w)/.match("ab").captures # => ["a", "b"] - /(\w)(\w)/.match("ab").named_captures # => {} +- /\D/: Matches a non-digit character: equivalent to [^0-9]: - /(?\w)(\w)/.match("ab").captures # => ["a"] - /(?\w)(\w)/.match("ab").named_captures # => {"c"=>"a"} + /\D/.match('123Jump!') # => # + /\D/.match('123') # => nil -When named capture groups are used with a literal regexp on the left-hand -side of an expression and the =~ operator, the captured text is -also assigned to local variables with corresponding names. 
+- /\h/: Matches a hexdigit character: equivalent to [0-9a-fA-F]: - /\$(?\d+)\.(?\d+)/ =~ "$3.67" #=> 0 - dollars #=> "3" + /\h/.match('xyz fedcba9876543210') # => # + /\h/.match('xyz') # => nil -== Grouping +- /\H/: Matches a non-hexdigit character: equivalent to [^0-9a-fA-F]: -Parentheses also group the terms they enclose, allowing them to be -quantified as one atomic whole. + /\H/.match('fedcba9876543210xyz') # => # + /\H/.match('fedcba9876543210') # => nil -The pattern below matches a vowel followed by 2 word characters: +- /\s/: Matches a whitespace character: equivalent to /[ \t\r\n\f\v]/: - /[aeiou]\w{2}/.match("Caenorhabditis elegans") #=> # + /\s/.match('foo bar') # => # + /\s/.match('foo') # => nil -Whereas the following pattern matches a vowel followed by a word character, -twice, i.e. [aeiou]\w[aeiou]\w: 'enor'. +- /\S/: Matches a non-whitespace character: equivalent to /[^ \t\r\n\f\v]/: - /([aeiou]\w){2}/.match("Caenorhabditis elegans") - #=> # + /\S/.match(" \t\r\n\f\v foo") # => # + /\S/.match(" \t\r\n\f\v") # => nil -The (?:...) construct provides grouping without -capturing. That is, it combines the terms it contains into an atomic whole -without creating a backreference. This benefits performance at the slight -expense of readability. +- /\R/: Matches a linebreak, platform-independently: -The first group of parentheses captures 'n' and the second 'ti'. 
The second -group is referred to later with the backreference \2: + /\R/.match("\r") # => # # Carriage return (CR) + /\R/.match("\n") # => # # Newline (LF) + /\R/.match("\f") # => # # Formfeed (FF) + /\R/.match("\v") # => # # Vertical tab (VT) + /\R/.match("\r\n") # => # # CRLF + /\R/.match("\u0085") # => # # Next line (NEL) + /\R/.match("\u2028") # => # # Line separator (LSEP) + /\R/.match("\u2029") # => # # Paragraph separator (PSEP) - /I(n)ves(ti)ga\2ons/.match("Investigations") - #=> # +=== Anchors -The first group of parentheses is now made non-capturing with '?:', so it -still matches 'n', but doesn't create the backreference. Thus, the -backreference \1 now refers to 'ti'. +An anchor is a metasequence that matches a zero-width position between +characters in the target string. - /I(?:n)ves(ti)ga\1ons/.match("Investigations") - #=> # +For a subexpression with no anchor, +matching may begin anywhere in the target string: -=== Atomic Grouping + /real/.match('surrealist') # => # -Grouping can be made atomic with -(?>pat). This causes the subexpression pat -to be matched independently of the rest of the expression such that what -it matches becomes fixed for the remainder of the match, unless the entire -subexpression must be abandoned and subsequently revisited. In this -way pat is treated as a non-divisible whole. Atomic grouping is -typically used to optimise patterns so as to prevent the regular -expression engine from backtracking needlessly. - -The " in the pattern below matches the first character of the string, -then .* matches Quote". 
This causes the overall match to fail, -so the text matched by .* is backtracked by one position, which -leaves the final character of the string available to match " - - /".*"/.match('"Quote"') #=> # - -If .* is grouped atomically, it refuses to backtrack Quote", -even though this means that the overall match fails - - /"(?>.*)"/.match('"Quote"') #=> nil - -== Subexpression Calls - -The \g<name> syntax matches the previous -subexpression named _name_, which can be a group name or number, again. -This differs from backreferences in that it re-executes the group rather -than simply trying to re-match the same text. - -This pattern matches a ( character and assigns it to the paren -group, tries to call that the paren sub-expression again but fails, -then matches a literal ): - - /\A(?\(\g*\))*\z/ =~ '()' - - - /\A(?\(\g*\))*\z/ =~ '(())' #=> 0 - # ^1 - # ^2 - # ^3 - # ^4 - # ^5 - # ^6 - # ^7 - # ^8 - # ^9 - # ^10 - -1. Matches at the beginning of the string, i.e. before the first - character. -2. Enters a named capture group called paren -3. Matches a literal (, the first character in the string -4. Calls the paren group again, i.e. recurses back to the - second step -5. Re-enters the paren group -6. Matches a literal (, the second character in the - string -7. Try to call paren a third time, but fail because - doing so would prevent an overall successful match -8. Match a literal ), the third character in the string. - Marks the end of the second recursive call -9. Match a literal ), the fourth character in the string -10. Match the end of the string - -== Alternation - -The vertical bar metacharacter (|) combines several expressions into -a single one that matches any of the expressions. Each expression is an -alternative. 
- - /\w(and|or)\w/.match("Feliformia") #=> # - /\w(and|or)\w/.match("furandi") #=> # - /\w(and|or)\w/.match("dissemblance") #=> nil - -== Condition - -The (?(cond)yes|no) -syntax matches _yes_ part if _cond_ is captured, otherwise matches _no_ part. -In the case _no_ part is empty, also | can be omitted. - -The _cond_ may be a backreference number or a captured name. A backreference -number is an absolute position, but can not be a relative position. - -== Character Properties - -The \p{} construct matches characters with the named property, -much like POSIX bracket classes. - -* /\p{Alnum}/ - Alphabetic and numeric character -* /\p{Alpha}/ - Alphabetic character -* /\p{Blank}/ - Space or tab -* /\p{Cntrl}/ - Control character -* /\p{Digit}/ - Digit -* /\p{Emoji}/ - Unicode emoji -* /\p{Graph}/ - Non-blank character (excludes spaces, control - characters, and similar) -* /\p{Lower}/ - Lowercase alphabetical character -* /\p{Print}/ - Like \p{Graph}, but includes the space character -* /\p{Punct}/ - Punctuation character -* /\p{Space}/ - Whitespace character ([:blank:], newline, - carriage return, etc.) 
-* /\p{Upper}/ - Uppercase alphabetical -* /\p{XDigit}/ - Digit allowed in a hexadecimal number (i.e., 0-9a-fA-F) -* /\p{Word}/ - A member of one of the following Unicode general - category Letter, Mark, Number, - Connector\_Punctuation -* /\p{ASCII}/ - A character in the ASCII character set -* /\p{Any}/ - Any Unicode character (including unassigned - characters) -* /\p{Assigned}/ - An assigned character - -A Unicode character's General Category value can also be matched -with \p{Ab} where Ab is the category's -abbreviation as described below: - -* /\p{L}/ - 'Letter' -* /\p{Ll}/ - 'Letter: Lowercase' -* /\p{Lm}/ - 'Letter: Mark' -* /\p{Lo}/ - 'Letter: Other' -* /\p{Lt}/ - 'Letter: Titlecase' -* /\p{Lu}/ - 'Letter: Uppercase -* /\p{Lo}/ - 'Letter: Other' -* /\p{M}/ - 'Mark' -* /\p{Mn}/ - 'Mark: Nonspacing' -* /\p{Mc}/ - 'Mark: Spacing Combining' -* /\p{Me}/ - 'Mark: Enclosing' -* /\p{N}/ - 'Number' -* /\p{Nd}/ - 'Number: Decimal Digit' -* /\p{Nl}/ - 'Number: Letter' -* /\p{No}/ - 'Number: Other' -* /\p{P}/ - 'Punctuation' -* /\p{Pc}/ - 'Punctuation: Connector' -* /\p{Pd}/ - 'Punctuation: Dash' -* /\p{Ps}/ - 'Punctuation: Open' -* /\p{Pe}/ - 'Punctuation: Close' -* /\p{Pi}/ - 'Punctuation: Initial Quote' -* /\p{Pf}/ - 'Punctuation: Final Quote' -* /\p{Po}/ - 'Punctuation: Other' -* /\p{S}/ - 'Symbol' -* /\p{Sm}/ - 'Symbol: Math' -* /\p{Sc}/ - 'Symbol: Currency' -* /\p{Sc}/ - 'Symbol: Currency' -* /\p{Sk}/ - 'Symbol: Modifier' -* /\p{So}/ - 'Symbol: Other' -* /\p{Z}/ - 'Separator' -* /\p{Zs}/ - 'Separator: Space' -* /\p{Zl}/ - 'Separator: Line' -* /\p{Zp}/ - 'Separator: Paragraph' -* /\p{C}/ - 'Other' -* /\p{Cc}/ - 'Other: Control' -* /\p{Cf}/ - 'Other: Format' -* /\p{Cn}/ - 'Other: Not Assigned' -* /\p{Co}/ - 'Other: Private Use' -* /\p{Cs}/ - 'Other: Surrogate' - -Lastly, \p{} matches a character's Unicode script. 
The -following scripts are supported: Arabic, Armenian, -Balinese, Bengali, Bopomofo, Braille, -Buginese, Buhid, Canadian_Aboriginal, Carian, -Cham, Cherokee, Common, Coptic, -Cuneiform, Cypriot, Cyrillic, Deseret, -Devanagari, Ethiopic, Georgian, Glagolitic, -Gothic, Greek, Gujarati, Gurmukhi, Han, -Hangul, Hanunoo, Hebrew, Hiragana, -Inherited, Kannada, Katakana, Kayah_Li, -Kharoshthi, Khmer, Lao, Latin, Lepcha, -Limbu, Linear_B, Lycian, Lydian, -Malayalam, Mongolian, Myanmar, New_Tai_Lue, -Nko, Ogham, Ol_Chiki, Old_Italic, -Old_Persian, Oriya, Osmanya, Phags_Pa, -Phoenician, Rejang, Runic, Saurashtra, -Shavian, Sinhala, Sundanese, Syloti_Nagri, -Syriac, Tagalog, Tagbanwa, Tai_Le, -Tamil, Telugu, Thaana, Thai, Tibetan, -Tifinagh, Ugaritic, Vai, and Yi. - -Unicode codepoint U+06E9 is named "ARABIC PLACE OF SAJDAH" and belongs to the -Arabic script: - - /\p{Arabic}/.match("\u06E9") #=> # - -All character properties can be inverted by prefixing their name with a -caret (^). - -Letter 'A' is not in the Unicode Ll (Letter; Lowercase) category, so this -match succeeds: - - /\p{^Ll}/.match("A") #=> # - -== Anchors - -Anchors are metacharacter that match the zero-width positions between -characters, anchoring the match to a specific position. - -* ^ - Matches beginning of line -* $ - Matches end of line -* \A - Matches beginning of string. -* \Z - Matches end of string. If string ends with a newline, - it matches just before newline -* \z - Matches end of string -* \G - Matches first matching position: - - In methods like String#gsub and String#scan, it changes on each iteration. +For a subexpression with an anchor, +matching must begin at the matched anchor. 
+ +==== Boundary Anchors + +Each of these anchors matches a boundary: + +- ^: Matches the beginning of a line: + + /^bar/.match("foo\nbar") # => # + /^ar/.match("foo\nbar") # => nil + +- $: Matches the end of a line: + + /bar$/.match("foo\nbar") # => # + /ba$/.match("foo\nbar") # => nil + +- \A: Matches the beginning of the string: + + /\Afoo/.match('foo bar') # => # + /\Afoo/.match(' foo bar') # => nil + +- \Z: Matches the end of the string; + if string ends with a single newline, + it matches just before the ending newline: + + /foo\Z/.match('bar foo') # => # + /foo\Z/.match('foo bar') # => nil + /foo\Z/.match("bar foo\n") # => # + /foo\Z/.match("bar foo\n\n") # => nil + +- \z: Matches the end of the string: + + /foo\z/.match('bar foo') # => # + /foo\z/.match('foo bar') # => nil + /foo\z/.match("bar foo\n") # => nil + +- \b: Matches word boundary when not inside brackets; + matches backspace ("0x08") when inside brackets: + + /foo\b/.match('foo bar') # => # + /foo\b/.match('foobar') # => nil + +- \B: Matches non-word boundary: + + /foo\B/.match('foobar') # => # + /foo\B/.match('foo bar') # => nil + +- \G: Matches first matching position: + + In methods like String#gsub and String#scan, it changes on each iteration. It initially matches the beginning of subject, and in each following iteration it matches where the last match finished. - " a b c".gsub(/ /, '_') #=> "____a_b_c" - " a b c".gsub(/\G /, '_') #=> "____a b c" + " a b c".gsub(/ /, '_') # => "____a_b_c" + " a b c".gsub(/\G /, '_') # => "____a b c" - In methods like Regexp#match and String#match that take an (optional) offset, it matches where the search begins. + In methods like Regexp#match and String#match + that take an optional offset, it matches where the search begins. 
- "hello, world".match(/,/, 3) #=> # - "hello, world".match(/\G,/, 3) #=> nil + "hello, world".match(/,/, 3) # => # + "hello, world".match(/\G,/, 3) # => nil -* \b - Matches word boundaries when outside brackets; - backspace (0x08) when inside brackets -* \B - Matches non-word boundaries -* (?=pat) - Positive lookahead assertion: - ensures that the following characters match pat, but doesn't - include those characters in the matched text -* (?!pat) - Negative lookahead assertion: - ensures that the following characters do not match pat, but - doesn't include those characters in the matched text -* (?<=pat) - Positive lookbehind - assertion: ensures that the preceding characters match pat, but - doesn't include those characters in the matched text -* (?pat) - Negative lookbehind - assertion: ensures that the preceding characters do not match - pat, but doesn't include those characters in the matched text +==== Lookaround Anchors -* \K - Match reset: the matched content preceding - \K in the regexp is excluded from the result. For example, - the following two regexps are almost equivalent: +Lookahead anchors: - /ab\Kc/ =~ "abc" #=> 0 - /(?<=ab)c/ =~ "abc" #=> 2 +- (?=_pat_): Positive lookahead assertion: + ensures that the following characters match _pat_, + but doesn't include those characters in the matched substring. - These match same string and $& equals "c", while the - matched position is different. +- (?!_pat_): Negative lookahead assertion: + ensures that the following characters do not match _pat_, + but doesn't include those characters in the matched substring. + +Lookbehind anchors: + +- (?<=_pat_): Positive lookbehind assertion: + ensures that the preceding characters match _pat_, but + doesn't include those characters in the matched substring. + +- (?<!_pat_): Negative lookbehind assertion: + ensures that the preceding characters do not match + _pat_, but doesn't include those characters in the matched substring. 
+ +The pattern below uses positive lookahead and positive lookbehind to match +text appearing in <b>...</b> tags +without including the tags in the match: + + /(?<=<b>)\w+(?=<\/b>)/.match("Fortune favors the <b>bold</b>.") + # => #<MatchData "bold"> + +==== Match-Reset Anchor + +- \K: Match reset: + the matched content preceding \K in the regexp is excluded from the result. + For example, the following two regexps are almost equivalent: + + /ab\Kc/.match('abc') # => # + /(?<=ab)c/.match('abc') # => # + + These match the same string and $& equals 'c', + while the matched position is different. As are the following two regexps: - /(a)\K(b)\Kc/ - /(?<=(?<=(a))(b))c/ + /(a)\K(b)\Kc/ + /(?<=(?<=(a))(b))c/ -If a pattern isn't anchored it can begin at any point in the string: +=== Alternation - /real/.match("surrealist") #=> # +The vertical bar metacharacter (|) may be used within parentheses +to express alternation: +two or more subexpressions any of which may match the target string. -Anchoring the pattern to the beginning of the string forces the match to start -there. 'real' doesn't occur at the beginning of the string, so now the match -fails: +Two alternatives: - /\Areal/.match("surrealist") #=> nil + re = /(a|b)/ + re.match('foo') # => nil + re.match('bar') # => # -The match below fails because although 'Demand' contains 'and', the pattern -does not occur at a word boundary.
+Four alternatives: - /\band/.match("Demand") + re = /(a|b|c|d)/ + re.match('shazam') # => # + re.match('cold') # => # -Whereas in the following example 'and' has been anchored to a non-word -boundary so instead of matching the first 'and' it matches from the fourth -letter of 'demand' instead: +Each alternative is a subexpression, and may be composed of other subexpressions: - /\Band.+/.match("Supply and demand curve") #=> # + re = /([a-c]|[x-z])/ + re.match('bar') # => # + re.match('ooz') # => # -The pattern below uses positive lookahead and positive lookbehind to match -text appearing in tags without including the tags in the match: +\Method Regexp.union provides a convenient way to construct +a regexp with alternatives. - /(?<=)\w+(?=<\/b>)/.match("Fortune favours the bold") - #=> # +=== Quantifiers -== Absent operator +A simple regexp matches one character: -Absent operator (?~pat) matches string which does -not match pat. + /\w/.match('Hello') # => # -For example, a regexp to match C comment, which is enclosed by /* -and */ and does not include */, using absent operator: +An added _quantifier_ specifies how many matches are required or allowed: - %r[/\*(?~\*/)\*/] =~ "/* comment */ not-comment */" - #=> # +- * - Matches zero or more times: -This is often shorter and clearer than without absent operator: + /\w*/.match('') + # => # + /\w*/.match('x') + # => # + /\w*/.match('xyz') + # => # - %r[/\*[^\*]*\*+(?:[^\*/][^\*]*\*+)*/] - %r[/\*(?:(?!\*/).)*\*/] - %r[/\*(?>.*?\*/)] +- + - Matches one or more times: -== Options + /\w+/.match('') # => nil + /\w+/.match('x') # => # + /\w+/.match('xyz') # => # -The end delimiter for a regexp can be followed by one or more single-letter -options which control how the pattern can match. +- ? - Matches zero or one times: -* /pat/i - Ignore case -* /pat/m - Treat a newline as a character matched by . 
-* /pat/x - Ignore whitespace and comments in the pattern -* /pat/o - Perform #{} interpolation only once + /\w?/.match('') # => # + /\w?/.match('x') # => # + /\w?/.match('xyz') # => # -i, m, and x can also be applied on the -subexpression level with the -(?on-off) construct, which -enables options on, and disables options off for the -expression enclosed by the parentheses: +- {_n_} - Matches exactly _n_ times: - /a(?i:b)c/.match('aBc') #=> # - /a(?-i:b)c/i.match('ABC') #=> nil + /\w{2}/.match('') # => nil + /\w{2}/.match('x') # => nil + /\w{2}/.match('xyz') # => # -Additionally, these options can also be toggled for the remainder of the -pattern: +- {_min_,} - Matches _min_ or more times: - /a(?i)bc/.match('abC') #=> # + /\w{2,}/.match('') # => nil + /\w{2,}/.match('x') # => nil + /\w{2,}/.match('xy') # => # + /\w{2,}/.match('xyz') # => # -Options may also be used with Regexp.new: +- {,_max_} - Matches _max_ or fewer times: - Regexp.new("abc", Regexp::IGNORECASE) #=> /abc/i - Regexp.new("abc", Regexp::MULTILINE) #=> /abc/m - Regexp.new("abc # Comment", Regexp::EXTENDED) #=> /abc # Comment/x - Regexp.new("abc", Regexp::IGNORECASE | Regexp::MULTILINE) #=> /abc/mi + /\w{,2}/.match('') # => # + /\w{,2}/.match('x') # => # + /\w{,2}/.match('xyz') # => # - Regexp.new("abc", "i") #=> /abc/i - Regexp.new("abc", "m") #=> /abc/m - Regexp.new("abc # Comment", "x") #=> /abc # Comment/x - Regexp.new("abc", "im") #=> /abc/mi +- {_min_,_max_} - + Matches at least _min_ times and at most _max_ times: -== Free-Spacing Mode and Comments + /\w{1,2}/.match('') # => nil + /\w{1,2}/.match('x') # => # + /\w{1,2}/.match('xyz') # => # -As mentioned above, the x option enables free-spacing -mode. Literal white space inside the pattern is ignored, and the -octothorpe (#) character introduces a comment until the end of -the line. This allows the components of the pattern to be organized in a -potentially more readable fashion. 
+==== Greedy, Lazy, or Possessive Matching -A contrived pattern to match a number with optional decimal places: +Quantifier matching may be greedy, lazy, or possessive: - float_pat = /\A - [[:digit:]]+ # 1 or more digits before the decimal point - (\. # Decimal point - [[:digit:]]+ # 1 or more digits after the decimal point - )? # The decimal point and following digits are optional - \Z/x - float_pat.match('3.14') #=> # +- In _greedy_ matching, as many occurrences as possible are matched + while still allowing the overall match to succeed. + Greedy quantifiers: *, +, ?, + {min, max} and its variants. +- In _lazy_ matching, the minimum number of occurrences are matched. + Lazy quantifiers: *?, +?, ??, + {min, max}? and its variants. +- In _possessive_ matching, once a match is found, there is no backtracking; + that match is retained, even if it jeopardises the overall match. + Possessive quantifiers: *+, ++, ?+. + Note that {min, max} and its variants do _not_ support possessive matching. -There are a number of strategies for matching whitespace: +More: -* Use a pattern such as \s or \p{Space}. -* Use escaped whitespace such as \ , i.e. a space preceded by a backslash. -* Use a character class such as [ ]. +- About greedy and lazy matching, see + {Choosing Minimal or Maximal Repetition}[https://doc.lagout.org/programmation/Regular%20Expressions/Regular%20Expressions%20Cookbook_%20Detailed%20Solutions%20in%20Eight%20Programming%20Languages%20%282nd%20ed.%29%20%5BGoyvaerts%20%26%20Levithan%202012-09-06%5D.pdf#tutorial-backtrack]. +- About possessive matching, see + {Eliminate Needless Backtracking}[https://doc.lagout.org/programmation/Regular%20Expressions/Regular%20Expressions%20Cookbook_%20Detailed%20Solutions%20in%20Eight%20Programming%20Languages%20%282nd%20ed.%29%20%5BGoyvaerts%20%26%20Levithan%202012-09-06%5D.pdf#tutorial-backtrack]. 
-Comments can be included in a non-x pattern with the -(?#comment) construct, where comment is -arbitrary text ignored by the regexp engine. +=== Groups and Captures -Comments in regexp literals cannot include unescaped terminator -characters. +A simple regexp has (at most) one match: -== Encoding + re = /\d\d\d\d-\d\d-\d\d/ + re.match('1943-02-04') # => # + re.match('1943-02-04').size # => 1 + re.match('foo') # => nil -Regular expressions are assumed to use the source encoding. This can be -overridden with one of the following modifiers. +Adding one or more pairs of parentheses, (_subexpression_), +defines _groups_, which may result in multiple matched substrings, +called _captures_: -* /pat/u - UTF-8 -* /pat/e - EUC-JP -* /pat/s - Windows-31J -* /pat/n - ASCII-8BIT + re = /(\d\d\d\d)-(\d\d)-(\d\d)/ + re.match('1943-02-04') # => # + re.match('1943-02-04').size # => 4 -A regexp can be matched against a string when they either share an -encoding, or the regexp's encoding is _US-ASCII_ and the string's encoding -is ASCII-compatible. +The first capture is the entire matched string; +the other captures are the matched substrings from the groups. + +A group may have a +{quantifier}[rdoc-ref:regexp.rdoc@Quantifiers]: + + re = /July 4(th)?/ + re.match('July 4') # => # + re.match('July 4th') # => # + + re = /(foo)*/ + re.match('') # => # + re.match('foo') # => # + re.match('foofoo') # => # + + re = /(foo)+/ + re.match('') # => nil + re.match('foo') # => # + re.match('foofoo') # => # + +The returned \MatchData object gives access to the matched substrings: + + re = /(\d\d\d\d)-(\d\d)-(\d\d)/ + md = re.match('1943-02-04') + # => # + md[0] # => "1943-02-04" + md[1] # => "1943" + md[2] # => "02" + md[3] # => "04" + +==== Non-Capturing Groups + +A group may be made non-capturing; +it is still a group (and, for example, can have a quantifier), +but its matching substring is not included among the captures. 
+ +A non-capturing group begins with ?: (inside the parentheses): + + # Don't capture the year. + re = /(?:\d\d\d\d)-(\d\d)-(\d\d)/ + md = re.match('1943-02-04') # => # + +==== Backreferences + +A group match may also be referenced within the regexp itself; +such a reference is called a _backreference_: + + /[csh](..) [csh]\1 in/.match('The cat sat in the hat') + # => # + +This table shows how each subexpression in the regexp above +matches a substring in the target string: + + | Subexpression in Regexp | Matching Substring in Target String | + |---------------------------|-------------------------------------| + | First '[csh]' | Character 'c' | + | '(..)' | First substring 'at' | + | First space ' ' | First space character ' ' | + | Second '[csh]' | Character 's' | + | '\1' (backreference 'at') | Second substring 'at' | + | ' in' | Substring ' in' | + +A regexp may contain any number of groups: + +- For a large number of groups: + + - The ordinary \\_n_ notation applies only for _n_ in range (1..9). + - The MatchData[_n_] notation applies for any non-negative _n_. + +- \0 is a special backreference, referring to the entire matched string; + it may not be used within the regexp itself, + but may be used outside it (for example, in a substitution method call): + + 'The cat sat in the hat'.gsub(/[csh]at/, '\0s') + # => "The cats sats in the hats" + +==== Named Captures + +As seen above, a capture can be referred to by its number. +A capture can also have a name, +prefixed as ?<_name_> or ?'_name_', +and the name (symbolized) may be used as an index in MatchData[]: + + md = /\$(?<dollars>\d+)\.(?'cents'\d+)/.match("$3.67") + # => #<MatchData "$3.67" dollars:"3" cents:"67"> + md[:dollars] # => "3" + md[:cents] # => "67" + # The capture numbers are still valid.
+ md[2] # => "67" + +When a regexp contains a named capture, there are no unnamed captures: + + /\$(?<dollars>\d+)\.(\d+)/.match("$3.67") + # => #<MatchData "$3.67" dollars:"3"> + +A named group may be backreferenced as \k<_name_>: + + /(?<vowel>[aeiou]).\k<vowel>.\k<vowel>/.match('ototomy') + # => #<MatchData "ototo" vowel:"o"> + +When (and only when) a regexp contains named capture groups +and appears before the =~ operator, +the captured substrings are assigned to local variables with corresponding names: + + /\$(?<dollars>\d+)\.(?<cents>\d+)/ =~ '$3.67' + dollars # => "3" + cents # => "67" + +\Method MatchData#named_captures returns a hash of the capture names and substrings; +method Regexp#names returns an array of the capture names. + +==== Atomic Grouping + +A group may be made _atomic_ with (?>_subexpression_). + +This causes the subexpression to be matched +independently of the rest of the expression, +so that the matched substring becomes fixed for the remainder of the match, +unless the entire subexpression must be abandoned and subsequently revisited. + +In this way _subexpression_ is treated as a non-divisible whole. +Atomic grouping is typically used to optimise patterns +to prevent needless backtracking. + +Example (without atomic grouping): + + /".*"/.match('"Quote"') # => # + +Analysis: + +1. The leading subexpression " in the pattern matches the first character + " in the target string. +2. The next subexpression .* matches the next substring Quote" + (including the trailing double-quote). +3. Now there is nothing left in the target string to match + the trailing subexpression " in the pattern; + this would cause the overall match to fail. +4. The matched substring is backtracked by one position: Quote. +5. The final subexpression " now matches the final substring ", + and the overall match succeeds.
+ +If subexpression .* is grouped atomically, +the backtracking is disabled, and the overall match fails: + + /"(?>.*)"/.match('"Quote"') # => nil + +Atomic grouping can affect performance; +see {Atomic Group}[https://www.regular-expressions.info/atomic.html]. + +==== Subexpression Calls + +As seen above, a backreference number (\\_n_) or name (\k<_name_>) +gives access to a captured _substring_; +the corresponding regexp _subexpression_ may also be accessed, +via the number (\\g<_n_>) or name (\g<_name_>): + + /\A(?<paren>\(\g<paren>*\))*\z/.match('(())') + # ^1 + # ^2 + # ^3 + # ^4 + # ^5 + # ^6 + # ^7 + # ^8 + # ^9 + # ^10 + +The pattern: + +1. Matches at the beginning of the string, i.e. before the first character. +2. Enters a named group +paren+. +3. Matches the first character in the string, '('. +4. Calls the +paren+ group again, i.e. recurses back to the second step. +5. Re-enters the +paren+ group. +6. Matches the second character in the string, '('. +7. Attempts to call +paren+ a third time, + but fails because doing so would prevent an overall successful match. +8. Matches the third character in the string, ')'; + marks the end of the second recursive call. +9. Matches the fourth character in the string, ')'. +10. Matches the end of the string. + +See {Subexpression calls}[https://learnbyexample.github.io/Ruby_Regexp/groupings-and-backreferences.html?highlight=subexpression#subexpression-calls]. + +==== Conditionals + +The conditional construct takes the form (?(_cond_)_yes_|_no_), where: + +- _cond_ may be a capture number or name. +- The match to be applied is _yes_ if _cond_ is captured; + otherwise the match to be applied is _no_. +- If not needed, |_no_ may be omitted.
+ +Examples: + + re = /\A(foo)?(?(1)(T)|(F))\z/ + re.match('fooT') # => # + re.match('F') # => # + re.match('fooF') # => nil + re.match('T') # => nil + + re = /\A(?<xyzzy>foo)?(?(<xyzzy>)(T)|(F))\z/ + re.match('fooT') # => # + re.match('F') # => # + re.match('fooF') # => nil + re.match('T') # => nil + + +==== Absence Operator + +The absence operator is a special group that matches anything which does _not_ match the contained subexpressions. + + /(?~real)/.match('surrealist') # => # + /(?~real)ist/.match('surrealist') # => # + /sur(?~real)ist/.match('surrealist') # => nil + +=== Unicode + +==== Unicode Properties + +The /\p{_property_name_}/ construct (with lowercase +p+) +matches characters using a Unicode property name, +much like a character class; +property +Alpha+ specifies alphabetic characters: + + /\p{Alpha}/.match('a') # => # + /\p{Alpha}/.match('1') # => nil + +A property can be inverted +by prefixing the name with a caret character (^): + + /\p{^Alpha}/.match('1') # => # + /\p{^Alpha}/.match('a') # => nil + +Or by using \P (uppercase +P+): + + /\P{Alpha}/.match('1') # => # + /\P{Alpha}/.match('a') # => nil + +See {Unicode Properties}[./Regexp/unicode_properties_rdoc.html] +for regexps based on the numerous properties. + +Some commonly-used properties correspond to POSIX bracket expressions: + +- /\p{Alnum}/: Alphabetic and numeric character +- /\p{Alpha}/: Alphabetic character +- /\p{Blank}/: Space or tab +- /\p{Cntrl}/: Control character +- /\p{Digit}/: Digit + (ASCII digits 0-9 and other Unicode decimal-digit characters) +- /\p{Lower}/: Lowercase alphabetical character +- /\p{Print}/: Like \p{Graph}, but includes the space character +- /\p{Punct}/: Punctuation character +- /\p{Space}/: Whitespace character ([:blank:], newline, + carriage return, etc.) +- /\p{Upper}/: Uppercase alphabetical character +- /\p{XDigit}/: Digit allowed in a hexadecimal number (i.e., 0-9a-fA-F) + +These are also commonly used: + +- /\p{Emoji}/: Unicode emoji.
+- /\p{Graph}/: Non-blank character + (excludes spaces, control characters, and similar). +- /\p{Word}/: A member of one of the following Unicode character + categories (see below): + + - +Mark+ (+M+). + - +Letter+ (+L+). + - +Number+ (+N+). + - Connector Punctuation (+Pc+). + +- /\p{ASCII}/: A character in the ASCII character set. +- /\p{Any}/: Any Unicode character (including unassigned characters). +- /\p{Assigned}/: An assigned character. + +==== Unicode Character Categories + +A Unicode character category name: + +- May be either its full name or its abbreviated name. +- Is case-insensitive. +- Treats a space, a hyphen, and an underscore as equivalent. + +Examples: + + /\p{lu}/ # => /\p{lu}/ + /\p{LU}/ # => /\p{LU}/ + /\p{Uppercase Letter}/ # => /\p{Uppercase Letter}/ + /\p{Uppercase_Letter}/ # => /\p{Uppercase_Letter}/ + /\p{UPPERCASE-LETTER}/ # => /\p{UPPERCASE-LETTER}/ + +Below are the Unicode character category abbreviations and names. +Enumerations of characters in each category are at the links. + +Letters: + +- +L+, +Letter+: +LC+, +Lm+, or +Lo+. +- +LC+, +Cased_Letter+: +Ll+, +Lt+, or +Lu+. +- {Ll, Lowercase_Letter}[https://www.compart.com/en/unicode/category/Ll]. +- {Lm, Modifier_Letter}[https://www.compart.com/en/unicode/category/Lm]. +- {Lo, Other_Letter}[https://www.compart.com/en/unicode/category/Lo]. +- {Lt, Titlecase_Letter}[https://www.compart.com/en/unicode/category/Lt]. +- {Lu, Uppercase_Letter}[https://www.compart.com/en/unicode/category/Lu]. + +Marks: + +- +M+, +Mark+: +Mc+, +Me+, or +Mn+. +- {Mc, Spacing_Mark}[https://www.compart.com/en/unicode/category/Mc]. +- {Me, Enclosing_Mark}[https://www.compart.com/en/unicode/category/Me]. +- {Mn, Nonspacing_Mark}[https://www.compart.com/en/unicode/category/Mn]. + +Numbers: + +- +N+, +Number+: +Nd+, +Nl+, or +No+. +- {Nd, Decimal_Number}[https://www.compart.com/en/unicode/category/Nd]. +- {Nl, Letter_Number}[https://www.compart.com/en/unicode/category/Nl].
+- {No, Other_Number}[https://www.compart.com/en/unicode/category/No]. + +Punctuation: + +- +P+, +Punctuation+: +Pc+, +Pd+, +Pe+, +Pf+, +Pi+, +Po+, or +Ps+. +- {Pc, Connector_Punctuation}[https://www.compart.com/en/unicode/category/Pc]. +- {Pd, Dash_Punctuation}[https://www.compart.com/en/unicode/category/Pd]. +- {Pe, Close_Punctuation}[https://www.compart.com/en/unicode/category/Pe]. +- {Pf, Final_Punctuation}[https://www.compart.com/en/unicode/category/Pf]. +- {Pi, Initial_Punctuation}[https://www.compart.com/en/unicode/category/Pi]. +- {Po, Other_Punctuation}[https://www.compart.com/en/unicode/category/Po]. +- {Ps, Open_Punctuation}[https://www.compart.com/en/unicode/category/Ps]. + +- +S+, +Symbol+: +Sc+, +Sk+, +Sm+, or +So+. +- {Sc, Currency_Symbol}[https://www.compart.com/en/unicode/category/Sc]. +- {Sk, Modifier_Symbol}[https://www.compart.com/en/unicode/category/Sk]. +- {Sm, Math_Symbol}[https://www.compart.com/en/unicode/category/Sm]. +- {So, Other_Symbol}[https://www.compart.com/en/unicode/category/So]. + +- +Z+, +Separator+: +Zl+, +Zp+, or +Zs+. +- {Zl, Line_Separator}[https://www.compart.com/en/unicode/category/Zl]. +- {Zp, Paragraph_Separator}[https://www.compart.com/en/unicode/category/Zp]. +- {Zs, Space_Separator}[https://www.compart.com/en/unicode/category/Zs]. + +- +C+, +Other+: +Cc+, +Cf+, +Cn+, +Co+, or +Cs+. +- {Cc, Control}[https://www.compart.com/en/unicode/category/Cc]. +- {Cf, Format}[https://www.compart.com/en/unicode/category/Cf]. +- {Cn, Unassigned}[https://www.compart.com/en/unicode/category/Cn]. +- {Co, Private_Use}[https://www.compart.com/en/unicode/category/Co]. +- {Cs, Surrogate}[https://www.compart.com/en/unicode/category/Cs]. + +==== Unicode Scripts and Blocks + +Among the Unicode properties are: + +- {Unicode scripts}[https://en.wikipedia.org/wiki/Script_(Unicode)]; + see {supported scripts}[https://www.unicode.org/standard/supported.html].
+- {Unicode blocks}[https://en.wikipedia.org/wiki/Unicode_block]; + see {supported blocks}[http://www.unicode.org/Public/UNIDATA/Blocks.txt]. + +=== POSIX Bracket Expressions + +A POSIX bracket expression is also similar to a character class. +These expressions provide a portable alternative to the above, +with the added benefit of encompassing non-ASCII characters: + +- /\d/ matches only ASCII decimal digits +0+ through +9+. +- /[[:digit:]]/ matches any character in the Unicode + Decimal Number (+Nd+) category; + see below. + +The POSIX bracket expressions: + +- /[[:digit:]]/: Matches a {Unicode digit}[https://www.compart.com/en/unicode/category/Nd]: + + /[[:digit:]]/.match('9') # => # + /[[:digit:]]/.match("\u{1fbf9}") # => # + +- /[[:xdigit:]]/: Matches a digit allowed in a hexadecimal number; + equivalent to [0-9a-fA-F]. + +- /[[:upper:]]/: Matches a {Unicode uppercase letter}[https://www.compart.com/en/unicode/category/Lu]: + + /[[:upper:]]/.match('A') # => # + /[[:upper:]]/.match("\u00c6") # => # + +- /[[:lower:]]/: Matches a {Unicode lowercase letter}[https://www.compart.com/en/unicode/category/Ll]: + + /[[:lower:]]/.match('a') # => # + /[[:lower:]]/.match("\u01fd") # => # + +- /[[:alpha:]]/: Matches /[[:upper:]]/ or /[[:lower:]]/. + +- /[[:alnum:]]/: Matches /[[:alpha:]]/ or /[[:digit:]]/. + +- /[[:space:]]/: Matches a {Unicode space character}[https://www.compart.com/en/unicode/category/Zs]: + + /[[:space:]]/.match(' ') # => # + /[[:space:]]/.match("\u2005") # => # + +- /[[:blank:]]/: Matches /[[:space:]]/ or tab character: + + /[[:blank:]]/.match(' ') # => # + /[[:blank:]]/.match("\u2005") # => # + /[[:blank:]]/.match("\t") # => # + +- /[[:cntrl:]]/: Matches {Unicode control character}[https://www.compart.com/en/unicode/category/Cc]: + + /[[:cntrl:]]/.match("\u0000") # => # + /[[:cntrl:]]/.match("\u009f") # => # + +- /[[:graph:]]/: Matches any character + except /[[:space:]]/ or /[[:cntrl:]]/. + +- /[[:print:]]/: Matches /[[:graph:]]/ or space character.
+ +- /[[:punct:]]/: Matches any {Unicode punctuation character}[https://www.compart.com/en/unicode/category/Po]. + +Ruby also supports these (non-POSIX) bracket expressions: + +- /[[:ascii:]]/: Matches a character in the ASCII character set. +- /[[:word:]]/: Matches a character in one of these Unicode character + categories (see below): + + - +Mark+ (+M+). + - +Letter+ (+L+). + - +Number+ (+N+). + - Connector Punctuation (+Pc+). + +=== Comments + +A comment may be included in a regexp pattern +using the (?#_comment_) construct, +where _comment_ is a substring +(arbitrary text) that is ignored by the regexp engine: + + /foo(?#Ignore me)bar/.match('foobar') # => # + +The comment may not include an unescaped terminator character. + +See also {Extended Mode}[rdoc-ref:regexp.rdoc@Extended+Mode]. + +== Modes + +Each of these modifiers sets a mode for the regexp: + +- +i+: /_pattern_/i sets + {Case-Insensitive Mode}[rdoc-ref:regexp.rdoc@Case-Insensitive+Mode]. +- +m+: /_pattern_/m sets + {Multiline Mode}[rdoc-ref:regexp.rdoc@Multiline+Mode]. +- +x+: /_pattern_/x sets + {Extended Mode}[rdoc-ref:regexp.rdoc@Extended+Mode]. +- +o+: /_pattern_/o sets + {Interpolation Mode}[rdoc-ref:regexp.rdoc@Interpolation+Mode]. + +Any, all, or none of these may be applied.
+ +Modifiers +i+, +m+, and +x+ may be applied to subexpressions: + +- (?_modifier_) turns the mode "on" for ensuing subexpressions +- (?-_modifier_) turns the mode "off" for ensuing subexpressions +- (?_modifier_:_subexp_) turns the mode "on" for _subexp_ within the group +- (?-_modifier_:_subexp_) turns the mode "off" for _subexp_ within the group + +Example: + + re = /(?i)te(?-i)st/ + re.match('test') # => # + re.match('TEst') # => # + re.match('TEST') # => nil + re.match('teST') # => nil + + re = /t(?i:e)st/ + re.match('test') # => # + re.match('tEst') # => # + re.match('tEST') # => nil + +\Method Regexp#options returns an integer whose value shows +the settings for case-insensitivity mode, multiline mode, and extended mode. + +=== Case-Insensitive Mode + +By default, a regexp is case-sensitive: + + /foo/.match('FOO') # => nil + +Modifier +i+ enables case-insensitive mode: + + /foo/i.match('FOO') + # => # + +\Method Regexp#casefold? returns whether the mode is case-insensitive. + +=== Multiline Mode + +The multiline-mode in Ruby is what is commonly called a "dot-all mode": + +- Without the +m+ modifier, the subexpression . does not match newlines: + + /a.c/.match("a\nc") # => nil + +- With the modifier, it does match: + + /a.c/m.match("a\nc") # => # + +Unlike other languages, the modifier +m+ does not affect the anchors ^ and $. +These anchors always match at line-boundaries in Ruby. + +=== Extended Mode + +Modifier +x+ enables extended mode, which means that: + +- Literal white space in the pattern is to be ignored. +- Character # marks the remainder of its containing line as a comment, + which is also to be ignored for matching purposes. + +In extended mode, whitespace and comments may be used +to form a self-documented regexp.
+ +Regexp not in extended mode (matches some Roman numerals): + + pattern = '^M{0,3}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$' + re = /#{pattern}/ + re.match('MCMXLIII') # => # + +Regexp in extended mode: + + pattern = <<-EOT + ^ # beginning of string + M{0,3} # thousands - 0 to 3 Ms + (CM|CD|D?C{0,3}) # hundreds - 900 (CM), 400 (CD), 0-300 (0 to 3 Cs), + # or 500-800 (D, followed by 0 to 3 Cs) + (XC|XL|L?X{0,3}) # tens - 90 (XC), 40 (XL), 0-30 (0 to 3 Xs), + # or 50-80 (L, followed by 0 to 3 Xs) + (IX|IV|V?I{0,3}) # ones - 9 (IX), 4 (IV), 0-3 (0 to 3 Is), + # or 5-8 (V, followed by 0 to 3 Is) + $ # end of string + EOT + re = /#{pattern}/x + re.match('MCMXLIII') # => # + +=== Interpolation Mode + +Modifier +o+ means that the first time a literal regexp with interpolations +is encountered, +the generated Regexp object is saved and used for all future evaluations +of that literal regexp. +Without modifier +o+, the generated Regexp is not saved, +so each evaluation of the literal regexp generates a new Regexp object. + +Without modifier +o+: + + def letters; sleep 5; /[A-Z][a-z]/; end + words = %w[abc def xyz] + start = Time.now + words.each {|word| word.match(/\A[#{letters}]+\z/) } + Time.now - start # => 15.0174892 + +With modifier +o+: + + start = Time.now + words.each {|word| word.match(/\A[#{letters}]+\z/o) } + Time.now - start # => 5.0010866 + +Note that if the literal regexp does not have interpolations, +the +o+ behavior is the default. + +== Encodings + +By default, a regexp with only US-ASCII characters has US-ASCII encoding: + + re = /foo/ + re.source.encoding # => # + re.encoding # => # + +A regular expression containing non-US-ASCII characters +is assumed to use the source encoding. +This can be overridden with one of the following modifiers. 
+ +- /_pat_/n: US-ASCII if only containing US-ASCII characters, + otherwise ASCII-8BIT: + + /foo/n.encoding # => # + /foo\xff/n.encoding # => # + /foo\x7f/n.encoding # => # + +- /_pat_/u: UTF-8 + + /foo/u.encoding # => # + +- /_pat_/e: EUC-JP + + /foo/e.encoding # => # + +- /_pat_/s: Windows-31J + + /foo/s.encoding # => # + +A regexp can be matched against a target string when either: + +- They have the same encoding. +- The regexp's encoding is a fixed encoding and the string + contains only ASCII characters. + Method Regexp#fixed_encoding? returns whether the regexp + has a fixed encoding. If a match between incompatible encodings is attempted an Encoding::CompatibilityError exception is raised. -The Regexp#fixed_encoding? predicate indicates whether the regexp -has a fixed encoding, that is one incompatible with ASCII. A -regexp's encoding can be explicitly fixed by supplying -Regexp::FIXEDENCODING as the second argument of -Regexp.new: - - r = Regexp.new("a".force_encoding("iso-8859-1"),Regexp::FIXEDENCODING) - r =~ "a\u3042" - # raises Encoding::CompatibilityError: incompatible encoding regexp match - # (ISO-8859-1 regexp with UTF-8 string) - -== \Regexp Global Variables - -Pattern matching sets some global variables : - -* $~ is equivalent to Regexp.last_match; -* $& contains the complete matched text; -* $` contains string before match; -* $' contains string after match; -* $1, $2 and so on contain text matching first, second, etc - capture group; -* $+ contains last capture group. 
- Example: - m = /s(\w{2}).*(c)/.match('haystack') #=> # - $~ #=> # - Regexp.last_match #=> # + re = eval("# encoding: ISO-8859-1\n/foo\\xff?/") + re.encoding # => # + re =~ "foo".encode("UTF-8") # => 0 + re =~ "foo\u0100" # Raises Encoding::CompatibilityError - $& #=> "stac" - # same as m[0] - $` #=> "hay" - # same as m.pre_match - $' #=> "k" - # same as m.post_match - $1 #=> "ta" - # same as m[1] - $2 #=> "c" - # same as m[2] - $3 #=> nil - # no third group in pattern - $+ #=> "c" - # same as m[-1] +The encoding may be explicitly fixed by including Regexp::FIXEDENCODING +in the second argument for Regexp.new: -These global variables are thread-local and method-local variables. + # Regexp with encoding ISO-8859-1. + re = Regexp.new("a".force_encoding('iso-8859-1'), Regexp::FIXEDENCODING) + re.encoding # => # + # Target string with encoding UTF-8. + s = "a\u3042" + s.encoding # => # + re.match(s) # Raises Encoding::CompatibilityError. -== Performance +== Timeouts -Certain pathological combinations of constructs can lead to abysmally bad -performance. +When either a regexp source or a target string comes from untrusted input, +malicious values could become a denial-of-service attack; +to prevent such an attack, it is wise to set a timeout. -Consider a string of 25 as, a d, 4 as, and a -c. 
+\Regexp has two timeout values: - s = 'a' * 25 + 'd' + 'a' * 4 + 'c' - #=> "aaaaaaaaaaaaaaaaaaaaaaaaadaaaac" +- A class default timeout, used for a regexp whose instance timeout is +nil+; + this default is initially +nil+, and may be set by method Regexp.timeout=: -The following patterns match instantly as you would expect: + Regexp.timeout # => nil + Regexp.timeout = 3.0 + Regexp.timeout # => 3.0 - /(b|a)/ =~ s #=> 0 - /(b|a+)/ =~ s #=> 0 - /(b|a+)*/ =~ s #=> 0 +- An instance timeout, which defaults to +nil+ and may be set in Regexp.new: -However, the following pattern takes appreciably longer: + re = Regexp.new('foo', timeout: 5.0) + re.timeout # => 5.0 - /(b|a+)*c/ =~ s #=> 26 +When regexp.timeout is +nil+, the timeout "falls through" to Regexp.timeout; +when regexp.timeout is non-+nil+, that value controls timing out: -This happens because an atom in the regexp is quantified by both an -immediate + and an enclosing * with nothing to -differentiate which is in control of any particular character. The -nondeterminism that results produces super-linear performance. (Consult -Mastering Regular Expressions (3rd ed.), pp 222, by -Jeffery Friedl, for an in-depth analysis). This particular case -can be fixed by use of atomic grouping, which prevents the unnecessary -backtracking: + | regexp.timeout Value | Regexp.timeout Value | Result | + |----------------------|----------------------|-----------------------------| + | nil | nil | Never times out. | + | nil | Float | Times out in Float seconds. | + | Float | Any | Times out in Float seconds. 
| - (start = Time.now) && /(b|a+)*c/ =~ s && (Time.now - start) - #=> 24.702736882 - (start = Time.now) && /(?>b|a+)*c/ =~ s && (Time.now - start) - #=> 0.000166571 +== References -A similar case is typified by the following example, which takes -approximately 60 seconds to execute for me: +Read (online PDF books): -Match a string of 29 as against a pattern of 29 optional as -followed by 29 mandatory as: +- {Mastering Regular Expressions}[https://ia902508.us.archive.org/10/items/allitebooks-02/Mastering%20Regular%20Expressions%2C%203rd%20Edition.pdf] + by Jeffrey E.F. Friedl. +- {Regular Expressions Cookbook}[https://doc.lagout.org/programmation/Regular%20Expressions/Regular%20Expressions%20Cookbook_%20Detailed%20Solutions%20in%20Eight%20Programming%20Languages%20%282nd%20ed.%29%20%5BGoyvaerts%20%26%20Levithan%202012-09-06%5D.pdf] + by Jan Goyvaerts & Steven Levithan. - Regexp.new('a?' * 29 + 'a' * 29) =~ 'a' * 29 +Explore, test (interactive online editor): -The 29 optional as match the string, but this prevents the 29 -mandatory as that follow from matching. Ruby must then backtrack -repeatedly so as to satisfy as many of the optional matches as it can -while still matching the mandatory 29. It is plain to us that none of the -optional matches can succeed, but this fact unfortunately eludes Ruby. - -The best way to improve performance is to significantly reduce the amount of -backtracking needed. For this case, instead of individually matching 29 -optional as, a range of optional as can be matched all at once -with a{0,29}: - - Regexp.new('a{0,29}' + 'a' * 29) =~ 'a' * 29 - -== Timeout - -There are two APIs to set timeout. One is Regexp.timeout=, which is -process-global configuration of timeout for Regexp matching. - - Regexp.timeout = 3 - s = 'a' * 25 + 'd' + 'a' * 4 + 'c' - /(b|a+)*c/ =~ s #=> This raises an exception in three seconds - -The other is timeout keyword of Regexp.new. 
- - re = Regexp.new("(b|a+)*c", timeout: 3) - s = 'a' * 25 + 'd' + 'a' * 4 + 'c' - /(b|a+)*c/ =~ s #=> This raises an exception in three seconds - -When using Regexps to process untrusted input, you should use the timeout -feature to avoid excessive backtracking. Otherwise, a malicious user can -provide input to Regexp causing Denial-of-Service attack. -Note that the timeout is not set by default because an appropriate limit -highly depends on an application requirement and context. +- {Rubular}[https://rubular.com/]. diff --git a/doc/regexp/methods.rdoc b/doc/regexp/methods.rdoc new file mode 100644 index 0000000000..356156ac9a --- /dev/null +++ b/doc/regexp/methods.rdoc @@ -0,0 +1,41 @@ +== \Regexp Methods + +Each of these Ruby core methods can accept a regexp as an argument: + +- Enumerable#all? +- Enumerable#any? +- Enumerable#grep +- Enumerable#grep_v +- Enumerable#none? +- Enumerable#one? +- Enumerable#slice_after +- Enumerable#slice_before +- Regexp#=~ +- Regexp#match +- Regexp#match? +- Regexp.new +- Regexp.union +- String#=~ +- String#[]= +- String#byteindex +- String#byterindex +- String#gsub +- String#gsub! +- String#index +- String#match +- String#match? +- String#partition +- String#rindex +- String#rpartition +- String#scan +- String#slice +- String#slice! +- String#split +- String#start_with? +- String#sub +- String#sub! +- Symbol#=~ +- Symbol#match +- Symbol#match? +- Symbol#slice +- Symbol#start_with? diff --git a/doc/regexp/unicode_properties.rdoc b/doc/regexp/unicode_properties.rdoc new file mode 100644 index 0000000000..354ed3a83c --- /dev/null +++ b/doc/regexp/unicode_properties.rdoc @@ -0,0 +1,863 @@ +== \Regexps Based on Unicode Properties + +The properties shown here are those currently supported in Ruby. +Older versions may not support all of these; +newer versions may support additional properties. 
+ +=== POSIX brackets + +- /\p{Alpha}/ +- /\p{Blank}/ +- /\p{Cntrl}/ +- /\p{Digit}/ +- /\p{Graph}/ +- /\p{Lower}/ +- /\p{Print}/ +- /\p{Punct}/ +- /\p{Space}/ +- /\p{Upper}/ +- /\p{XDigit}/ +- /\p{Word}/ +- /\p{Alnum}/ +- /\p{ASCII}/ +- /\p{XPosixPunct}/ + +=== Special + +- /\p{Any}/ +- /\p{Assigned}/ + +=== Major and General Categories + +- /\p{C}/ +- /\p{Cc}/ +- /\p{Cf}/ +- /\p{Cn}/ +- /\p{Co}/ +- /\p{Cs}/ +- /\p{L}/ +- /\p{LC}/ +- /\p{Ll}/ +- /\p{Lm}/ +- /\p{Lo}/ +- /\p{Lt}/ +- /\p{Lu}/ +- /\p{M}/ +- /\p{Mc}/ +- /\p{Me}/ +- /\p{Mn}/ +- /\p{N}/ +- /\p{Nd}/ +- /\p{Nl}/ +- /\p{No}/ +- /\p{P}/ +- /\p{Pc}/ +- /\p{Pd}/ +- /\p{Pe}/ +- /\p{Pf}/ +- /\p{Pi}/ +- /\p{Po}/ +- /\p{Ps}/ +- /\p{S}/ +- /\p{Sc}/ +- /\p{Sk}/ +- /\p{Sm}/ +- /\p{So}/ +- /\p{Z}/ +- /\p{Zl}/ +- /\p{Zp}/ +- /\p{Zs}/ + +=== Scripts + +- /\p{Adlam}/ +- /\p{Ahom}/ +- /\p{Anatolian_Hieroglyphs}/ +- /\p{Arabic}/ +- /\p{Armenian}/ +- /\p{Avestan}/ +- /\p{Balinese}/ +- /\p{Bamum}/ +- /\p{Bassa_Vah}/ +- /\p{Batak}/ +- /\p{Bengali}/ +- /\p{Bhaiksuki}/ +- /\p{Bopomofo}/ +- /\p{Brahmi}/ +- /\p{Braille}/ +- /\p{Buginese}/ +- /\p{Buhid}/ +- /\p{Canadian_Aboriginal}/ +- /\p{Carian}/ +- /\p{Caucasian_Albanian}/ +- /\p{Chakma}/ +- /\p{Cham}/ +- /\p{Cherokee}/ +- /\p{Common}/ +- /\p{Coptic}/ +- /\p{Cuneiform}/ +- /\p{Cypriot}/ +- /\p{Cyrillic}/ +- /\p{Deseret}/ +- /\p{Devanagari}/ +- /\p{Dogra}/ +- /\p{Duployan}/ +- /\p{Egyptian_Hieroglyphs}/ +- /\p{Elbasan}/ +- /\p{Elymaic}/ +- /\p{Ethiopic}/ +- /\p{Georgian}/ +- /\p{Glagolitic}/ +- /\p{Gothic}/ +- /\p{Grantha}/ +- /\p{Greek}/ +- /\p{Gujarati}/ +- /\p{Gunjala_Gondi}/ +- /\p{Gurmukhi}/ +- /\p{Han}/ +- /\p{Hangul}/ +- /\p{Hanifi_Rohingya}/ +- /\p{Hanunoo}/ +- /\p{Hatran}/ +- /\p{Hebrew}/ +- /\p{Hiragana}/ +- /\p{Imperial_Aramaic}/ +- /\p{Inherited}/ +- /\p{Inscriptional_Pahlavi}/ +- /\p{Inscriptional_Parthian}/ +- /\p{Javanese}/ +- /\p{Kaithi}/ +- /\p{Kannada}/ +- /\p{Katakana}/ +- /\p{Kayah_Li}/ +- /\p{Kharoshthi}/ +- /\p{Khmer}/ +- /\p{Khojki}/ +- /\p{Khudawadi}/ +- 
/\p{Lao}/ +- /\p{Latin}/ +- /\p{Lepcha}/ +- /\p{Limbu}/ +- /\p{Linear_A}/ +- /\p{Linear_B}/ +- /\p{Lisu}/ +- /\p{Lycian}/ +- /\p{Lydian}/ +- /\p{Mahajani}/ +- /\p{Makasar}/ +- /\p{Malayalam}/ +- /\p{Mandaic}/ +- /\p{Manichaean}/ +- /\p{Marchen}/ +- /\p{Masaram_Gondi}/ +- /\p{Medefaidrin}/ +- /\p{Meetei_Mayek}/ +- /\p{Mende_Kikakui}/ +- /\p{Meroitic_Cursive}/ +- /\p{Meroitic_Hieroglyphs}/ +- /\p{Miao}/ +- /\p{Modi}/ +- /\p{Mongolian}/ +- /\p{Mro}/ +- /\p{Multani}/ +- /\p{Myanmar}/ +- /\p{Nabataean}/ +- /\p{Nandinagari}/ +- /\p{New_Tai_Lue}/ +- /\p{Newa}/ +- /\p{Nko}/ +- /\p{Nushu}/ +- /\p{Nyiakeng_Puachue_Hmong}/ +- /\p{Ogham}/ +- /\p{Ol_Chiki}/ +- /\p{Old_Hungarian}/ +- /\p{Old_Italic}/ +- /\p{Old_North_Arabian}/ +- /\p{Old_Permic}/ +- /\p{Old_Persian}/ +- /\p{Old_Sogdian}/ +- /\p{Old_South_Arabian}/ +- /\p{Old_Turkic}/ +- /\p{Oriya}/ +- /\p{Osage}/ +- /\p{Osmanya}/ +- /\p{Pahawh_Hmong}/ +- /\p{Palmyrene}/ +- /\p{Pau_Cin_Hau}/ +- /\p{Phags_Pa}/ +- /\p{Phoenician}/ +- /\p{Psalter_Pahlavi}/ +- /\p{Rejang}/ +- /\p{Runic}/ +- /\p{Samaritan}/ +- /\p{Saurashtra}/ +- /\p{Sharada}/ +- /\p{Shavian}/ +- /\p{Siddham}/ +- /\p{SignWriting}/ +- /\p{Sinhala}/ +- /\p{Sogdian}/ +- /\p{Sora_Sompeng}/ +- /\p{Soyombo}/ +- /\p{Sundanese}/ +- /\p{Syloti_Nagri}/ +- /\p{Syriac}/ +- /\p{Tagalog}/ +- /\p{Tagbanwa}/ +- /\p{Tai_Le}/ +- /\p{Tai_Tham}/ +- /\p{Tai_Viet}/ +- /\p{Takri}/ +- /\p{Tamil}/ +- /\p{Tangut}/ +- /\p{Telugu}/ +- /\p{Thaana}/ +- /\p{Thai}/ +- /\p{Tibetan}/ +- /\p{Tifinagh}/ +- /\p{Tirhuta}/ +- /\p{Ugaritic}/ +- /\p{Unknown}/ +- /\p{Vai}/ +- /\p{Wancho}/ +- /\p{Warang_Citi}/ +- /\p{Yi}/ +- /\p{Zanabazar_Square}/ + +=== Derived Core Properties + +- /\p{Alphabetic}/ +- /\p{Case_Ignorable}/ +- /\p{Cased}/ +- /\p{Changes_When_Casefolded}/ +- /\p{Changes_When_Casemapped}/ +- /\p{Changes_When_Lowercased}/ +- /\p{Changes_When_Titlecased}/ +- /\p{Changes_When_Uppercased}/ +- /\p{Default_Ignorable_Code_Point}/ +- /\p{Grapheme_Base}/ +- /\p{Grapheme_Extend}/ +- /\p{Grapheme_Link}/ +- 
/\p{ID_Continue}/ +- /\p{ID_Start}/ +- /\p{Lowercase}/ +- /\p{Math}/ +- /\p{Uppercase}/ +- /\p{XID_Continue}/ +- /\p{XID_Start}/ + +=== Prop List + +- /\p{ASCII_Hex_Digit}/ +- /\p{Bidi_Control}/ +- /\p{Dash}/ +- /\p{Deprecated}/ +- /\p{Diacritic}/ +- /\p{Extender}/ +- /\p{Hex_Digit}/ +- /\p{Hyphen}/ +- /\p{IDS_Binary_Operator}/ +- /\p{IDS_Trinary_Operator}/ +- /\p{Ideographic}/ +- /\p{Join_Control}/ +- /\p{Logical_Order_Exception}/ +- /\p{Noncharacter_Code_Point}/ +- /\p{Other_Alphabetic}/ +- /\p{Other_Default_Ignorable_Code_Point}/ +- /\p{Other_Grapheme_Extend}/ +- /\p{Other_ID_Continue}/ +- /\p{Other_ID_Start}/ +- /\p{Other_Lowercase}/ +- /\p{Other_Math}/ +- /\p{Other_Uppercase}/ +- /\p{Pattern_Syntax}/ +- /\p{Pattern_White_Space}/ +- /\p{Prepended_Concatenation_Mark}/ +- /\p{Quotation_Mark}/ +- /\p{Radical}/ +- /\p{Regional_Indicator}/ +- /\p{Sentence_Terminal}/ +- /\p{Soft_Dotted}/ +- /\p{Terminal_Punctuation}/ +- /\p{Unified_Ideograph}/ +- /\p{Variation_Selector}/ +- /\p{White_Space}/ + +=== Emoji + +- /\p{Emoji}/ +- /\p{Emoji_Component}/ +- /\p{Emoji_Modifier}/ +- /\p{Emoji_Modifier_Base}/ +- /\p{Emoji_Presentation}/ + +=== Property Aliases + +- /\p{AHex}/ +- /\p{Bidi_C}/ +- /\p{CI}/ +- /\p{CWCF}/ +- /\p{CWCM}/ +- /\p{CWL}/ +- /\p{CWT}/ +- /\p{CWU}/ +- /\p{DI}/ +- /\p{Dep}/ +- /\p{Dia}/ +- /\p{Ext}/ +- /\p{Gr_Base}/ +- /\p{Gr_Ext}/ +- /\p{Gr_Link}/ +- /\p{Hex}/ +- /\p{IDC}/ +- /\p{IDS}/ +- /\p{IDSB}/ +- /\p{IDST}/ +- /\p{Ideo}/ +- /\p{Join_C}/ +- /\p{LOE}/ +- /\p{NChar}/ +- /\p{OAlpha}/ +- /\p{ODI}/ +- /\p{OGr_Ext}/ +- /\p{OIDC}/ +- /\p{OIDS}/ +- /\p{OLower}/ +- /\p{OMath}/ +- /\p{OUpper}/ +- /\p{PCM}/ +- /\p{Pat_Syn}/ +- /\p{Pat_WS}/ +- /\p{QMark}/ +- /\p{RI}/ +- /\p{SD}/ +- /\p{STerm}/ +- /\p{Term}/ +- /\p{UIdeo}/ +- /\p{VS}/ +- /\p{WSpace}/ +- /\p{XIDC}/ +- /\p{XIDS}/ + +=== Property Value Aliases (General Category) + +- /\p{Other}/ +- /\p{Control}/ +- /\p{Format}/ +- /\p{Unassigned}/ +- /\p{Private_Use}/ +- /\p{Surrogate}/ +- /\p{Letter}/ +- 
/\p{Cased_Letter}/ +- /\p{Lowercase_Letter}/ +- /\p{Modifier_Letter}/ +- /\p{Other_Letter}/ +- /\p{Titlecase_Letter}/ +- /\p{Uppercase_Letter}/ +- /\p{Mark}/ +- /\p{Combining_Mark}/ +- /\p{Spacing_Mark}/ +- /\p{Enclosing_Mark}/ +- /\p{Nonspacing_Mark}/ +- /\p{Number}/ +- /\p{Decimal_Number}/ +- /\p{Letter_Number}/ +- /\p{Other_Number}/ +- /\p{Punctuation}/ +- /\p{Connector_Punctuation}/ +- /\p{Dash_Punctuation}/ +- /\p{Close_Punctuation}/ +- /\p{Final_Punctuation}/ +- /\p{Initial_Punctuation}/ +- /\p{Other_Punctuation}/ +- /\p{Open_Punctuation}/ +- /\p{Symbol}/ +- /\p{Currency_Symbol}/ +- /\p{Modifier_Symbol}/ +- /\p{Math_Symbol}/ +- /\p{Other_Symbol}/ +- /\p{Separator}/ +- /\p{Line_Separator}/ +- /\p{Paragraph_Separator}/ +- /\p{Space_Separator}/ + +=== Property Value Aliases (Script) + +- /\p{Adlm}/ +- /\p{Aghb}/ +- /\p{Arab}/ +- /\p{Armi}/ +- /\p{Armn}/ +- /\p{Avst}/ +- /\p{Bali}/ +- /\p{Bamu}/ +- /\p{Bass}/ +- /\p{Batk}/ +- /\p{Beng}/ +- /\p{Bhks}/ +- /\p{Bopo}/ +- /\p{Brah}/ +- /\p{Brai}/ +- /\p{Bugi}/ +- /\p{Buhd}/ +- /\p{Cakm}/ +- /\p{Cans}/ +- /\p{Cari}/ +- /\p{Cher}/ +- /\p{Copt}/ +- /\p{Qaac}/ +- /\p{Cprt}/ +- /\p{Cyrl}/ +- /\p{Deva}/ +- /\p{Dogr}/ +- /\p{Dsrt}/ +- /\p{Dupl}/ +- /\p{Egyp}/ +- /\p{Elba}/ +- /\p{Elym}/ +- /\p{Ethi}/ +- /\p{Geor}/ +- /\p{Glag}/ +- /\p{Gong}/ +- /\p{Gonm}/ +- /\p{Goth}/ +- /\p{Gran}/ +- /\p{Grek}/ +- /\p{Gujr}/ +- /\p{Guru}/ +- /\p{Hang}/ +- /\p{Hani}/ +- /\p{Hano}/ +- /\p{Hatr}/ +- /\p{Hebr}/ +- /\p{Hira}/ +- /\p{Hluw}/ +- /\p{Hmng}/ +- /\p{Hmnp}/ +- /\p{Hung}/ +- /\p{Ital}/ +- /\p{Java}/ +- /\p{Kali}/ +- /\p{Kana}/ +- /\p{Khar}/ +- /\p{Khmr}/ +- /\p{Khoj}/ +- /\p{Knda}/ +- /\p{Kthi}/ +- /\p{Lana}/ +- /\p{Laoo}/ +- /\p{Latn}/ +- /\p{Lepc}/ +- /\p{Limb}/ +- /\p{Lina}/ +- /\p{Linb}/ +- /\p{Lyci}/ +- /\p{Lydi}/ +- /\p{Mahj}/ +- /\p{Maka}/ +- /\p{Mand}/ +- /\p{Mani}/ +- /\p{Marc}/ +- /\p{Medf}/ +- /\p{Mend}/ +- /\p{Merc}/ +- /\p{Mero}/ +- /\p{Mlym}/ +- /\p{Mong}/ +- /\p{Mroo}/ +- /\p{Mtei}/ +- /\p{Mult}/ +- /\p{Mymr}/ +- 
/\p{Nand}/ +- /\p{Narb}/ +- /\p{Nbat}/ +- /\p{Nkoo}/ +- /\p{Nshu}/ +- /\p{Ogam}/ +- /\p{Olck}/ +- /\p{Orkh}/ +- /\p{Orya}/ +- /\p{Osge}/ +- /\p{Osma}/ +- /\p{Palm}/ +- /\p{Pauc}/ +- /\p{Perm}/ +- /\p{Phag}/ +- /\p{Phli}/ +- /\p{Phlp}/ +- /\p{Phnx}/ +- /\p{Plrd}/ +- /\p{Prti}/ +- /\p{Rjng}/ +- /\p{Rohg}/ +- /\p{Runr}/ +- /\p{Samr}/ +- /\p{Sarb}/ +- /\p{Saur}/ +- /\p{Sgnw}/ +- /\p{Shaw}/ +- /\p{Shrd}/ +- /\p{Sidd}/ +- /\p{Sind}/ +- /\p{Sinh}/ +- /\p{Sogd}/ +- /\p{Sogo}/ +- /\p{Sora}/ +- /\p{Soyo}/ +- /\p{Sund}/ +- /\p{Sylo}/ +- /\p{Syrc}/ +- /\p{Tagb}/ +- /\p{Takr}/ +- /\p{Tale}/ +- /\p{Talu}/ +- /\p{Taml}/ +- /\p{Tang}/ +- /\p{Tavt}/ +- /\p{Telu}/ +- /\p{Tfng}/ +- /\p{Tglg}/ +- /\p{Thaa}/ +- /\p{Tibt}/ +- /\p{Tirh}/ +- /\p{Ugar}/ +- /\p{Vaii}/ +- /\p{Wara}/ +- /\p{Wcho}/ +- /\p{Xpeo}/ +- /\p{Xsux}/ +- /\p{Yiii}/ +- /\p{Zanb}/ +- /\p{Zinh}/ +- /\p{Qaai}/ +- /\p{Zyyy}/ +- /\p{Zzzz}/ + +=== Derived Ages + +- /\p{Age=1.1}/ +- /\p{Age=10.0}/ +- /\p{Age=11.0}/ +- /\p{Age=12.0}/ +- /\p{Age=12.1}/ +- /\p{Age=2.0}/ +- /\p{Age=2.1}/ +- /\p{Age=3.0}/ +- /\p{Age=3.1}/ +- /\p{Age=3.2}/ +- /\p{Age=4.0}/ +- /\p{Age=4.1}/ +- /\p{Age=5.0}/ +- /\p{Age=5.1}/ +- /\p{Age=5.2}/ +- /\p{Age=6.0}/ +- /\p{Age=6.1}/ +- /\p{Age=6.2}/ +- /\p{Age=6.3}/ +- /\p{Age=7.0}/ +- /\p{Age=8.0}/ +- /\p{Age=9.0}/ + +=== Blocks + +- /\p{In_Basic_Latin}/ +- /\p{In_Latin_1_Supplement}/ +- /\p{In_Latin_Extended_A}/ +- /\p{In_Latin_Extended_B}/ +- /\p{In_IPA_Extensions}/ +- /\p{In_Spacing_Modifier_Letters}/ +- /\p{In_Combining_Diacritical_Marks}/ +- /\p{In_Greek_and_Coptic}/ +- /\p{In_Cyrillic}/ +- /\p{In_Cyrillic_Supplement}/ +- /\p{In_Armenian}/ +- /\p{In_Hebrew}/ +- /\p{In_Arabic}/ +- /\p{In_Syriac}/ +- /\p{In_Arabic_Supplement}/ +- /\p{In_Thaana}/ +- /\p{In_NKo}/ +- /\p{In_Samaritan}/ +- /\p{In_Mandaic}/ +- /\p{In_Syriac_Supplement}/ +- /\p{In_Arabic_Extended_A}/ +- /\p{In_Devanagari}/ +- /\p{In_Bengali}/ +- /\p{In_Gurmukhi}/ +- /\p{In_Gujarati}/ +- /\p{In_Oriya}/ +- /\p{In_Tamil}/ +- /\p{In_Telugu}/ +- 
/\p{In_Kannada}/ +- /\p{In_Malayalam}/ +- /\p{In_Sinhala}/ +- /\p{In_Thai}/ +- /\p{In_Lao}/ +- /\p{In_Tibetan}/ +- /\p{In_Myanmar}/ +- /\p{In_Georgian}/ +- /\p{In_Hangul_Jamo}/ +- /\p{In_Ethiopic}/ +- /\p{In_Ethiopic_Supplement}/ +- /\p{In_Cherokee}/ +- /\p{In_Unified_Canadian_Aboriginal_Syllabics}/ +- /\p{In_Ogham}/ +- /\p{In_Runic}/ +- /\p{In_Tagalog}/ +- /\p{In_Hanunoo}/ +- /\p{In_Buhid}/ +- /\p{In_Tagbanwa}/ +- /\p{In_Khmer}/ +- /\p{In_Mongolian}/ +- /\p{In_Unified_Canadian_Aboriginal_Syllabics_Extended}/ +- /\p{In_Limbu}/ +- /\p{In_Tai_Le}/ +- /\p{In_New_Tai_Lue}/ +- /\p{In_Khmer_Symbols}/ +- /\p{In_Buginese}/ +- /\p{In_Tai_Tham}/ +- /\p{In_Combining_Diacritical_Marks_Extended}/ +- /\p{In_Balinese}/ +- /\p{In_Sundanese}/ +- /\p{In_Batak}/ +- /\p{In_Lepcha}/ +- /\p{In_Ol_Chiki}/ +- /\p{In_Cyrillic_Extended_C}/ +- /\p{In_Georgian_Extended}/ +- /\p{In_Sundanese_Supplement}/ +- /\p{In_Vedic_Extensions}/ +- /\p{In_Phonetic_Extensions}/ +- /\p{In_Phonetic_Extensions_Supplement}/ +- /\p{In_Combining_Diacritical_Marks_Supplement}/ +- /\p{In_Latin_Extended_Additional}/ +- /\p{In_Greek_Extended}/ +- /\p{In_General_Punctuation}/ +- /\p{In_Superscripts_and_Subscripts}/ +- /\p{In_Currency_Symbols}/ +- /\p{In_Combining_Diacritical_Marks_for_Symbols}/ +- /\p{In_Letterlike_Symbols}/ +- /\p{In_Number_Forms}/ +- /\p{In_Arrows}/ +- /\p{In_Mathematical_Operators}/ +- /\p{In_Miscellaneous_Technical}/ +- /\p{In_Control_Pictures}/ +- /\p{In_Optical_Character_Recognition}/ +- /\p{In_Enclosed_Alphanumerics}/ +- /\p{In_Box_Drawing}/ +- /\p{In_Block_Elements}/ +- /\p{In_Geometric_Shapes}/ +- /\p{In_Miscellaneous_Symbols}/ +- /\p{In_Dingbats}/ +- /\p{In_Miscellaneous_Mathematical_Symbols_A}/ +- /\p{In_Supplemental_Arrows_A}/ +- /\p{In_Braille_Patterns}/ +- /\p{In_Supplemental_Arrows_B}/ +- /\p{In_Miscellaneous_Mathematical_Symbols_B}/ +- /\p{In_Supplemental_Mathematical_Operators}/ +- /\p{In_Miscellaneous_Symbols_and_Arrows}/ +- /\p{In_Glagolitic}/ +- /\p{In_Latin_Extended_C}/ +- 
/\p{In_Coptic}/ +- /\p{In_Georgian_Supplement}/ +- /\p{In_Tifinagh}/ +- /\p{In_Ethiopic_Extended}/ +- /\p{In_Cyrillic_Extended_A}/ +- /\p{In_Supplemental_Punctuation}/ +- /\p{In_CJK_Radicals_Supplement}/ +- /\p{In_Kangxi_Radicals}/ +- /\p{In_Ideographic_Description_Characters}/ +- /\p{In_CJK_Symbols_and_Punctuation}/ +- /\p{In_Hiragana}/ +- /\p{In_Katakana}/ +- /\p{In_Bopomofo}/ +- /\p{In_Hangul_Compatibility_Jamo}/ +- /\p{In_Kanbun}/ +- /\p{In_Bopomofo_Extended}/ +- /\p{In_CJK_Strokes}/ +- /\p{In_Katakana_Phonetic_Extensions}/ +- /\p{In_Enclosed_CJK_Letters_and_Months}/ +- /\p{In_CJK_Compatibility}/ +- /\p{In_CJK_Unified_Ideographs_Extension_A}/ +- /\p{In_Yijing_Hexagram_Symbols}/ +- /\p{In_CJK_Unified_Ideographs}/ +- /\p{In_Yi_Syllables}/ +- /\p{In_Yi_Radicals}/ +- /\p{In_Lisu}/ +- /\p{In_Vai}/ +- /\p{In_Cyrillic_Extended_B}/ +- /\p{In_Bamum}/ +- /\p{In_Modifier_Tone_Letters}/ +- /\p{In_Latin_Extended_D}/ +- /\p{In_Syloti_Nagri}/ +- /\p{In_Common_Indic_Number_Forms}/ +- /\p{In_Phags_pa}/ +- /\p{In_Saurashtra}/ +- /\p{In_Devanagari_Extended}/ +- /\p{In_Kayah_Li}/ +- /\p{In_Rejang}/ +- /\p{In_Hangul_Jamo_Extended_A}/ +- /\p{In_Javanese}/ +- /\p{In_Myanmar_Extended_B}/ +- /\p{In_Cham}/ +- /\p{In_Myanmar_Extended_A}/ +- /\p{In_Tai_Viet}/ +- /\p{In_Meetei_Mayek_Extensions}/ +- /\p{In_Ethiopic_Extended_A}/ +- /\p{In_Latin_Extended_E}/ +- /\p{In_Cherokee_Supplement}/ +- /\p{In_Meetei_Mayek}/ +- /\p{In_Hangul_Syllables}/ +- /\p{In_Hangul_Jamo_Extended_B}/ +- /\p{In_High_Surrogates}/ +- /\p{In_High_Private_Use_Surrogates}/ +- /\p{In_Low_Surrogates}/ +- /\p{In_Private_Use_Area}/ +- /\p{In_CJK_Compatibility_Ideographs}/ +- /\p{In_Alphabetic_Presentation_Forms}/ +- /\p{In_Arabic_Presentation_Forms_A}/ +- /\p{In_Variation_Selectors}/ +- /\p{In_Vertical_Forms}/ +- /\p{In_Combining_Half_Marks}/ +- /\p{In_CJK_Compatibility_Forms}/ +- /\p{In_Small_Form_Variants}/ +- /\p{In_Arabic_Presentation_Forms_B}/ +- /\p{In_Halfwidth_and_Fullwidth_Forms}/ +- /\p{In_Specials}/ +- 
/\p{In_Linear_B_Syllabary}/ +- /\p{In_Linear_B_Ideograms}/ +- /\p{In_Aegean_Numbers}/ +- /\p{In_Ancient_Greek_Numbers}/ +- /\p{In_Ancient_Symbols}/ +- /\p{In_Phaistos_Disc}/ +- /\p{In_Lycian}/ +- /\p{In_Carian}/ +- /\p{In_Coptic_Epact_Numbers}/ +- /\p{In_Old_Italic}/ +- /\p{In_Gothic}/ +- /\p{In_Old_Permic}/ +- /\p{In_Ugaritic}/ +- /\p{In_Old_Persian}/ +- /\p{In_Deseret}/ +- /\p{In_Shavian}/ +- /\p{In_Osmanya}/ +- /\p{In_Osage}/ +- /\p{In_Elbasan}/ +- /\p{In_Caucasian_Albanian}/ +- /\p{In_Linear_A}/ +- /\p{In_Cypriot_Syllabary}/ +- /\p{In_Imperial_Aramaic}/ +- /\p{In_Palmyrene}/ +- /\p{In_Nabataean}/ +- /\p{In_Hatran}/ +- /\p{In_Phoenician}/ +- /\p{In_Lydian}/ +- /\p{In_Meroitic_Hieroglyphs}/ +- /\p{In_Meroitic_Cursive}/ +- /\p{In_Kharoshthi}/ +- /\p{In_Old_South_Arabian}/ +- /\p{In_Old_North_Arabian}/ +- /\p{In_Manichaean}/ +- /\p{In_Avestan}/ +- /\p{In_Inscriptional_Parthian}/ +- /\p{In_Inscriptional_Pahlavi}/ +- /\p{In_Psalter_Pahlavi}/ +- /\p{In_Old_Turkic}/ +- /\p{In_Old_Hungarian}/ +- /\p{In_Hanifi_Rohingya}/ +- /\p{In_Rumi_Numeral_Symbols}/ +- /\p{In_Old_Sogdian}/ +- /\p{In_Sogdian}/ +- /\p{In_Elymaic}/ +- /\p{In_Brahmi}/ +- /\p{In_Kaithi}/ +- /\p{In_Sora_Sompeng}/ +- /\p{In_Chakma}/ +- /\p{In_Mahajani}/ +- /\p{In_Sharada}/ +- /\p{In_Sinhala_Archaic_Numbers}/ +- /\p{In_Khojki}/ +- /\p{In_Multani}/ +- /\p{In_Khudawadi}/ +- /\p{In_Grantha}/ +- /\p{In_Newa}/ +- /\p{In_Tirhuta}/ +- /\p{In_Siddham}/ +- /\p{In_Modi}/ +- /\p{In_Mongolian_Supplement}/ +- /\p{In_Takri}/ +- /\p{In_Ahom}/ +- /\p{In_Dogra}/ +- /\p{In_Warang_Citi}/ +- /\p{In_Nandinagari}/ +- /\p{In_Zanabazar_Square}/ +- /\p{In_Soyombo}/ +- /\p{In_Pau_Cin_Hau}/ +- /\p{In_Bhaiksuki}/ +- /\p{In_Marchen}/ +- /\p{In_Masaram_Gondi}/ +- /\p{In_Gunjala_Gondi}/ +- /\p{In_Makasar}/ +- /\p{In_Tamil_Supplement}/ +- /\p{In_Cuneiform}/ +- /\p{In_Cuneiform_Numbers_and_Punctuation}/ +- /\p{In_Early_Dynastic_Cuneiform}/ +- /\p{In_Egyptian_Hieroglyphs}/ +- /\p{In_Egyptian_Hieroglyph_Format_Controls}/ +- 
/\p{In_Anatolian_Hieroglyphs}/ +- /\p{In_Bamum_Supplement}/ +- /\p{In_Mro}/ +- /\p{In_Bassa_Vah}/ +- /\p{In_Pahawh_Hmong}/ +- /\p{In_Medefaidrin}/ +- /\p{In_Miao}/ +- /\p{In_Ideographic_Symbols_and_Punctuation}/ +- /\p{In_Tangut}/ +- /\p{In_Tangut_Components}/ +- /\p{In_Kana_Supplement}/ +- /\p{In_Kana_Extended_A}/ +- /\p{In_Small_Kana_Extension}/ +- /\p{In_Nushu}/ +- /\p{In_Duployan}/ +- /\p{In_Shorthand_Format_Controls}/ +- /\p{In_Byzantine_Musical_Symbols}/ +- /\p{In_Musical_Symbols}/ +- /\p{In_Ancient_Greek_Musical_Notation}/ +- /\p{In_Mayan_Numerals}/ +- /\p{In_Tai_Xuan_Jing_Symbols}/ +- /\p{In_Counting_Rod_Numerals}/ +- /\p{In_Mathematical_Alphanumeric_Symbols}/ +- /\p{In_Sutton_SignWriting}/ +- /\p{In_Glagolitic_Supplement}/ +- /\p{In_Nyiakeng_Puachue_Hmong}/ +- /\p{In_Wancho}/ +- /\p{In_Mende_Kikakui}/ +- /\p{In_Adlam}/ +- /\p{In_Indic_Siyaq_Numbers}/ +- /\p{In_Ottoman_Siyaq_Numbers}/ +- /\p{In_Arabic_Mathematical_Alphabetic_Symbols}/ +- /\p{In_Mahjong_Tiles}/ +- /\p{In_Domino_Tiles}/ +- /\p{In_Playing_Cards}/ +- /\p{In_Enclosed_Alphanumeric_Supplement}/ +- /\p{In_Enclosed_Ideographic_Supplement}/ +- /\p{In_Miscellaneous_Symbols_and_Pictographs}/ +- /\p{In_Emoticons}/ +- /\p{In_Ornamental_Dingbats}/ +- /\p{In_Transport_and_Map_Symbols}/ +- /\p{In_Alchemical_Symbols}/ +- /\p{In_Geometric_Shapes_Extended}/ +- /\p{In_Supplemental_Arrows_C}/ +- /\p{In_Supplemental_Symbols_and_Pictographs}/ +- /\p{In_Chess_Symbols}/ +- /\p{In_Symbols_and_Pictographs_Extended_A}/ +- /\p{In_CJK_Unified_Ideographs_Extension_B}/ +- /\p{In_CJK_Unified_Ideographs_Extension_C}/ +- /\p{In_CJK_Unified_Ideographs_Extension_D}/ +- /\p{In_CJK_Unified_Ideographs_Extension_E}/ +- /\p{In_CJK_Unified_Ideographs_Extension_F}/ +- /\p{In_CJK_Compatibility_Ideographs_Supplement}/ +- /\p{In_Tags}/ +- /\p{In_Variation_Selectors_Supplement}/ +- /\p{In_Supplementary_Private_Use_Area_A}/ +- /\p{In_Supplementary_Private_Use_Area_B}/ +- /\p{In_No_Block}/ diff --git a/doc/syntax/literals.rdoc 
b/doc/syntax/literals.rdoc index b641433249..0c1e4a434b 100644 --- a/doc/syntax/literals.rdoc +++ b/doc/syntax/literals.rdoc @@ -414,9 +414,9 @@ slash ('/') characters: re = /foo/ # => /foo/ re.class # => Regexp -The trailing slash may be followed by one or more _flag_ characters -that modify the behavior. -See {Regexp options}[rdoc-ref:Regexp@Options] for details. +The trailing slash may be followed by one or more modifier characters +that set modes for the regexp. +See {Regexp modes}[rdoc-ref:Regexp@Modes] for details. Interpolation may be used inside regular expressions along with escaped characters. Note that a regular expression may require additional escaped @@ -523,9 +523,9 @@ A few "symmetrical" character pairs may be used as delimiters: %r(foo) # => /foo/ %r<foo> # => /foo/ -The trailing delimiter may be followed by one or more _flag_ characters -that modify the behavior. -See {Regexp options}[rdoc-ref:Regexp@Options] for details. +The trailing delimiter may be followed by one or more modifier characters +that set modes for the regexp. +See {Regexp modes}[rdoc-ref:Regexp@Modes] for details.
=== %x: Backtick Literals diff --git a/re.c b/re.c index aee9180fe4..f8db6cb82d 100644 --- a/re.c +++ b/re.c @@ -538,7 +538,7 @@ static VALUE rb_reg_str_with_term(VALUE re, int term); * * The returned string may be used as an argument to Regexp.new, * or as interpolated text for a - * {Regexp literal}[rdoc-ref:regexp.rdoc@Regexp+Literal]: + * {Regexp interpolation}[rdoc-ref:regexp.rdoc@Interpolation+Mode]: * * r1 = Regexp.new(s0) # => /(?ix-m:ab+c)/ * r2 = /#{s0}/ # => /(?ix-m:ab+c)/ @@ -3568,7 +3568,7 @@ reg_match_pos(VALUE re, VALUE *strp, long pos, VALUE* set_match) * Returns the integer index (in characters) of the first match * for +self+ and +string+, or +nil+ if none; * also sets the - * {rdoc-ref:Regexp Global Variables}[rdoc-ref:Regexp@Regexp+Global+Variables]: + * {Regexp global variables}[rdoc-ref:Regexp@Global+Variables]: * * /at/ =~ 'input data' # => 7 * $~ # => #<MatchData "at"> @@ -3581,7 +3581,7 @@ reg_match_pos(VALUE re, VALUE *strp, long pos, VALUE* set_match) * - Is a regexp literal; * see {Regexp Literals}[rdoc-ref:literals.rdoc@Regexp+Literals]. * - Does not contain interpolations; - * see {Regexp Interpolation}[rdoc-ref:Regexp@Regexp+Interpolation]. + * see {Regexp interpolation}[rdoc-ref:Regexp@Interpolation+Mode]. * - Is at the left of the expression. * * Example: @@ -4559,7 +4559,7 @@ match_setter(VALUE val, ID _x, VALUE *_y) * * With no argument, returns the value of $~, * which is the result of the most recent pattern match - * (see {Regexp Global Variables}[rdoc-ref:Regexp@Regexp+Global+Variables]): + * (see {Regexp global variables}[rdoc-ref:Regexp@Global+Variables]): * * /c(.)t/ =~ 'cat' # => 0 * Regexp.last_match # => #<MatchData "cat" 1:"a"> diff --git a/string.c b/string.c index ad502a1920..d10eb236c9 100644 --- a/string.c +++ b/string.c @@ -4348,7 +4348,7 @@ rb_str_byterindex_m(int argc, VALUE *argv, VALUE str) * 'foo' =~ /o/ # => 1 * 'foo' =~ /x/ # => nil * - * Note: also updates Regexp@Special+global+variables. + * Note: also updates Regexp@Global+Variables.
* * If the given +object+ is not a \Regexp, returns the value * returned by object =~ self. @@ -4390,7 +4390,7 @@ static VALUE get_pat(VALUE); * * Returns a \MatchData object (or +nil+) based on +self+ and the given +pattern+. * - * Note: also updates Regexp@Special+global+variables. + * Note: also updates Regexp@Global+Variables. * * - Computes +regexp+ by converting +pattern+ (if not already a \Regexp). * regexp = Regexp.new(pattern) @@ -4439,7 +4439,7 @@ rb_str_match_m(int argc, VALUE *argv, VALUE str) * * Returns +true+ or +false+ based on whether a match is found for +self+ and +pattern+. * - * Note: does not update Regexp@Special+global+variables. + * Note: does not update Regexp@Global+Variables. * * Computes +regexp+ by converting +pattern+ (if not already a \Regexp). * regexp = Regexp.new(pattern) diff --git a/string.rb b/string.rb index be10b407b0..e1f55d17b4 100644 --- a/string.rb +++ b/string.rb @@ -278,7 +278,7 @@ # If argument +capture+ is given and not 0, # it should be either a capture group index (integer) # or a capture group name (string or symbol); -# the slice is the specified capture (see Regexp@Capturing): +# the slice is the specified capture (see Regexp@Groups+and+Captures): # # s = 'hello there' # s[/[aeiou](.)\1/, 1] # => "l"