From a1e55c9f5af6d35f72d97194c1602f03fb778889 Mon Sep 17 00:00:00 2001 From: Iain Ireland Date: Thu, 28 Jul 2022 18:58:30 +0000 Subject: [PATCH] Bug 1779849: Re-import irregexp r=mgaudet This patch was generated by running import-irregexp.py. Depends on D152901 Differential Revision: https://phabricator.services.mozilla.com/D152902 --- js/src/irregexp/IRREGEXP_VERSION | 2 +- .../imported/gen-regexp-special-case.cc | 6 +- .../irregexp/imported/property-sequences.cc | 8 +- js/src/irregexp/imported/property-sequences.h | 6 +- js/src/irregexp/imported/regexp-ast.cc | 28 +- js/src/irregexp/imported/regexp-ast.h | 586 +++--- .../imported/regexp-bytecode-generator-inl.h | 24 +- .../imported/regexp-bytecode-generator.cc | 34 +- .../imported/regexp-bytecode-generator.h | 36 +- .../imported/regexp-bytecode-peephole.cc | 3 - js/src/irregexp/imported/regexp-bytecodes.h | 23 +- .../imported/regexp-compiler-tonode.cc | 741 ++++---- js/src/irregexp/imported/regexp-compiler.cc | 414 +++-- js/src/irregexp/imported/regexp-compiler.h | 60 +- js/src/irregexp/imported/regexp-dotprinter.cc | 6 +- js/src/irregexp/imported/regexp-error.h | 5 + .../irregexp/imported/regexp-interpreter.cc | 52 +- js/src/irregexp/imported/regexp-interpreter.h | 8 +- .../imported/regexp-macro-assembler-tracer.cc | 78 +- .../imported/regexp-macro-assembler-tracer.h | 23 +- .../imported/regexp-macro-assembler.cc | 212 ++- .../imported/regexp-macro-assembler.h | 368 ++-- js/src/irregexp/imported/regexp-nodes.h | 40 +- js/src/irregexp/imported/regexp-parser.cc | 1619 ++++++++++------- js/src/irregexp/imported/regexp-parser.h | 362 +--- js/src/irregexp/imported/regexp-stack.cc | 23 +- js/src/irregexp/imported/regexp-stack.h | 69 +- js/src/irregexp/imported/regexp.h | 35 +- 28 files changed, 2588 insertions(+), 2283 deletions(-) diff --git a/js/src/irregexp/IRREGEXP_VERSION b/js/src/irregexp/IRREGEXP_VERSION index 28c01c4f24b6..213dd2a6056e 100644 --- a/js/src/irregexp/IRREGEXP_VERSION +++ b/js/src/irregexp/IRREGEXP_VERSION @@ -1,2 +1,2 @@ Imported using import-irregexp.py from: -https://github.com/v8/v8/tree/8732b2ee52b567ad4e15ca91d141fd6e27499e99/src/regexp +https://github.com/v8/v8/tree/b8fe2724fc25af2c165180b2cd2930b2119ad831/src/regexp diff --git a/js/src/irregexp/imported/gen-regexp-special-case.cc b/js/src/irregexp/imported/gen-regexp-special-case.cc index 4ad5157d8572..087556825015 100644 --- a/js/src/irregexp/imported/gen-regexp-special-case.cc +++ b/js/src/irregexp/imported/gen-regexp-special-case.cc @@ -12,9 +12,9 @@ namespace v8 { namespace internal { -static const uc32 kSurrogateStart = 0xd800; -static const uc32 kSurrogateEnd = 0xdfff; -static const uc32 kNonBmpStart = 0x10000; +static const base::uc32 kSurrogateStart = 0xd800; +static const base::uc32 kSurrogateEnd = 0xdfff; +static const base::uc32 kNonBmpStart = 0x10000; // The following code generates "src/regexp/special-case.cc". void PrintSet(std::ofstream& out, const char* name, diff --git a/js/src/irregexp/imported/property-sequences.cc b/js/src/irregexp/imported/property-sequences.cc index f1a6180b4ba5..b37ec631152f 100644 --- a/js/src/irregexp/imported/property-sequences.cc +++ b/js/src/irregexp/imported/property-sequences.cc @@ -42,7 +42,7 @@ const generateData = (property) => { buffer.push(' ' + codePoints.join(', ') + ', 0,'); } const output = - `const uc32 UnicodePropertySequences::k${ id }[] = {\n` + + `const base::uc32 UnicodePropertySequences::k${ id }[] = {\n` + `${ buffer.join('\n') }\n 0 // null-terminating the list\n};\n`; return output; }; @@ -60,7 +60,7 @@ for (const property of properties) { */ // clang-format off -const uc32 UnicodePropertySequences::kEmojiFlagSequences[] = { +const base::uc32 UnicodePropertySequences::kEmojiFlagSequences[] = { 0x01F1E6, 0x01F1E8, 0, 0x01F1FF, 0x01F1FC, 0, 0x01F1E6, 0x01F1EA, 0, @@ -322,14 +322,14 @@ const uc32 UnicodePropertySequences::kEmojiFlagSequences[] = { 0 // null-terminating the list }; -const uc32 UnicodePropertySequences::kEmojiTagSequences[] = { +const base::uc32 UnicodePropertySequences::kEmojiTagSequences[] = { 0x01F3F4, 0x0E0067, 0x0E0062, 0x0E0065, 0x0E006E, 0x0E0067, 0x0E007F, 0, 0x01F3F4, 0x0E0067, 0x0E0062, 0x0E0073, 0x0E0063, 0x0E0074, 0x0E007F, 0, 0x01F3F4, 0x0E0067, 0x0E0062, 0x0E0077, 0x0E006C, 0x0E0073, 0x0E007F, 0, 0 // null-terminating the list }; -const uc32 UnicodePropertySequences::kEmojiZWJSequences[] = { +const base::uc32 UnicodePropertySequences::kEmojiZWJSequences[] = { 0x01F468, 0x00200D, 0x002764, 0x00FE0F, 0x00200D, 0x01F468, 0, 0x01F441, 0x00FE0F, 0x00200D, 0x01F5E8, 0x00FE0F, 0, 0x01F468, 0x00200D, 0x01F466, 0, diff --git a/js/src/irregexp/imported/property-sequences.h b/js/src/irregexp/imported/property-sequences.h index 204f28c24ef8..9b3a18886579 100644 --- a/js/src/irregexp/imported/property-sequences.h +++ b/js/src/irregexp/imported/property-sequences.h @@ -14,9 +14,9 @@ namespace internal { class UnicodePropertySequences : public AllStatic { public: - static const uc32 kEmojiFlagSequences[]; - static const uc32 kEmojiTagSequences[]; - static const uc32 kEmojiZWJSequences[]; + static const base::uc32 kEmojiFlagSequences[]; + static const base::uc32 kEmojiTagSequences[]; + static const base::uc32 kEmojiZWJSequences[]; }; } // namespace internal diff --git a/js/src/irregexp/imported/regexp-ast.cc b/js/src/irregexp/imported/regexp-ast.cc index 036d2e7302b1..68b9abffecd8 100644 --- a/js/src/irregexp/imported/regexp-ast.cc +++ b/js/src/irregexp/imported/regexp-ast.cc @@ -26,14 +26,16 @@ FOR_EACH_REG_EXP_TREE_TYPE(MAKE_TYPE_CASE) FOR_EACH_REG_EXP_TREE_TYPE(MAKE_TYPE_CASE) #undef MAKE_TYPE_CASE +namespace { -static Interval ListCaptureRegisters(ZoneList* children) { +Interval ListCaptureRegisters(ZoneList* children) { Interval result = Interval::Empty(); for (int i = 0; i < children->length(); i++) result = result.Union(children->at(i)->CaptureRegisters()); return result; } +} // namespace Interval RegExpAlternative::CaptureRegisters() { return ListCaptureRegisters(nodes()); @@ -62,12 +64,12 @@ Interval RegExpQuantifier::CaptureRegisters() { bool RegExpAssertion::IsAnchoredAtStart() { - return assertion_type() == RegExpAssertion::START_OF_INPUT; + return assertion_type() == RegExpAssertion::Type::START_OF_INPUT; } bool RegExpAssertion::IsAnchoredAtEnd() { - return assertion_type() == RegExpAssertion::END_OF_INPUT; + return assertion_type() == RegExpAssertion::Type::END_OF_INPUT; } @@ -129,6 +131,7 @@ bool RegExpCapture::IsAnchoredAtStart() { return body()->IsAnchoredAtStart(); } bool RegExpCapture::IsAnchoredAtEnd() { return body()->IsAnchoredAtEnd(); } +namespace { // Convert regular expression trees to a simple sexp representation. // This representation should be different from the input grammar @@ -147,6 +150,7 @@ class RegExpUnparser final : public RegExpVisitor { Zone* zone_; }; +} // namespace void* RegExpUnparser::VisitDisjunction(RegExpDisjunction* that, void* data) { os_ << "(|"; @@ -193,22 +197,22 @@ void* RegExpUnparser::VisitCharacterClass(RegExpCharacterClass* that, void* RegExpUnparser::VisitAssertion(RegExpAssertion* that, void* data) { switch (that->assertion_type()) { - case RegExpAssertion::START_OF_INPUT: + case RegExpAssertion::Type::START_OF_INPUT: os_ << "@^i"; break; - case RegExpAssertion::END_OF_INPUT: + case RegExpAssertion::Type::END_OF_INPUT: os_ << "@$i"; break; - case RegExpAssertion::START_OF_LINE: + case RegExpAssertion::Type::START_OF_LINE: os_ << "@^l"; break; - case RegExpAssertion::END_OF_LINE: + case RegExpAssertion::Type::END_OF_LINE: os_ << "@$l"; break; - case RegExpAssertion::BOUNDARY: + case RegExpAssertion::Type::BOUNDARY: os_ << "@b"; break; - case RegExpAssertion::NON_BOUNDARY: + case RegExpAssertion::Type::NON_BOUNDARY: os_ << "@B"; break; } @@ -218,7 +222,7 @@ void* RegExpUnparser::VisitAssertion(RegExpAssertion* that, void* data) { void* RegExpUnparser::VisitAtom(RegExpAtom* that, void* data) { os_ << "'"; - Vector chardata = that->data(); + base::Vector chardata = that->data(); for (int i = 0; i < chardata.length(); i++) { os_ << AsUC16(chardata[i]); } @@ -311,8 +315,9 @@ RegExpDisjunction::RegExpDisjunction(ZoneList* alternatives) } } +namespace { -static int IncreaseBy(int previous, int increase) { +int IncreaseBy(int previous, int increase) { if (RegExpTree::kInfinity - previous < increase) { return RegExpTree::kInfinity; } else { @@ -320,6 +325,7 @@ static int IncreaseBy(int previous, int increase) { } } +} // namespace RegExpAlternative::RegExpAlternative(ZoneList* nodes) : nodes_(nodes) { diff --git a/js/src/irregexp/imported/regexp-ast.h b/js/src/irregexp/imported/regexp-ast.h index efc0211e48a2..da2f11e6c982 100644 --- a/js/src/irregexp/imported/regexp-ast.h +++ b/js/src/irregexp/imported/regexp-ast.h @@ -41,29 +41,25 @@ class RegExpVisitor { #undef MAKE_CASE }; - // A simple closed interval. class Interval { public: Interval() : from_(kNone), to_(kNone - 1) {} // '- 1' for branchless size(). Interval(int from, int to) : from_(from), to_(to) {} Interval Union(Interval that) { - if (that.from_ == kNone) - return *this; - else if (from_ == kNone) - return that; - else - return Interval(std::min(from_, that.from_), std::max(to_, that.to_)); + if (that.from_ == kNone) return *this; + if (from_ == kNone) return that; + return Interval(std::min(from_, that.from_), std::max(to_, that.to_)); } - bool Contains(int value) { return (from_ <= value) && (value <= to_); } - bool is_empty() { return from_ == kNone; } + static Interval Empty() { return Interval(); } + + bool Contains(int value) const { return (from_ <= value) && (value <= to_); } + bool is_empty() const { return from_ == kNone; } int from() const { return from_; } int to() const { return to_; } int size() const { return to_ - from_ + 1; } - static Interval Empty() { return Interval(); } - static constexpr int kNone = -1; private: @@ -71,32 +67,39 @@ class Interval { int to_; }; +// Named standard character sets. +enum class StandardCharacterSet : char { + kWhitespace = 's', // Like /\s/. + kNotWhitespace = 'S', // Like /\S/. + kWord = 'w', // Like /\w/. + kNotWord = 'W', // Like /\W/. + kDigit = 'd', // Like /\d/. + kNotDigit = 'D', // Like /\D/. + kLineTerminator = 'n', // The inverse of /./. + kNotLineTerminator = '.', // Like /./. + kEverything = '*', // Matches every character, like /./s. +}; + // Represents code points (with values up to 0x10FFFF) in the range from from_ // to to_, both ends are inclusive. class CharacterRange { public: - CharacterRange() : from_(0), to_(0) {} - // For compatibility with the CHECK_OK macro + CharacterRange() = default; + // For compatibility with the CHECK_OK macro. CharacterRange(void* null) { DCHECK_NULL(null); } // NOLINT - V8_EXPORT_PRIVATE static void AddClassEscape(char type, - ZoneList* ranges, - Zone* zone); - // Add class escapes. Add case equivalent closure for \w and \W if necessary. - V8_EXPORT_PRIVATE static void AddClassEscape( - char type, ZoneList* ranges, - bool add_unicode_case_equivalents, Zone* zone); - static Vector GetWordBounds(); - static inline CharacterRange Singleton(uc32 value) { + + static inline CharacterRange Singleton(base::uc32 value) { return CharacterRange(value, value); } - static inline CharacterRange Range(uc32 from, uc32 to) { - DCHECK(0 <= from && to <= String::kMaxCodePoint); + static inline CharacterRange Range(base::uc32 from, base::uc32 to) { + DCHECK(0 <= from && to <= kMaxCodePoint); DCHECK(static_cast(from) <= static_cast(to)); return CharacterRange(from, to); } static inline CharacterRange Everything() { - return CharacterRange(0, String::kMaxCodePoint); + return CharacterRange(0, kMaxCodePoint); } + static inline ZoneList* List(Zone* zone, CharacterRange range) { ZoneList* list = @@ -104,17 +107,21 @@ class CharacterRange { list->Add(range, zone); return list; } - bool Contains(uc32 i) { return from_ <= i && i <= to_; } - uc32 from() const { return from_; } - void set_from(uc32 value) { from_ = value; } - uc32 to() const { return to_; } - void set_to(uc32 value) { to_ = value; } - bool is_valid() { return from_ <= to_; } - bool IsEverything(uc32 max) { return from_ == 0 && to_ >= max; } - bool IsSingleton() { return (from_ == to_); } + + // Add class escapes. Add case equivalent closure for \w and \W if necessary. + V8_EXPORT_PRIVATE static void AddClassEscape( + StandardCharacterSet standard_character_set, + ZoneList* ranges, bool add_unicode_case_equivalents, + Zone* zone); V8_EXPORT_PRIVATE static void AddCaseEquivalents( Isolate* isolate, Zone* zone, ZoneList* ranges, bool is_one_byte); + + bool Contains(base::uc32 i) const { return from_ <= i && i <= to_; } + base::uc32 from() const { return from_; } + base::uc32 to() const { return to_; } + bool IsEverything(base::uc32 max) const { return from_ == 0 && to_ >= max; } + bool IsSingleton() const { return from_ == to_; } // Whether a range list is in canonical form: Ranges ordered by from value, // and ranges non-overlapping and non-adjacent. V8_EXPORT_PRIVATE static bool IsCanonical(ZoneList* ranges); @@ -126,35 +133,214 @@ class CharacterRange { // Negate the contents of a character range in canonical form. static void Negate(ZoneList* src, ZoneList* dst, Zone* zone); - static const int kStartMarker = (1 << 24); - static const int kPayloadMask = (1 << 24) - 1; + + // Remove all ranges outside the one-byte range. + static void ClampToOneByte(ZoneList* ranges); private: - CharacterRange(uc32 from, uc32 to) : from_(from), to_(to) {} + CharacterRange(base::uc32 from, base::uc32 to) : from_(from), to_(to) {} - uc32 from_; - uc32 to_; + static constexpr int kMaxCodePoint = 0x10ffff; + + base::uc32 from_ = 0; + base::uc32 to_ = 0; +}; + +#define DECL_BOILERPLATE(Name) \ + void* Accept(RegExpVisitor* visitor, void* data) override; \ + RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) \ + override; \ + RegExp##Name* As##Name() override; \ + bool Is##Name() override + +class RegExpTree : public ZoneObject { + public: + static const int kInfinity = kMaxInt; + virtual ~RegExpTree() = default; + virtual void* Accept(RegExpVisitor* visitor, void* data) = 0; + virtual RegExpNode* ToNode(RegExpCompiler* compiler, + RegExpNode* on_success) = 0; + virtual bool IsTextElement() { return false; } + virtual bool IsAnchoredAtStart() { return false; } + virtual bool IsAnchoredAtEnd() { return false; } + virtual int min_match() = 0; + virtual int max_match() = 0; + // Returns the interval of registers used for captures within this + // expression. + virtual Interval CaptureRegisters() { return Interval::Empty(); } + virtual void AppendToText(RegExpText* text, Zone* zone); + V8_EXPORT_PRIVATE std::ostream& Print(std::ostream& os, Zone* zone); +#define MAKE_ASTYPE(Name) \ + virtual RegExp##Name* As##Name(); \ + virtual bool Is##Name(); + FOR_EACH_REG_EXP_TREE_TYPE(MAKE_ASTYPE) +#undef MAKE_ASTYPE +}; + + +class RegExpDisjunction final : public RegExpTree { + public: + explicit RegExpDisjunction(ZoneList* alternatives); + + DECL_BOILERPLATE(Disjunction); + + Interval CaptureRegisters() override; + bool IsAnchoredAtStart() override; + bool IsAnchoredAtEnd() override; + int min_match() override { return min_match_; } + int max_match() override { return max_match_; } + ZoneList* alternatives() const { return alternatives_; } + + private: + bool SortConsecutiveAtoms(RegExpCompiler* compiler); + void RationalizeConsecutiveAtoms(RegExpCompiler* compiler); + void FixSingleCharacterDisjunctions(RegExpCompiler* compiler); + ZoneList* alternatives_; + int min_match_; + int max_match_; +}; + + +class RegExpAlternative final : public RegExpTree { + public: + explicit RegExpAlternative(ZoneList* nodes); + + DECL_BOILERPLATE(Alternative); + + Interval CaptureRegisters() override; + bool IsAnchoredAtStart() override; + bool IsAnchoredAtEnd() override; + int min_match() override { return min_match_; } + int max_match() override { return max_match_; } + ZoneList* nodes() const { return nodes_; } + + private: + ZoneList* nodes_; + int min_match_; + int max_match_; +}; + + +class RegExpAssertion final : public RegExpTree { + public: + enum class Type { + START_OF_LINE = 0, + START_OF_INPUT = 1, + END_OF_LINE = 2, + END_OF_INPUT = 3, + BOUNDARY = 4, + NON_BOUNDARY = 5, + LAST_ASSERTION_TYPE = NON_BOUNDARY, + }; + explicit RegExpAssertion(Type type) : assertion_type_(type) {} + + DECL_BOILERPLATE(Assertion); + + bool IsAnchoredAtStart() override; + bool IsAnchoredAtEnd() override; + int min_match() override { return 0; } + int max_match() override { return 0; } + Type assertion_type() const { return assertion_type_; } + + private: + const Type assertion_type_; }; class CharacterSet final { public: - explicit CharacterSet(uc16 standard_set_type) - : ranges_(nullptr), standard_set_type_(standard_set_type) {} - explicit CharacterSet(ZoneList* ranges) - : ranges_(ranges), standard_set_type_(0) {} + explicit CharacterSet(StandardCharacterSet standard_set_type) + : standard_set_type_(standard_set_type) {} + explicit CharacterSet(ZoneList* ranges) : ranges_(ranges) {} + ZoneList* ranges(Zone* zone); - uc16 standard_set_type() const { return standard_set_type_; } - void set_standard_set_type(uc16 special_set_type) { - standard_set_type_ = special_set_type; + StandardCharacterSet standard_set_type() const { + return standard_set_type_.value(); } - bool is_standard() { return standard_set_type_ != 0; } + void set_standard_set_type(StandardCharacterSet standard_set_type) { + standard_set_type_ = standard_set_type; + } + bool is_standard() const { return standard_set_type_.has_value(); } V8_EXPORT_PRIVATE void Canonicalize(); private: - ZoneList* ranges_; - // If non-zero, the value represents a standard set (e.g., all whitespace - // characters) without having to expand the ranges. - uc16 standard_set_type_; + ZoneList* ranges_ = nullptr; + base::Optional standard_set_type_; +}; + +class RegExpCharacterClass final : public RegExpTree { + public: + // NEGATED: The character class is negated and should match everything but + // the specified ranges. + // CONTAINS_SPLIT_SURROGATE: The character class contains part of a split + // surrogate and should not be unicode-desugared (crbug.com/641091). + enum Flag { + NEGATED = 1 << 0, + CONTAINS_SPLIT_SURROGATE = 1 << 1, + }; + using CharacterClassFlags = base::Flags; + + RegExpCharacterClass( + Zone* zone, ZoneList* ranges, + CharacterClassFlags character_class_flags = CharacterClassFlags()) + : set_(ranges), character_class_flags_(character_class_flags) { + // Convert the empty set of ranges to the negated Everything() range. + if (ranges->is_empty()) { + ranges->Add(CharacterRange::Everything(), zone); + character_class_flags_ ^= NEGATED; + } + } + explicit RegExpCharacterClass(StandardCharacterSet standard_set_type) + : set_(standard_set_type), character_class_flags_() {} + + DECL_BOILERPLATE(CharacterClass); + + bool IsTextElement() override { return true; } + int min_match() override { return 1; } + // The character class may match two code units for unicode regexps. + // TODO(yangguo): we should split this class for usage in TextElement, and + // make max_match() dependent on the character class content. + int max_match() override { return 2; } + + void AppendToText(RegExpText* text, Zone* zone) override; + + // TODO(lrn): Remove need for complex version if is_standard that + // recognizes a mangled standard set and just do { return set_.is_special(); } + bool is_standard(Zone* zone); + // Returns a value representing the standard character set if is_standard() + // returns true. + StandardCharacterSet standard_type() const { + return set_.standard_set_type(); + } + + CharacterSet character_set() const { return set_; } + ZoneList* ranges(Zone* zone) { return set_.ranges(zone); } + + bool is_negated() const { return (character_class_flags_ & NEGATED) != 0; } + bool contains_split_surrogate() const { + return (character_class_flags_ & CONTAINS_SPLIT_SURROGATE) != 0; + } + + private: + CharacterSet set_; + CharacterClassFlags character_class_flags_; +}; + +class RegExpAtom final : public RegExpTree { + public: + explicit RegExpAtom(base::Vector data) : data_(data) {} + + DECL_BOILERPLATE(Atom); + + bool IsTextElement() override { return true; } + int min_match() override { return data_.length(); } + int max_match() override { return data_.length(); } + void AppendToText(RegExpText* text, Zone* zone) override; + + base::Vector data() const { return data_; } + int length() const { return data_.length(); } + + private: + base::Vector data_; }; class TextElement final { @@ -191,206 +377,12 @@ class TextElement final { RegExpTree* tree_; }; - -class RegExpTree : public ZoneObject { - public: - static const int kInfinity = kMaxInt; - virtual ~RegExpTree() = default; - virtual void* Accept(RegExpVisitor* visitor, void* data) = 0; - virtual RegExpNode* ToNode(RegExpCompiler* compiler, - RegExpNode* on_success) = 0; - virtual bool IsTextElement() { return false; } - virtual bool IsAnchoredAtStart() { return false; } - virtual bool IsAnchoredAtEnd() { return false; } - virtual int min_match() = 0; - virtual int max_match() = 0; - // Returns the interval of registers used for captures within this - // expression. - virtual Interval CaptureRegisters() { return Interval::Empty(); } - virtual void AppendToText(RegExpText* text, Zone* zone); - V8_EXPORT_PRIVATE std::ostream& Print(std::ostream& os, Zone* zone); -#define MAKE_ASTYPE(Name) \ - virtual RegExp##Name* As##Name(); \ - virtual bool Is##Name(); - FOR_EACH_REG_EXP_TREE_TYPE(MAKE_ASTYPE) -#undef MAKE_ASTYPE -}; - - -class RegExpDisjunction final : public RegExpTree { - public: - explicit RegExpDisjunction(ZoneList* alternatives); - void* Accept(RegExpVisitor* visitor, void* data) override; - RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; - RegExpDisjunction* AsDisjunction() override; - Interval CaptureRegisters() override; - bool IsDisjunction() override; - bool IsAnchoredAtStart() override; - bool IsAnchoredAtEnd() override; - int min_match() override { return min_match_; } - int max_match() override { return max_match_; } - ZoneList* alternatives() { return alternatives_; } - - private: - bool SortConsecutiveAtoms(RegExpCompiler* compiler); - void RationalizeConsecutiveAtoms(RegExpCompiler* compiler); - void FixSingleCharacterDisjunctions(RegExpCompiler* compiler); - ZoneList* alternatives_; - int min_match_; - int max_match_; -}; - - -class RegExpAlternative final : public RegExpTree { - public: - explicit RegExpAlternative(ZoneList* nodes); - void* Accept(RegExpVisitor* visitor, void* data) override; - RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; - RegExpAlternative* AsAlternative() override; - Interval CaptureRegisters() override; - bool IsAlternative() override; - bool IsAnchoredAtStart() override; - bool IsAnchoredAtEnd() override; - int min_match() override { return min_match_; } - int max_match() override { return max_match_; } - ZoneList* nodes() { return nodes_; } - - private: - ZoneList* nodes_; - int min_match_; - int max_match_; -}; - - -class RegExpAssertion final : public RegExpTree { - public: - enum AssertionType { - START_OF_LINE = 0, - START_OF_INPUT = 1, - END_OF_LINE = 2, - END_OF_INPUT = 3, - BOUNDARY = 4, - NON_BOUNDARY = 5, - LAST_TYPE = NON_BOUNDARY, - }; - RegExpAssertion(AssertionType type, JSRegExp::Flags flags) - : assertion_type_(type), flags_(flags) {} - void* Accept(RegExpVisitor* visitor, void* data) override; - RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; - RegExpAssertion* AsAssertion() override; - bool IsAssertion() override; - bool IsAnchoredAtStart() override; - bool IsAnchoredAtEnd() override; - int min_match() override { return 0; } - int max_match() override { return 0; } - AssertionType assertion_type() const { return assertion_type_; } - JSRegExp::Flags flags() const { return flags_; } - - private: - const AssertionType assertion_type_; - const JSRegExp::Flags flags_; -}; - - -class RegExpCharacterClass final : public RegExpTree { - public: - // NEGATED: The character class is negated and should match everything but - // the specified ranges. - // CONTAINS_SPLIT_SURROGATE: The character class contains part of a split - // surrogate and should not be unicode-desugared (crbug.com/641091). - enum Flag { - NEGATED = 1 << 0, - CONTAINS_SPLIT_SURROGATE = 1 << 1, - }; - using CharacterClassFlags = base::Flags; - - RegExpCharacterClass( - Zone* zone, ZoneList* ranges, JSRegExp::Flags flags, - CharacterClassFlags character_class_flags = CharacterClassFlags()) - : set_(ranges), - flags_(flags), - character_class_flags_(character_class_flags) { - // Convert the empty set of ranges to the negated Everything() range. - if (ranges->is_empty()) { - ranges->Add(CharacterRange::Everything(), zone); - character_class_flags_ ^= NEGATED; - } - } - RegExpCharacterClass(uc16 type, JSRegExp::Flags flags) - : set_(type), - flags_(flags), - character_class_flags_(CharacterClassFlags()) {} - void* Accept(RegExpVisitor* visitor, void* data) override; - RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; - RegExpCharacterClass* AsCharacterClass() override; - bool IsCharacterClass() override; - bool IsTextElement() override { return true; } - int min_match() override { return 1; } - // The character class may match two code units for unicode regexps. - // TODO(yangguo): we should split this class for usage in TextElement, and - // make max_match() dependent on the character class content. - int max_match() override { return 2; } - void AppendToText(RegExpText* text, Zone* zone) override; - CharacterSet character_set() { return set_; } - // TODO(lrn): Remove need for complex version if is_standard that - // recognizes a mangled standard set and just do { return set_.is_special(); } - bool is_standard(Zone* zone); - // Returns a value representing the standard character set if is_standard() - // returns true. - // Currently used values are: - // s : unicode whitespace - // S : unicode non-whitespace - // w : ASCII word character (digit, letter, underscore) - // W : non-ASCII word character - // d : ASCII digit - // D : non-ASCII digit - // . : non-newline - // * : All characters, for advancing unanchored regexp - uc16 standard_type() const { return set_.standard_set_type(); } - ZoneList* ranges(Zone* zone) { return set_.ranges(zone); } - bool is_negated() const { return (character_class_flags_ & NEGATED) != 0; } - JSRegExp::Flags flags() const { return flags_; } - bool contains_split_surrogate() const { - return (character_class_flags_ & CONTAINS_SPLIT_SURROGATE) != 0; - } - - private: - CharacterSet set_; - const JSRegExp::Flags flags_; - CharacterClassFlags character_class_flags_; -}; - - -class RegExpAtom final : public RegExpTree { - public: - explicit RegExpAtom(Vector data, JSRegExp::Flags flags) - : data_(data), flags_(flags) {} - void* Accept(RegExpVisitor* visitor, void* data) override; - RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; - RegExpAtom* AsAtom() override; - bool IsAtom() override; - bool IsTextElement() override { return true; } - int min_match() override { return data_.length(); } - int max_match() override { return data_.length(); } - void AppendToText(RegExpText* text, Zone* zone) override; - Vector data() { return data_; } - int length() { return data_.length(); } - JSRegExp::Flags flags() const { return flags_; } - bool ignore_case() const { return (flags_ & JSRegExp::kIgnoreCase) != 0; } - - private: - Vector data_; - const JSRegExp::Flags flags_; -}; - - class RegExpText final : public RegExpTree { public: - explicit RegExpText(Zone* zone) : elements_(2, zone), length_(0) {} - void* Accept(RegExpVisitor* visitor, void* data) override; - RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; - RegExpText* AsText() override; - bool IsText() override; + explicit RegExpText(Zone* zone) : elements_(2, zone) {} + + DECL_BOILERPLATE(Text); + bool IsTextElement() override { return true; } int min_match() override { return length_; } int max_match() override { return length_; } @@ -403,7 +395,7 @@ class RegExpText final : public RegExpTree { private: ZoneList elements_; - int length_; + int length_ = 0; }; @@ -426,23 +418,22 @@ class RegExpQuantifier final : public RegExpTree { max_match_ = max * body->max_match(); } } - void* Accept(RegExpVisitor* visitor, void* data) override; - RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; + + DECL_BOILERPLATE(Quantifier); + static RegExpNode* ToNode(int min, int max, bool is_greedy, RegExpTree* body, RegExpCompiler* compiler, RegExpNode* on_success, bool not_at_start = false); - RegExpQuantifier* AsQuantifier() override; Interval CaptureRegisters() override; - bool IsQuantifier() override; int min_match() override { return min_match_; } int max_match() override { return max_match_; } int min() const { return min_; } int max() const { return max_; } QuantifierType quantifier_type() const { return quantifier_type_; } bool is_possessive() const { return quantifier_type_ == POSSESSIVE; } - bool is_non_greedy() { return quantifier_type_ == NON_GREEDY; } + bool is_non_greedy() const { return quantifier_type_ == NON_GREEDY; } bool is_greedy() const { return quantifier_type_ == GREEDY; } - RegExpTree* body() { return body_; } + RegExpTree* body() const { return body_; } private: RegExpTree* body_; @@ -462,15 +453,14 @@ class RegExpCapture final : public RegExpTree { min_match_(0), max_match_(0), name_(nullptr) {} - void* Accept(RegExpVisitor* visitor, void* data) override; - RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; + + DECL_BOILERPLATE(Capture); + static RegExpNode* ToNode(RegExpTree* body, int index, RegExpCompiler* compiler, RegExpNode* on_success); - RegExpCapture* AsCapture() override; bool IsAnchoredAtStart() override; bool IsAnchoredAtEnd() override; Interval CaptureRegisters() override; - bool IsCapture() override; int min_match() override { return min_match_; } int max_match() override { return max_match_; } RegExpTree* body() { return body_; } @@ -480,17 +470,17 @@ class RegExpCapture final : public RegExpTree { max_match_ = body->max_match(); } int index() const { return index_; } - const ZoneVector* name() const { return name_; } - void set_name(const ZoneVector* name) { name_ = name; } + const ZoneVector* name() const { return name_; } + void set_name(const ZoneVector* name) { name_ = name; } static int StartRegister(int index) { return index * 2; } static int EndRegister(int index) { return index * 2 + 1; } private: - RegExpTree* body_; + RegExpTree* body_ = nullptr; int index_; - int min_match_; - int max_match_; - const ZoneVector* name_; + int min_match_ = 0; + int max_match_ = 0; + const ZoneVector* name_ = nullptr; }; class RegExpGroup final : public RegExpTree { @@ -499,19 +489,15 @@ class RegExpGroup final : public RegExpTree { : body_(body), min_match_(body->min_match()), max_match_(body->max_match()) {} - void* Accept(RegExpVisitor* visitor, void* data) override; - RegExpNode* ToNode(RegExpCompiler* compiler, - RegExpNode* on_success) override { - return body_->ToNode(compiler, on_success); - } - RegExpGroup* AsGroup() override; + + DECL_BOILERPLATE(Group); + bool IsAnchoredAtStart() override { return body_->IsAnchoredAtStart(); } bool IsAnchoredAtEnd() override { return body_->IsAnchoredAtEnd(); } - bool IsGroup() override; int min_match() override { return min_match_; } int max_match() override { return max_match_; } Interval CaptureRegisters() override { return body_->CaptureRegisters(); } - RegExpTree* body() { return body_; } + RegExpTree* body() const { return body_; } private: RegExpTree* body_; @@ -531,26 +517,24 @@ class RegExpLookaround final : public RegExpTree { capture_from_(capture_from), type_(type) {} - void* Accept(RegExpVisitor* visitor, void* data) override; - RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; - RegExpLookaround* AsLookaround() override; + DECL_BOILERPLATE(Lookaround); + Interval CaptureRegisters() override; - bool IsLookaround() override; bool IsAnchoredAtStart() override; int min_match() override { return 0; } int max_match() override { return 0; } - RegExpTree* body() { return body_; } - bool is_positive() { return is_positive_; } - int capture_count() { return capture_count_; } - int capture_from() { return capture_from_; } - Type type() { return type_; } + RegExpTree* body() const { return body_; } + bool is_positive() const { return is_positive_; } + int capture_count() const { return capture_count_; } + int capture_from() const { return capture_from_; } + Type type() const { return type_; } class Builder { public: Builder(bool is_positive, RegExpNode* on_success, int stack_pointer_register, int position_register, int capture_register_count = 0, int capture_register_start = 0); - RegExpNode* on_match_success() { return on_match_success_; } + RegExpNode* on_match_success() const { return on_match_success_; } RegExpNode* ForMatch(RegExpNode* match); private: @@ -572,38 +556,32 @@ class RegExpLookaround final : public RegExpTree { class RegExpBackReference final : public RegExpTree { public: - explicit RegExpBackReference(JSRegExp::Flags flags) - : capture_(nullptr), name_(nullptr), flags_(flags) {} - RegExpBackReference(RegExpCapture* capture, JSRegExp::Flags flags) - : capture_(capture), name_(nullptr), flags_(flags) {} - void* Accept(RegExpVisitor* visitor, void* data) override; - RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; - RegExpBackReference* AsBackReference() override; - bool IsBackReference() override; + explicit RegExpBackReference(RegExpFlags flags) : flags_(flags) {} + RegExpBackReference(RegExpCapture* capture, RegExpFlags flags) + : capture_(capture), flags_(flags) {} + + DECL_BOILERPLATE(BackReference); + int min_match() override { return 0; } // The back reference may be recursive, e.g. /(\2)(\1)/. To avoid infinite // recursion, we give up. Ignorance is bliss. int max_match() override { return kInfinity; } - int index() { return capture_->index(); } - RegExpCapture* capture() { return capture_; } + int index() const { return capture_->index(); } + RegExpCapture* capture() const { return capture_; } void set_capture(RegExpCapture* capture) { capture_ = capture; } - const ZoneVector* name() const { return name_; } - void set_name(const ZoneVector* name) { name_ = name; } + const ZoneVector* name() const { return name_; } + void set_name(const ZoneVector* name) { name_ = name; } private: - RegExpCapture* capture_; - const ZoneVector* name_; - const JSRegExp::Flags flags_; + RegExpCapture* capture_ = nullptr; + const ZoneVector* name_ = nullptr; + const RegExpFlags flags_; }; class RegExpEmpty final : public RegExpTree { public: - RegExpEmpty() = default; - void* Accept(RegExpVisitor* visitor, void* data) override; - RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; - RegExpEmpty* AsEmpty() override; - bool IsEmpty() override; + DECL_BOILERPLATE(Empty); int min_match() override { return 0; } int max_match() override { return 0; } }; @@ -611,4 +589,6 @@ class RegExpEmpty final : public RegExpTree { } // namespace internal } // namespace v8 +#undef DECL_BOILERPLATE + #endif // V8_REGEXP_REGEXP_AST_H_ diff --git a/js/src/irregexp/imported/regexp-bytecode-generator-inl.h b/js/src/irregexp/imported/regexp-bytecode-generator-inl.h index d85b1bbf4974..807ca66f4789 100644 --- a/js/src/irregexp/imported/regexp-bytecode-generator-inl.h +++ b/js/src/irregexp/imported/regexp-bytecode-generator-inl.h @@ -23,29 +23,29 @@ void RegExpBytecodeGenerator::Emit(uint32_t byte, int32_t twenty_four_bits) { } void RegExpBytecodeGenerator::Emit16(uint32_t word) { - DCHECK(pc_ <= buffer_.length()); - if (pc_ + 1 >= buffer_.length()) { - Expand(); + DCHECK(pc_ <= static_cast(buffer_.size())); + if (pc_ + 1 >= static_cast(buffer_.size())) { + ExpandBuffer(); } - *reinterpret_cast(buffer_.begin() + pc_) = word; + *reinterpret_cast(buffer_.data() + pc_) = word; pc_ += 2; } void RegExpBytecodeGenerator::Emit8(uint32_t word) { - DCHECK(pc_ <= buffer_.length()); - if (pc_ == buffer_.length()) { - Expand(); + DCHECK(pc_ <= static_cast(buffer_.size())); + if (pc_ == static_cast(buffer_.size())) { + ExpandBuffer(); } - *reinterpret_cast(buffer_.begin() + pc_) = word; + *reinterpret_cast(buffer_.data() + pc_) = word; pc_ += 1; } void RegExpBytecodeGenerator::Emit32(uint32_t word) { - DCHECK(pc_ <= buffer_.length()); - if (pc_ + 3 >= buffer_.length()) { - Expand(); + DCHECK(pc_ <= static_cast(buffer_.size())); + if (pc_ + 3 >= static_cast(buffer_.size())) { + ExpandBuffer(); } - *reinterpret_cast(buffer_.begin() + pc_) = word; + *reinterpret_cast(buffer_.data() + pc_) = word; pc_ += 4; } diff --git a/js/src/irregexp/imported/regexp-bytecode-generator.cc b/js/src/irregexp/imported/regexp-bytecode-generator.cc index 8583d09d4814..de7f02253f67 100644 --- a/js/src/irregexp/imported/regexp-bytecode-generator.cc +++ b/js/src/irregexp/imported/regexp-bytecode-generator.cc @@ -14,7 +14,7 @@ namespace internal { RegExpBytecodeGenerator::RegExpBytecodeGenerator(Isolate* isolate, Zone* zone) : RegExpMacroAssembler(isolate, zone), - buffer_(Vector::New(1024)), + buffer_(kInitialBufferSize, zone), pc_(0), advance_current_end_(kInvalidPC), jump_edges_(zone), @@ -22,7 +22,6 @@ RegExpBytecodeGenerator::RegExpBytecodeGenerator(Isolate* isolate, Zone* zone) RegExpBytecodeGenerator::~RegExpBytecodeGenerator() { if (backtrack_.is_linked()) backtrack_.Unuse(); - buffer_.Dispose(); } RegExpBytecodeGenerator::IrregexpImplementation @@ -37,8 +36,8 @@ void RegExpBytecodeGenerator::Bind(Label* l) { int pos = l->pos(); while (pos != 0) { int fixup = pos; - pos = *reinterpret_cast(buffer_.begin() + fixup); - *reinterpret_cast(buffer_.begin() + fixup) = pc_; + pos = *reinterpret_cast(buffer_.data() + fixup); + *reinterpret_cast(buffer_.data() + fixup) = pc_; jump_edges_.emplace(fixup, pc_); } } @@ -218,12 +217,14 @@ void RegExpBytecodeGenerator::LoadCurrentCharacterImpl(int cp_offset, if (check_bounds) EmitOrLink(on_failure); } -void RegExpBytecodeGenerator::CheckCharacterLT(uc16 limit, Label* on_less) { +void RegExpBytecodeGenerator::CheckCharacterLT(base::uc16 limit, + Label* on_less) { Emit(BC_CHECK_LT, limit); EmitOrLink(on_less); } -void RegExpBytecodeGenerator::CheckCharacterGT(uc16 limit, Label* on_greater) { +void RegExpBytecodeGenerator::CheckCharacterGT(base::uc16 limit, + Label* on_greater) { Emit(BC_CHECK_GT, limit); EmitOrLink(on_greater); } @@ -286,14 +287,15 @@ void RegExpBytecodeGenerator::CheckNotCharacterAfterAnd(uint32_t c, } void RegExpBytecodeGenerator::CheckNotCharacterAfterMinusAnd( - uc16 c, uc16 minus, uc16 mask, Label* on_not_equal) { + base::uc16 c, base::uc16 minus, base::uc16 mask, Label* on_not_equal) { Emit(BC_MINUS_AND_CHECK_NOT_CHAR, c); Emit16(minus); Emit16(mask); EmitOrLink(on_not_equal); } -void RegExpBytecodeGenerator::CheckCharacterInRange(uc16 from, uc16 to, +void RegExpBytecodeGenerator::CheckCharacterInRange(base::uc16 from, + base::uc16 to, Label* on_in_range) { Emit(BC_CHECK_CHAR_IN_RANGE, 0); Emit16(from); @@ -301,7 +303,8 @@ void RegExpBytecodeGenerator::CheckCharacterInRange(uc16 from, uc16 to, EmitOrLink(on_in_range); } -void RegExpBytecodeGenerator::CheckCharacterNotInRange(uc16 from, uc16 to, +void RegExpBytecodeGenerator::CheckCharacterNotInRange(base::uc16 from, + base::uc16 to, Label* on_not_in_range) { Emit(BC_CHECK_CHAR_NOT_IN_RANGE, 0); Emit16(from); @@ -377,7 +380,7 @@ Handle RegExpBytecodeGenerator::GetCode(Handle source) { Handle array; if (FLAG_regexp_peephole_optimization) { array = RegExpBytecodePeepholeOptimization::OptimizeBytecode( - isolate_, zone(), source, buffer_.begin(), length(), jump_edges_); + isolate_, zone(), source, buffer_.data(), length(), jump_edges_); } else { array = isolate_->factory()->NewByteArray(length()); Copy(array->GetDataStartAddress()); @@ -389,14 +392,13 @@ Handle RegExpBytecodeGenerator::GetCode(Handle source) { int RegExpBytecodeGenerator::length() { return pc_; } void RegExpBytecodeGenerator::Copy(byte* a) { - MemCopy(a, buffer_.begin(), length()); + MemCopy(a, buffer_.data(), length()); } -void RegExpBytecodeGenerator::Expand() { - Vector old_buffer = buffer_; - buffer_ = Vector::New(old_buffer.length() * 2); - MemCopy(buffer_.begin(), old_buffer.begin(), old_buffer.length()); - old_buffer.Dispose(); +void RegExpBytecodeGenerator::ExpandBuffer() { + // TODO(jgruber): The growth strategy could be smarter for large sizes. + // TODO(jgruber): It's not necessary to default-initialize new elements. + buffer_.resize(buffer_.size() * 2); } } // namespace internal diff --git a/js/src/irregexp/imported/regexp-bytecode-generator.h b/js/src/irregexp/imported/regexp-bytecode-generator.h index dc9bf654544c..351f6e0cc64a 100644 --- a/js/src/irregexp/imported/regexp-bytecode-generator.h +++ b/js/src/irregexp/imported/regexp-bytecode-generator.h @@ -25,7 +25,7 @@ class V8_EXPORT_PRIVATE RegExpBytecodeGenerator : public RegExpMacroAssembler { ~RegExpBytecodeGenerator() override; // The byte-code interpreter checks on each push anyway. int stack_limit_slack() override { return 1; } - bool CanReadUnaligned() override { return false; } + bool CanReadUnaligned() const override { return false; } void Bind(Label* label) override; void AdvanceCurrentPosition(int by) override; // Signed cp change. void PopCurrentPosition() override; @@ -52,19 +52,36 @@ class V8_EXPORT_PRIVATE RegExpBytecodeGenerator : public RegExpMacroAssembler { void CheckCharacter(unsigned c, Label* on_equal) override; void CheckCharacterAfterAnd(unsigned c, unsigned mask, Label* on_equal) override; - void CheckCharacterGT(uc16 limit, Label* on_greater) override; - void CheckCharacterLT(uc16 limit, Label* on_less) override; + void CheckCharacterGT(base::uc16 limit, Label* on_greater) override; + void CheckCharacterLT(base::uc16 limit, Label* on_less) override; void CheckGreedyLoop(Label* on_tos_equals_current_position) override; void CheckAtStart(int cp_offset, Label* on_at_start) override; void CheckNotAtStart(int cp_offset, Label* on_not_at_start) override; void CheckNotCharacter(unsigned c, Label* on_not_equal) override; void CheckNotCharacterAfterAnd(unsigned c, unsigned mask, Label* on_not_equal) override; - void CheckNotCharacterAfterMinusAnd(uc16 c, uc16 minus, uc16 mask, + void CheckNotCharacterAfterMinusAnd(base::uc16 c, base::uc16 minus, + base::uc16 mask, Label* on_not_equal) override; - void CheckCharacterInRange(uc16 from, uc16 to, Label* on_in_range) override; - void CheckCharacterNotInRange(uc16 from, uc16 to, + void CheckCharacterInRange(base::uc16 from, base::uc16 to, + Label* on_in_range) override; + void CheckCharacterNotInRange(base::uc16 from, base::uc16 to, Label* on_not_in_range) override; + bool CheckCharacterInRangeArray(const ZoneList* ranges, + Label* on_in_range) override { + // Disabled in the interpreter, because 1) there is no constant pool that + // could store the ByteArray pointer, 2) bytecode size limits are not as + // restrictive as code (e.g. branch distances on arm), 3) bytecode for + // large character classes is already quite compact. + // TODO(jgruber): Consider using BytecodeArrays (with a constant pool) + // instead of plain ByteArrays; then we could implement + // CheckCharacterInRangeArray in the interpreter. + return false; + } + bool CheckCharacterNotInRangeArray(const ZoneList* ranges, + Label* on_not_in_range) override { + return false; + } void CheckBitInTable(Handle table, Label* on_bit_set) override; void CheckNotBackReference(int start_reg, bool read_backward, Label* on_no_match) override; @@ -79,7 +96,8 @@ class V8_EXPORT_PRIVATE RegExpBytecodeGenerator : public RegExpMacroAssembler { Handle GetCode(Handle source) override; private: - void Expand(); + void ExpandBuffer(); + // Code and bitmap emission. inline void EmitOrLink(Label* label); inline void Emit32(uint32_t x); @@ -92,7 +110,9 @@ class V8_EXPORT_PRIVATE RegExpBytecodeGenerator : public RegExpMacroAssembler { void Copy(byte* a); // The buffer into which code and relocation info are generated. - Vector buffer_; + static constexpr int kInitialBufferSize = 1024; + ZoneVector buffer_; + // The program counter. int pc_; Label backtrack_; diff --git a/js/src/irregexp/imported/regexp-bytecode-peephole.cc b/js/src/irregexp/imported/regexp-bytecode-peephole.cc index fd751cd7bdf3..d0e5ce7ce12d 100644 --- a/js/src/irregexp/imported/regexp-bytecode-peephole.cc +++ b/js/src/irregexp/imported/regexp-bytecode-peephole.cc @@ -258,13 +258,10 @@ int32_t GetArgumentValue(const byte* bytecode, int offset, int length) { switch (length) { case 1: return GetValue(bytecode, offset); - break; case 2: return GetValue(bytecode, offset); - break; case 4: return GetValue(bytecode, offset); - break; default: UNREACHABLE(); } diff --git a/js/src/irregexp/imported/regexp-bytecodes.h b/js/src/irregexp/imported/regexp-bytecodes.h index c4115dd53bda..5602d8d7bc80 100644 --- a/js/src/irregexp/imported/regexp-bytecodes.h +++ b/js/src/irregexp/imported/regexp-bytecodes.h @@ -22,8 +22,9 @@ constexpr int BYTECODE_MASK = kRegExpPaddedBytecodeCount - 1; // positive values. const unsigned int MAX_FIRST_ARG = 0x7fffffu; const int BYTECODE_SHIFT = 8; -STATIC_ASSERT(1 << BYTECODE_SHIFT > BYTECODE_MASK); +static_assert(1 << BYTECODE_SHIFT > BYTECODE_MASK); +// The list of bytecodes, in format: V(Name, Code, ByteLength). // TODO(pthier): Argument offsets of bytecodes should be easily accessible by // name or at least by position. // TODO(jgruber): More precise types (e.g. int32/uint32 instead of value32). @@ -85,12 +86,14 @@ STATIC_ASSERT(1 << BYTECODE_SHIFT > BYTECODE_MASK); /* 0x10 - 0x1F: Character to match against (after mask aplied) */ \ /* 0x20 - 0x3F: Bitmask bitwise and combined with current character */ \ /* 0x40 - 0x5F: Address of bytecode when matched */ \ - V(AND_CHECK_CHAR, 28, 12) /* bc8 pad8 uint16 uint32 addr32 */ \ - V(AND_CHECK_NOT_4_CHARS, 29, 16) /* bc8 pad24 uint32 uint32 addr32 */ \ - V(AND_CHECK_NOT_CHAR, 30, 12) /* bc8 pad8 uint16 uint32 addr32 */ \ - V(MINUS_AND_CHECK_NOT_CHAR, 31, 12) /* bc8 pad8 uc16 uc16 uc16 addr32 */ \ - V(CHECK_CHAR_IN_RANGE, 32, 12) /* bc8 pad24 uc16 uc16 addr32 */ \ - V(CHECK_CHAR_NOT_IN_RANGE, 33, 12) /* bc8 pad24 uc16 uc16 addr32 */ \ + V(AND_CHECK_CHAR, 28, 12) /* bc8 pad8 uint16 uint32 addr32 */ \ + V(AND_CHECK_NOT_4_CHARS, 29, 16) /* bc8 pad24 uint32 uint32 addr32 */ \ + V(AND_CHECK_NOT_CHAR, 30, 12) /* bc8 pad8 uint16 uint32 addr32 */ \ + V(MINUS_AND_CHECK_NOT_CHAR, 31, \ + 12) /* bc8 pad8 base::uc16 base::uc16 base::uc16 addr32 */ \ + V(CHECK_CHAR_IN_RANGE, 32, 12) /* bc8 pad24 base::uc16 base::uc16 addr32 */ \ + V(CHECK_CHAR_NOT_IN_RANGE, 33, \ + 12) /* bc8 pad24 base::uc16 base::uc16 addr32 */ \ /* Checks if the current character matches any of the characters encoded */ \ /* in a bit table. Similar to/inspired by boyer moore string search */ \ /* Bit Layout: */ \ @@ -99,8 +102,8 @@ STATIC_ASSERT(1 << BYTECODE_SHIFT > BYTECODE_MASK); /* 0x20 - 0x3F: Address of bytecode when bit is set */ \ /* 0x40 - 0xBF: Bit table */ \ V(CHECK_BIT_IN_TABLE, 34, 24) /* bc8 pad24 addr32 bits128 */ \ - V(CHECK_LT, 35, 8) /* bc8 pad8 uc16 addr32 */ \ - V(CHECK_GT, 36, 8) /* bc8 pad8 uc16 addr32 */ \ + V(CHECK_LT, 35, 8) /* bc8 pad8 base::uc16 addr32 */ \ + V(CHECK_GT, 36, 8) /* bc8 pad8 base::uc16 addr32 */ \ V(CHECK_NOT_BACK_REF, 37, 8) /* bc8 reg_idx24 addr32 */ \ V(CHECK_NOT_BACK_REF_NO_CASE, 38, 8) /* bc8 reg_idx24 addr32 */ \ V(CHECK_NOT_BACK_REF_NO_CASE_UNICODE, 39, 8) \ @@ -215,7 +218,7 @@ static constexpr int kRegExpBytecodeCount = BYTECODE_ITERATOR(COUNT); // contiguous, strictly increasing, and start at 0. // TODO(jgruber): Do not explicitly assign values, instead generate them // implicitly from the list order. -STATIC_ASSERT(kRegExpBytecodeCount == 59); +static_assert(kRegExpBytecodeCount == 59); #define DECLARE_BYTECODES(name, code, length) \ static constexpr int BC_##name = code; diff --git a/js/src/irregexp/imported/regexp-compiler-tonode.cc b/js/src/irregexp/imported/regexp-compiler-tonode.cc index 625f4a91c69d..695a864883d4 100644 --- a/js/src/irregexp/imported/regexp-compiler-tonode.cc +++ b/js/src/irregexp/imported/regexp-compiler-tonode.cc @@ -5,11 +5,9 @@ #include "irregexp/imported/regexp-compiler.h" #include "irregexp/imported/regexp.h" -#ifdef V8_INTL_SUPPORT -#include "irregexp/imported/special-case.h" -#endif // V8_INTL_SUPPORT #ifdef V8_INTL_SUPPORT +#include "irregexp/imported/special-case.h" #include "unicode/locid.h" #include "unicode/uniset.h" #include "unicode/utypes.h" @@ -20,6 +18,11 @@ namespace internal { using namespace regexp_compiler_constants; // NOLINT(build/namespaces) +constexpr base::uc32 kMaxCodePoint = 0x10ffff; +constexpr int kMaxUtf16CodeUnit = 0xffff; +constexpr uint32_t kMaxUtf16CodeUnitU = 0xffff; +constexpr int32_t kMaxOneByteCharCode = unibrow::Latin1::kMaxChar; + // ------------------------------------------------------------------- // Tree to graph conversion @@ -38,51 +41,53 @@ RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler, on_success); } -static bool CompareInverseRanges(ZoneList* ranges, - const int* special_class, int length) { +namespace { + +bool CompareInverseRanges(ZoneList* ranges, + const int* special_class, int length) { length--; // Remove final marker. + DCHECK_EQ(kRangeEndMarker, special_class[length]); DCHECK_NE(0, ranges->length()); DCHECK_NE(0, length); DCHECK_NE(0, special_class[0]); - if (ranges->length() != (length >> 1) + 1) { - return false; - } + + if (ranges->length() != (length >> 1) + 1) return false; + CharacterRange range = ranges->at(0); - if (range.from() != 0) { - return false; - } + if (range.from() != 0) return false; + for (int i = 0; i < length; i += 2) { - if (static_cast(special_class[i]) != (range.to() + 1)) { + if (static_cast(special_class[i]) != (range.to() + 1)) { return false; } range = ranges->at((i >> 1) + 1); - if (static_cast(special_class[i + 1]) != range.from()) { + if (static_cast(special_class[i + 1]) != range.from()) { return false; } } - if (range.to() != String::kMaxCodePoint) { - return false; + + return range.to() == kMaxCodePoint; +} + +bool CompareRanges(ZoneList* ranges, const int* special_class, + int length) { + length--; // Remove final marker. + + DCHECK_EQ(kRangeEndMarker, special_class[length]); + if (ranges->length() * 2 != length) return false; + + for (int i = 0; i < length; i += 2) { + CharacterRange range = ranges->at(i >> 1); + if (range.from() != static_cast(special_class[i]) || + range.to() != static_cast(special_class[i + 1] - 1)) { + return false; + } } return true; } -static bool CompareRanges(ZoneList* ranges, - const int* special_class, int length) { - length--; // Remove final marker. - DCHECK_EQ(kRangeEndMarker, special_class[length]); - if (ranges->length() * 2 != length) { - return false; - } - for (int i = 0; i < length; i += 2) { - CharacterRange range = ranges->at(i >> 1); - if (range.from() != static_cast(special_class[i]) || - range.to() != static_cast(special_class[i + 1] - 1)) { - return false; - } - } - return true; -} +} // namespace bool RegExpCharacterClass::is_standard(Zone* zone) { // TODO(lrn): Remove need for this function, by not throwing away information @@ -94,29 +99,29 @@ bool RegExpCharacterClass::is_standard(Zone* zone) { return true; } if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) { - set_.set_standard_set_type('s'); + set_.set_standard_set_type(StandardCharacterSet::kWhitespace); return true; } if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) { - set_.set_standard_set_type('S'); + set_.set_standard_set_type(StandardCharacterSet::kNotWhitespace); return true; } if (CompareInverseRanges(set_.ranges(zone), kLineTerminatorRanges, kLineTerminatorRangeCount)) { - set_.set_standard_set_type('.'); + set_.set_standard_set_type(StandardCharacterSet::kNotLineTerminator); return true; } if (CompareRanges(set_.ranges(zone), kLineTerminatorRanges, kLineTerminatorRangeCount)) { - set_.set_standard_set_type('n'); + set_.set_standard_set_type(StandardCharacterSet::kLineTerminator); return true; } if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) { - set_.set_standard_set_type('w'); + set_.set_standard_set_type(StandardCharacterSet::kWord); return true; } if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) { - set_.set_standard_set_type('W'); + set_.set_standard_set_type(StandardCharacterSet::kNotWord); return true; } return false; @@ -135,29 +140,29 @@ UnicodeRangeSplitter::UnicodeRangeSplitter(ZoneList* base) { } void UnicodeRangeSplitter::AddRange(CharacterRange range) { - static constexpr uc32 kBmp1Start = 0; - static constexpr uc32 kBmp1End = kLeadSurrogateStart - 1; - static constexpr uc32 kBmp2Start = kTrailSurrogateEnd + 1; - static constexpr uc32 kBmp2End = kNonBmpStart - 1; + static constexpr base::uc32 kBmp1Start = 0; + static constexpr base::uc32 kBmp1End = kLeadSurrogateStart - 1; + static constexpr base::uc32 kBmp2Start = kTrailSurrogateEnd + 1; + static constexpr base::uc32 kBmp2End = kNonBmpStart - 1; // Ends are all inclusive. - STATIC_ASSERT(kBmp1Start == 0); - STATIC_ASSERT(kBmp1Start < kBmp1End); - STATIC_ASSERT(kBmp1End + 1 == kLeadSurrogateStart); - STATIC_ASSERT(kLeadSurrogateStart < kLeadSurrogateEnd); - STATIC_ASSERT(kLeadSurrogateEnd + 1 == kTrailSurrogateStart); - STATIC_ASSERT(kTrailSurrogateStart < kTrailSurrogateEnd); - STATIC_ASSERT(kTrailSurrogateEnd + 1 == kBmp2Start); - STATIC_ASSERT(kBmp2Start < kBmp2End); - STATIC_ASSERT(kBmp2End + 1 == kNonBmpStart); - STATIC_ASSERT(kNonBmpStart < kNonBmpEnd); + static_assert(kBmp1Start == 0); + static_assert(kBmp1Start < kBmp1End); + static_assert(kBmp1End + 1 == kLeadSurrogateStart); + static_assert(kLeadSurrogateStart < kLeadSurrogateEnd); + static_assert(kLeadSurrogateEnd + 1 == kTrailSurrogateStart); + static_assert(kTrailSurrogateStart < kTrailSurrogateEnd); + static_assert(kTrailSurrogateEnd + 1 == kBmp2Start); + static_assert(kBmp2Start < kBmp2End); + static_assert(kBmp2End + 1 == kNonBmpStart); + static_assert(kNonBmpStart < kNonBmpEnd); - static constexpr uc32 kStarts[] = { + static constexpr base::uc32 kStarts[] = { kBmp1Start, kLeadSurrogateStart, kTrailSurrogateStart, kBmp2Start, kNonBmpStart, }; - static constexpr uc32 kEnds[] = { + static constexpr base::uc32 kEnds[] = { kBmp1End, kLeadSurrogateEnd, kTrailSurrogateEnd, kBmp2End, kNonBmpEnd, }; @@ -166,13 +171,13 @@ void UnicodeRangeSplitter::AddRange(CharacterRange range) { }; static constexpr int kCount = arraysize(kStarts); - STATIC_ASSERT(kCount == arraysize(kEnds)); - STATIC_ASSERT(kCount == arraysize(kTargets)); + static_assert(kCount == arraysize(kEnds)); + static_assert(kCount == arraysize(kTargets)); for (int i = 0; i < kCount; i++) { if (kStarts[i] > range.to()) break; - const uc32 from = std::max(kStarts[i], range.from()); - const uc32 to = std::min(kEnds[i], range.to()); + const base::uc32 from = std::max(kStarts[i], range.from()); + const base::uc32 to = std::min(kEnds[i], range.to()); if (from > to) continue; kTargets[i]->emplace_back(CharacterRange::Range(from, to)); } @@ -196,24 +201,68 @@ ZoneList* ToCanonicalZoneList( } void AddBmpCharacters(RegExpCompiler* compiler, ChoiceNode* result, - RegExpNode* on_success, UnicodeRangeSplitter* splitter, - JSRegExp::Flags flags) { + RegExpNode* on_success, UnicodeRangeSplitter* splitter) { ZoneList* bmp = ToCanonicalZoneList(splitter->bmp(), compiler->zone()); if (bmp == nullptr) return; result->AddAlternative(GuardedAlternative(TextNode::CreateForCharacterRanges( - compiler->zone(), bmp, compiler->read_backward(), on_success, flags))); + compiler->zone(), bmp, compiler->read_backward(), on_success))); +} + +using UC16Range = uint32_t; // {from, to} packed into one uint32_t. +constexpr UC16Range ToUC16Range(base::uc16 from, base::uc16 to) { + return (static_cast(from) << 16) | to; +} +constexpr base::uc16 ExtractFrom(UC16Range r) { + return static_cast(r >> 16); +} +constexpr base::uc16 ExtractTo(UC16Range r) { + return static_cast(r); } void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result, RegExpNode* on_success, - UnicodeRangeSplitter* splitter, - JSRegExp::Flags flags) { - ZoneList* non_bmp = - ToCanonicalZoneList(splitter->non_bmp(), compiler->zone()); - if (non_bmp == nullptr) return; + UnicodeRangeSplitter* splitter) { DCHECK(!compiler->one_byte()); - Zone* zone = compiler->zone(); + Zone* const zone = compiler->zone(); + ZoneList* non_bmp = + ToCanonicalZoneList(splitter->non_bmp(), zone); + if (non_bmp == nullptr) return; + + // Translate each 32-bit code point range into the corresponding 16-bit code + // unit representation consisting of the lead- and trail surrogate. + // + // The generated alternatives are grouped by the leading surrogate to avoid + // emitting excessive code. For example, for + // + // { \ud800[\udc00-\udc01] + // , \ud800[\udc05-\udc06] + // } + // + // there's no need to emit matching code for the leading surrogate \ud800 + // twice. We also create a dedicated grouping for full trailing ranges, i.e. + // [dc00-dfff]. + ZoneUnorderedMap*> grouped_by_leading( + zone); + ZoneList* leading_with_full_trailing_range = + zone->New>(1, zone); + const auto AddRange = [&](base::uc16 from_l, base::uc16 to_l, + base::uc16 from_t, base::uc16 to_t) { + const UC16Range leading_range = ToUC16Range(from_l, to_l); + if (grouped_by_leading.count(leading_range) == 0) { + if (from_t == kTrailSurrogateStart && to_t == kTrailSurrogateEnd) { + leading_with_full_trailing_range->Add( + CharacterRange::Range(from_l, to_l), zone); + return; + } + grouped_by_leading[leading_range] = + zone->New>(2, zone); + } + grouped_by_leading[leading_range]->Add(CharacterRange::Range(from_t, to_t), + zone); + }; + + // First, create the grouped ranges. CharacterRange::Canonicalize(non_bmp); for (int i = 0; i < non_bmp->length(); i++) { // Match surrogate pair. @@ -221,85 +270,88 @@ void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result, // \ud800[\udc05-\udfff]| // [\ud801-\ud803][\udc00-\udfff]| // \ud804[\udc00-\udc05] - uc32 from = non_bmp->at(i).from(); - uc32 to = non_bmp->at(i).to(); - uc16 from_l = unibrow::Utf16::LeadSurrogate(from); - uc16 from_t = unibrow::Utf16::TrailSurrogate(from); - uc16 to_l = unibrow::Utf16::LeadSurrogate(to); - uc16 to_t = unibrow::Utf16::TrailSurrogate(to); + base::uc32 from = non_bmp->at(i).from(); + base::uc32 to = non_bmp->at(i).to(); + base::uc16 from_l = unibrow::Utf16::LeadSurrogate(from); + base::uc16 from_t = unibrow::Utf16::TrailSurrogate(from); + base::uc16 to_l = unibrow::Utf16::LeadSurrogate(to); + base::uc16 to_t = unibrow::Utf16::TrailSurrogate(to); + if (from_l == to_l) { // The lead surrogate is the same. - result->AddAlternative( - GuardedAlternative(TextNode::CreateForSurrogatePair( - zone, CharacterRange::Singleton(from_l), - CharacterRange::Range(from_t, to_t), compiler->read_backward(), - on_success, flags))); - } else { - if (from_t != kTrailSurrogateStart) { - // Add [from_l][from_t-\udfff] - result->AddAlternative( - GuardedAlternative(TextNode::CreateForSurrogatePair( - zone, CharacterRange::Singleton(from_l), - CharacterRange::Range(from_t, kTrailSurrogateEnd), - compiler->read_backward(), on_success, flags))); - from_l++; - } - if (to_t != kTrailSurrogateEnd) { - // Add [to_l][\udc00-to_t] - result->AddAlternative( - GuardedAlternative(TextNode::CreateForSurrogatePair( - zone, CharacterRange::Singleton(to_l), - CharacterRange::Range(kTrailSurrogateStart, to_t), - compiler->read_backward(), on_success, flags))); - to_l--; - } - if (from_l <= to_l) { - // Add [from_l-to_l][\udc00-\udfff] - result->AddAlternative( - GuardedAlternative(TextNode::CreateForSurrogatePair( - zone, CharacterRange::Range(from_l, to_l), - CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd), - compiler->read_backward(), on_success, flags))); - } + AddRange(from_l, to_l, from_t, to_t); + continue; } + + if (from_t != kTrailSurrogateStart) { + // Add [from_l][from_t-\udfff]. + AddRange(from_l, from_l, from_t, kTrailSurrogateEnd); + from_l++; + } + if (to_t != kTrailSurrogateEnd) { + // Add [to_l][\udc00-to_t]. + AddRange(to_l, to_l, kTrailSurrogateStart, to_t); + to_l--; + } + if (from_l <= to_l) { + // Add [from_l-to_l][\udc00-\udfff]. + AddRange(from_l, to_l, kTrailSurrogateStart, kTrailSurrogateEnd); + } + } + + // Create the actual TextNode now that ranges are fully grouped. + if (!leading_with_full_trailing_range->is_empty()) { + CharacterRange::Canonicalize(leading_with_full_trailing_range); + result->AddAlternative(GuardedAlternative(TextNode::CreateForSurrogatePair( + zone, leading_with_full_trailing_range, + CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd), + compiler->read_backward(), on_success))); + } + for (const auto& it : grouped_by_leading) { + CharacterRange leading_range = + CharacterRange::Range(ExtractFrom(it.first), ExtractTo(it.first)); + ZoneList* trailing_ranges = it.second; + CharacterRange::Canonicalize(trailing_ranges); + result->AddAlternative(GuardedAlternative(TextNode::CreateForSurrogatePair( + zone, leading_range, trailing_ranges, compiler->read_backward(), + on_success))); } } RegExpNode* NegativeLookaroundAgainstReadDirectionAndMatch( RegExpCompiler* compiler, ZoneList* lookbehind, - ZoneList* match, RegExpNode* on_success, bool read_backward, - JSRegExp::Flags flags) { + ZoneList* match, RegExpNode* on_success, + bool read_backward) { Zone* zone = compiler->zone(); RegExpNode* match_node = TextNode::CreateForCharacterRanges( - zone, match, read_backward, on_success, flags); + zone, match, read_backward, on_success); int stack_register = compiler->UnicodeLookaroundStackRegister(); int position_register = compiler->UnicodeLookaroundPositionRegister(); RegExpLookaround::Builder lookaround(false, match_node, stack_register, position_register); RegExpNode* negative_match = TextNode::CreateForCharacterRanges( - zone, lookbehind, !read_backward, lookaround.on_match_success(), flags); + zone, lookbehind, !read_backward, lookaround.on_match_success()); return lookaround.ForMatch(negative_match); } RegExpNode* MatchAndNegativeLookaroundInReadDirection( RegExpCompiler* compiler, ZoneList* match, ZoneList* lookahead, RegExpNode* on_success, - bool read_backward, JSRegExp::Flags flags) { + bool read_backward) { Zone* zone = compiler->zone(); int stack_register = compiler->UnicodeLookaroundStackRegister(); int position_register = compiler->UnicodeLookaroundPositionRegister(); RegExpLookaround::Builder lookaround(false, on_success, stack_register, position_register); RegExpNode* negative_match = TextNode::CreateForCharacterRanges( - zone, lookahead, read_backward, lookaround.on_match_success(), flags); + zone, lookahead, read_backward, lookaround.on_match_success()); return TextNode::CreateForCharacterRanges( - zone, match, read_backward, lookaround.ForMatch(negative_match), flags); + zone, match, read_backward, lookaround.ForMatch(negative_match)); } void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result, RegExpNode* on_success, - UnicodeRangeSplitter* splitter, - JSRegExp::Flags flags) { + UnicodeRangeSplitter* splitter) { ZoneList* lead_surrogates = ToCanonicalZoneList(splitter->lead_surrogates(), compiler->zone()); if (lead_surrogates == nullptr) return; @@ -313,20 +365,19 @@ void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result, // Reading backward. Assert that reading forward, there is no trail // surrogate, and then backward match the lead surrogate. match = NegativeLookaroundAgainstReadDirectionAndMatch( - compiler, trail_surrogates, lead_surrogates, on_success, true, flags); + compiler, trail_surrogates, lead_surrogates, on_success, true); } else { // Reading forward. Forward match the lead surrogate and assert that // no trail surrogate follows. match = MatchAndNegativeLookaroundInReadDirection( - compiler, lead_surrogates, trail_surrogates, on_success, false, flags); + compiler, lead_surrogates, trail_surrogates, on_success, false); } result->AddAlternative(GuardedAlternative(match)); } void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result, RegExpNode* on_success, - UnicodeRangeSplitter* splitter, - JSRegExp::Flags flags) { + UnicodeRangeSplitter* splitter) { ZoneList* trail_surrogates = ToCanonicalZoneList(splitter->trail_surrogates(), compiler->zone()); if (trail_surrogates == nullptr) return; @@ -340,12 +391,12 @@ void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result, // Reading backward. Backward match the trail surrogate and assert that no // lead surrogate precedes it. match = MatchAndNegativeLookaroundInReadDirection( - compiler, trail_surrogates, lead_surrogates, on_success, true, flags); + compiler, trail_surrogates, lead_surrogates, on_success, true); } else { // Reading forward. Assert that reading backward, there is no lead // surrogate, and then forward match the trail surrogate. match = NegativeLookaroundAgainstReadDirectionAndMatch( - compiler, lead_surrogates, trail_surrogates, on_success, false, flags); + compiler, lead_surrogates, trail_surrogates, on_success, false); } result->AddAlternative(GuardedAlternative(match)); } @@ -359,11 +410,9 @@ RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler, // we advanced into the middle of a surrogate pair, it will work out, as // nothing will match from there. We will have to advance again, consuming // the associated trail surrogate. - ZoneList* range = CharacterRange::List( - zone, CharacterRange::Range(0, String::kMaxUtf16CodeUnit)); - JSRegExp::Flags default_flags = JSRegExp::Flags(); - return TextNode::CreateForCharacterRanges(zone, range, false, on_success, - default_flags); + ZoneList* range = + CharacterRange::List(zone, CharacterRange::Range(0, kMaxUtf16CodeUnit)); + return TextNode::CreateForCharacterRanges(zone, range, false, on_success); } void AddUnicodeCaseEquivalents(ZoneList* ranges, Zone* zone) { @@ -404,48 +453,61 @@ void AddUnicodeCaseEquivalents(ZoneList* ranges, Zone* zone) { RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, RegExpNode* on_success) { set_.Canonicalize(); - Zone* zone = compiler->zone(); + Zone* const zone = compiler->zone(); ZoneList* ranges = this->ranges(zone); - if (NeedsUnicodeCaseEquivalents(flags_)) { + + if (NeedsUnicodeCaseEquivalents(compiler->flags())) { AddUnicodeCaseEquivalents(ranges, zone); } - if (IsUnicode(flags_) && !compiler->one_byte() && - !contains_split_surrogate()) { - if (is_negated()) { - ZoneList* negated = - zone->New>(2, zone); - CharacterRange::Negate(ranges, negated, zone); - ranges = negated; - } - if (ranges->length() == 0) { - JSRegExp::Flags default_flags; - RegExpCharacterClass* fail = - zone->New(zone, ranges, default_flags); - return zone->New(fail, compiler->read_backward(), on_success); - } - if (standard_type() == '*') { - return UnanchoredAdvance(compiler, on_success); - } else { - ChoiceNode* result = zone->New(2, zone); - UnicodeRangeSplitter splitter(ranges); - AddBmpCharacters(compiler, result, on_success, &splitter, flags_); - AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter, flags_); - AddLoneLeadSurrogates(compiler, result, on_success, &splitter, flags_); - AddLoneTrailSurrogates(compiler, result, on_success, &splitter, flags_); - static constexpr int kMaxRangesToInline = 32; // Arbitrary. - if (ranges->length() > kMaxRangesToInline) result->SetDoNotInline(); - return result; - } - } else { + + if (!IsUnicode(compiler->flags()) || compiler->one_byte() || + contains_split_surrogate()) { return zone->New(this, compiler->read_backward(), on_success); } + + if (is_negated()) { + ZoneList* negated = + zone->New>(2, zone); + CharacterRange::Negate(ranges, negated, zone); + ranges = negated; + } + + if (ranges->length() == 0) { + // The empty character class is used as a 'fail' node. + RegExpCharacterClass* fail = zone->New(zone, ranges); + return zone->New(fail, compiler->read_backward(), on_success); + } + + if (set_.is_standard() && + standard_type() == StandardCharacterSet::kEverything) { + return UnanchoredAdvance(compiler, on_success); + } + + // Split ranges in order to handle surrogates correctly: + // - Surrogate pairs: translate the 32-bit code point into two uc16 code + // units (irregexp operates only on code units). + // - Lone surrogates: these require lookarounds to ensure we don't match in + // the middle of a surrogate pair. + ChoiceNode* result = zone->New(2, zone); + UnicodeRangeSplitter splitter(ranges); + AddBmpCharacters(compiler, result, on_success, &splitter); + AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter); + AddLoneLeadSurrogates(compiler, result, on_success, &splitter); + AddLoneTrailSurrogates(compiler, result, on_success, &splitter); + + static constexpr int kMaxRangesToInline = 32; // Arbitrary. + if (ranges->length() > kMaxRangesToInline) result->SetDoNotInline(); + + return result; } +namespace { + int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) { RegExpAtom* atom1 = (*a)->AsAtom(); RegExpAtom* atom2 = (*b)->AsAtom(); - uc16 character1 = atom1->data().at(0); - uc16 character2 = atom2->data().at(0); + base::uc16 character1 = atom1->data().at(0); + base::uc16 character2 = atom2->data().at(0); if (character1 < character2) return -1; if (character1 > character2) return 1; return 0; @@ -453,17 +515,34 @@ int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) { #ifdef V8_INTL_SUPPORT -// Case Insensitve comparesion -int CompareFirstCharCaseInsensitve(RegExpTree* const* a, RegExpTree* const* b) { +int CompareCaseInsensitive(const icu::UnicodeString& a, + const icu::UnicodeString& b) { + return a.caseCompare(b, U_FOLD_CASE_DEFAULT); +} + +int CompareFirstCharCaseInsensitive(RegExpTree* const* a, + RegExpTree* const* b) { RegExpAtom* atom1 = (*a)->AsAtom(); RegExpAtom* atom2 = (*b)->AsAtom(); - icu::UnicodeString character1(atom1->data().at(0)); - return character1.caseCompare(atom2->data().at(0), U_FOLD_CASE_DEFAULT); + return CompareCaseInsensitive(icu::UnicodeString{atom1->data().at(0)}, + icu::UnicodeString{atom2->data().at(0)}); +} + +bool Equals(bool ignore_case, const icu::UnicodeString& a, + const icu::UnicodeString& b) { + if (a == b) return true; + if (ignore_case) return CompareCaseInsensitive(a, b) == 0; + return false; // Case-sensitive equality already checked above. +} + +bool CharAtEquals(bool ignore_case, int index, const RegExpAtom* a, + const RegExpAtom* b) { + return Equals(ignore_case, a->data().at(index), b->data().at(index)); } #else -static unibrow::uchar Canonical( +unibrow::uchar Canonical( unibrow::Mapping* canonicalize, unibrow::uchar c) { unibrow::uchar chars[unibrow::Ecma262Canonicalize::kMaxWidth]; @@ -474,22 +553,47 @@ static unibrow::uchar Canonical( return canonical; } -int CompareFirstCharCaseIndependent( +int CompareCaseInsensitive( + unibrow::Mapping* canonicalize, + unibrow::uchar a, unibrow::uchar b) { + if (a == b) return 0; + if (a >= 'a' || b >= 'a') { + a = Canonical(canonicalize, a); + b = Canonical(canonicalize, b); + } + return static_cast(a) - static_cast(b); +} + +int CompareFirstCharCaseInsensitive( unibrow::Mapping* canonicalize, RegExpTree* const* a, RegExpTree* const* b) { RegExpAtom* atom1 = (*a)->AsAtom(); RegExpAtom* atom2 = (*b)->AsAtom(); - unibrow::uchar character1 = atom1->data().at(0); - unibrow::uchar character2 = atom2->data().at(0); - if (character1 == character2) return 0; - if (character1 >= 'a' || character2 >= 'a') { - character1 = Canonical(canonicalize, character1); - character2 = Canonical(canonicalize, character2); - } - return static_cast(character1) - static_cast(character2); + return CompareCaseInsensitive(canonicalize, atom1->data().at(0), + atom2->data().at(0)); } + +bool Equals(bool ignore_case, + unibrow::Mapping* canonicalize, + unibrow::uchar a, unibrow::uchar b) { + if (a == b) return true; + if (ignore_case) { + return CompareCaseInsensitive(canonicalize, a, b) == 0; + } + return false; // Case-sensitive equality already checked above. +} + +bool CharAtEquals(bool ignore_case, + unibrow::Mapping* canonicalize, + int index, const RegExpAtom* a, const RegExpAtom* b) { + return Equals(ignore_case, canonicalize, a->data().at(index), + b->data().at(index)); +} + #endif // V8_INTL_SUPPORT +} // namespace + // We can stable sort runs of atoms, since the order does not matter if they // start with different characters. // Returns true if any consecutive atoms were found. @@ -506,12 +610,10 @@ bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) { // i is length or it is the index of an atom. if (i == length) break; int first_atom = i; - JSRegExp::Flags flags = alternatives->at(i)->AsAtom()->flags(); i++; while (i < length) { RegExpTree* alternative = alternatives->at(i); if (!alternative->IsAtom()) break; - if (alternative->AsAtom()->flags() != flags) break; i++; } // Sort atoms to get ones with common prefixes together. @@ -523,16 +625,16 @@ bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) { DCHECK_LT(first_atom, alternatives->length()); DCHECK_LE(i, alternatives->length()); DCHECK_LE(first_atom, i); - if (IgnoreCase(flags)) { + if (IsIgnoreCase(compiler->flags())) { #ifdef V8_INTL_SUPPORT - alternatives->StableSort(CompareFirstCharCaseInsensitve, first_atom, + alternatives->StableSort(CompareFirstCharCaseInsensitive, first_atom, i - first_atom); #else unibrow::Mapping* canonicalize = compiler->isolate()->regexp_macro_assembler_canonicalize(); auto compare_closure = [canonicalize](RegExpTree* const* a, RegExpTree* const* b) { - return CompareFirstCharCaseIndependent(canonicalize, a, b); + return CompareFirstCharCaseInsensitive(canonicalize, a, b); }; alternatives->StableSort(compare_closure, first_atom, i - first_atom); #endif // V8_INTL_SUPPORT @@ -549,6 +651,7 @@ void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) { Zone* zone = compiler->zone(); ZoneList* alternatives = this->alternatives(); int length = alternatives->length(); + const bool ignore_case = IsIgnoreCase(compiler->flags()); int write_posn = 0; int i = 0; @@ -560,11 +663,15 @@ void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) { continue; } RegExpAtom* const atom = alternative->AsAtom(); - JSRegExp::Flags flags = atom->flags(); #ifdef V8_INTL_SUPPORT icu::UnicodeString common_prefix(atom->data().at(0)); #else + unibrow::Mapping* const canonicalize = + compiler->isolate()->regexp_macro_assembler_canonicalize(); unibrow::uchar common_prefix = atom->data().at(0); + if (ignore_case) { + common_prefix = Canonical(canonicalize, common_prefix); + } #endif // V8_INTL_SUPPORT int first_with_prefix = i; int prefix_length = atom->length(); @@ -572,27 +679,15 @@ void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) { while (i < length) { alternative = alternatives->at(i); if (!alternative->IsAtom()) break; - RegExpAtom* const atom = alternative->AsAtom(); - if (atom->flags() != flags) break; + RegExpAtom* const alt_atom = alternative->AsAtom(); #ifdef V8_INTL_SUPPORT - icu::UnicodeString new_prefix(atom->data().at(0)); - if (new_prefix != common_prefix) { - if (!IgnoreCase(flags)) break; - if (common_prefix.caseCompare(new_prefix, U_FOLD_CASE_DEFAULT) != 0) - break; - } + icu::UnicodeString new_prefix(alt_atom->data().at(0)); + if (!Equals(ignore_case, new_prefix, common_prefix)) break; #else - unibrow::uchar new_prefix = atom->data().at(0); - if (new_prefix != common_prefix) { - if (!IgnoreCase(flags)) break; - unibrow::Mapping* canonicalize = - compiler->isolate()->regexp_macro_assembler_canonicalize(); - new_prefix = Canonical(canonicalize, new_prefix); - common_prefix = Canonical(canonicalize, common_prefix); - if (new_prefix != common_prefix) break; - } + unibrow::uchar new_prefix = alt_atom->data().at(0); + if (!Equals(ignore_case, canonicalize, new_prefix, common_prefix)) break; #endif // V8_INTL_SUPPORT - prefix_length = std::min(prefix_length, atom->length()); + prefix_length = std::min(prefix_length, alt_atom->length()); i++; } if (i > first_with_prefix + 2) { @@ -602,19 +697,24 @@ void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) { // common prefix if the terms were similar or presorted in the input. // Find out how long the common prefix is. int run_length = i - first_with_prefix; - RegExpAtom* const atom = alternatives->at(first_with_prefix)->AsAtom(); + RegExpAtom* const alt_atom = + alternatives->at(first_with_prefix)->AsAtom(); for (int j = 1; j < run_length && prefix_length > 1; j++) { RegExpAtom* old_atom = alternatives->at(j + first_with_prefix)->AsAtom(); for (int k = 1; k < prefix_length; k++) { - if (atom->data().at(k) != old_atom->data().at(k)) { +#ifdef V8_INTL_SUPPORT + if (!CharAtEquals(ignore_case, k, alt_atom, old_atom)) { +#else + if (!CharAtEquals(ignore_case, canonicalize, k, alt_atom, old_atom)) { +#endif // V8_INTL_SUPPORT prefix_length = k; break; } } } - RegExpAtom* prefix = zone->New( - atom->data().SubVector(0, prefix_length), flags); + RegExpAtom* prefix = + zone->New(alt_atom->data().SubVector(0, prefix_length)); ZoneList* pair = zone->New>(2, zone); pair->Add(prefix, zone); ZoneList* suffixes = @@ -627,8 +727,7 @@ void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) { suffixes->Add(zone->New(), zone); } else { RegExpTree* suffix = zone->New( - old_atom->data().SubVector(prefix_length, old_atom->length()), - flags); + old_atom->data().SubVector(prefix_length, old_atom->length())); suffixes->Add(suffix, zone); } } @@ -666,7 +765,7 @@ void RegExpDisjunction::FixSingleCharacterDisjunctions( i++; continue; } - JSRegExp::Flags flags = atom->flags(); + const RegExpFlags flags = compiler->flags(); DCHECK_IMPLIES(IsUnicode(flags), !unibrow::Utf16::IsLeadSurrogate(atom->data().at(0))); bool contains_trail_surrogate = @@ -678,13 +777,12 @@ void RegExpDisjunction::FixSingleCharacterDisjunctions( while (i < length) { alternative = alternatives->at(i); if (!alternative->IsAtom()) break; - RegExpAtom* const atom = alternative->AsAtom(); - if (atom->length() != 1) break; - if (atom->flags() != flags) break; + RegExpAtom* const alt_atom = alternative->AsAtom(); + if (alt_atom->length() != 1) break; DCHECK_IMPLIES(IsUnicode(flags), - !unibrow::Utf16::IsLeadSurrogate(atom->data().at(0))); + !unibrow::Utf16::IsLeadSurrogate(alt_atom->data().at(0))); contains_trail_surrogate |= - unibrow::Utf16::IsTrailSurrogate(atom->data().at(0)); + unibrow::Utf16::IsTrailSurrogate(alt_atom->data().at(0)); i++; } if (i > first_in_run + 1) { @@ -701,8 +799,8 @@ void RegExpDisjunction::FixSingleCharacterDisjunctions( if (IsUnicode(flags) && contains_trail_surrogate) { character_class_flags = RegExpCharacterClass::CONTAINS_SPLIT_SURROGATE; } - alternatives->at(write_posn++) = zone->New( - zone, ranges, flags, character_class_flags); + alternatives->at(write_posn++) = + zone->New(zone, ranges, character_class_flags); } else { // Just copy any trivial alternatives. for (int j = first_in_run; j < i; j++) { @@ -715,6 +813,8 @@ void RegExpDisjunction::FixSingleCharacterDisjunctions( RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler, RegExpNode* on_success) { + compiler->ToNodeMaybeCheckForStackOverflow(); + ZoneList* alternatives = this->alternatives(); if (alternatives->length() > 2) { @@ -748,13 +848,14 @@ namespace { // \B to (?<=\w)(?=\w)|(?<=\W)(?=\W) RegExpNode* BoundaryAssertionAsLookaround(RegExpCompiler* compiler, RegExpNode* on_success, - RegExpAssertion::AssertionType type, - JSRegExp::Flags flags) { - DCHECK(NeedsUnicodeCaseEquivalents(flags)); + RegExpAssertion::Type type, + RegExpFlags flags) { + CHECK(NeedsUnicodeCaseEquivalents(flags)); Zone* zone = compiler->zone(); ZoneList* word_range = zone->New>(2, zone); - CharacterRange::AddClassEscape('w', word_range, true, zone); + CharacterRange::AddClassEscape(StandardCharacterSet::kWord, word_range, true, + zone); int stack_register = compiler->UnicodeLookaroundStackRegister(); int position_register = compiler->UnicodeLookaroundPositionRegister(); ChoiceNode* result = zone->New(2, zone); @@ -763,18 +864,18 @@ RegExpNode* BoundaryAssertionAsLookaround(RegExpCompiler* compiler, for (int i = 0; i < 2; i++) { bool lookbehind_for_word = i == 0; bool lookahead_for_word = - (type == RegExpAssertion::BOUNDARY) ^ lookbehind_for_word; + (type == RegExpAssertion::Type::BOUNDARY) ^ lookbehind_for_word; // Look to the left. RegExpLookaround::Builder lookbehind(lookbehind_for_word, on_success, stack_register, position_register); RegExpNode* backward = TextNode::CreateForCharacterRanges( - zone, word_range, true, lookbehind.on_match_success(), flags); + zone, word_range, true, lookbehind.on_match_success()); // Look to the right. RegExpLookaround::Builder lookahead(lookahead_for_word, lookbehind.ForMatch(backward), stack_register, position_register); RegExpNode* forward = TextNode::CreateForCharacterRanges( - zone, word_range, false, lookahead.on_match_success(), flags); + zone, word_range, false, lookahead.on_match_success()); result->AddAlternative(GuardedAlternative(lookahead.ForMatch(forward))); } return result; @@ -787,23 +888,24 @@ RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler, Zone* zone = compiler->zone(); switch (assertion_type()) { - case START_OF_LINE: + case Type::START_OF_LINE: return AssertionNode::AfterNewline(on_success); - case START_OF_INPUT: + case Type::START_OF_INPUT: return AssertionNode::AtStart(on_success); - case BOUNDARY: - return NeedsUnicodeCaseEquivalents(flags_) - ? BoundaryAssertionAsLookaround(compiler, on_success, BOUNDARY, - flags_) + case Type::BOUNDARY: + return NeedsUnicodeCaseEquivalents(compiler->flags()) + ? BoundaryAssertionAsLookaround( + compiler, on_success, Type::BOUNDARY, compiler->flags()) : AssertionNode::AtBoundary(on_success); - case NON_BOUNDARY: - return NeedsUnicodeCaseEquivalents(flags_) + case Type::NON_BOUNDARY: + return NeedsUnicodeCaseEquivalents(compiler->flags()) ? BoundaryAssertionAsLookaround(compiler, on_success, - NON_BOUNDARY, flags_) + Type::NON_BOUNDARY, + compiler->flags()) : AssertionNode::AtNonBoundary(on_success); - case END_OF_INPUT: + case Type::END_OF_INPUT: return AssertionNode::AtEnd(on_success); - case END_OF_LINE: { + case Type::END_OF_LINE: { // Compile $ in multiline regexps as an alternation with a positive // lookahead in one side and an end-of-input on the other side. // We need two registers for the lookahead. @@ -814,10 +916,10 @@ RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler, // Create a newline atom. ZoneList* newline_ranges = zone->New>(3, zone); - CharacterRange::AddClassEscape('n', newline_ranges, false, zone); - JSRegExp::Flags default_flags = JSRegExp::Flags(); - RegExpCharacterClass* newline_atom = - zone->New('n', default_flags); + CharacterRange::AddClassEscape(StandardCharacterSet::kLineTerminator, + newline_ranges, false, zone); + RegExpCharacterClass* newline_atom = zone->New( + StandardCharacterSet::kLineTerminator); TextNode* newline_matcher = zone->New(newline_atom, false, ActionNode::PositiveSubmatchSuccess( @@ -838,7 +940,6 @@ RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler, default: UNREACHABLE(); } - return on_success; } RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler, @@ -854,6 +955,11 @@ RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler, return on_success; } +RegExpNode* RegExpGroup::ToNode(RegExpCompiler* compiler, + RegExpNode* on_success) { + return body_->ToNode(compiler, on_success); +} + RegExpLookaround::Builder::Builder(bool is_positive, RegExpNode* on_success, int stack_pointer_register, int position_register, @@ -970,18 +1076,14 @@ class AssertionSequenceRewriter final { // Bitfield of all seen assertions. uint32_t seen_assertions = 0; - STATIC_ASSERT(RegExpAssertion::LAST_TYPE < kUInt32Size * kBitsPerByte); - - // Flags must match for folding. - JSRegExp::Flags flags = terms_->at(from)->AsAssertion()->flags(); - bool saw_mismatched_flags = false; + static_assert(static_cast(RegExpAssertion::Type::LAST_ASSERTION_TYPE) < + kUInt32Size * kBitsPerByte); for (int i = from; i < to; i++) { RegExpAssertion* t = terms_->at(i)->AsAssertion(); - if (t->flags() != flags) saw_mismatched_flags = true; - const uint32_t bit = 1 << t->assertion_type(); + const uint32_t bit = 1 << static_cast(t->assertion_type()); - if ((seen_assertions & bit) && !saw_mismatched_flags) { + if (seen_assertions & bit) { // Fold duplicates. terms_->Set(i, zone_->New()); } @@ -991,7 +1093,8 @@ class AssertionSequenceRewriter final { // Collapse failures. const uint32_t always_fails_mask = - 1 << RegExpAssertion::BOUNDARY | 1 << RegExpAssertion::NON_BOUNDARY; + 1 << static_cast(RegExpAssertion::Type::BOUNDARY) | + 1 << static_cast(RegExpAssertion::Type::NON_BOUNDARY); if ((seen_assertions & always_fails_mask) == always_fails_mask) { ReplaceSequenceWithFailure(from, to); } @@ -1003,8 +1106,7 @@ class AssertionSequenceRewriter final { // negated '*' (everything) range serves the purpose. ZoneList* ranges = zone_->New>(0, zone_); - RegExpCharacterClass* cc = - zone_->New(zone_, ranges, JSRegExp::Flags()); + RegExpCharacterClass* cc = zone_->New(zone_, ranges); terms_->Set(from, cc); // Zero out the rest. @@ -1024,6 +1126,8 @@ class AssertionSequenceRewriter final { RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler, RegExpNode* on_success) { + compiler->ToNodeMaybeCheckForStackOverflow(); + ZoneList* children = nodes(); AssertionSequenceRewriter::MaybeRewrite(children, compiler->zone()); @@ -1041,8 +1145,10 @@ RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler, return current; } -static void AddClass(const int* elmv, int elmc, - ZoneList* ranges, Zone* zone) { +namespace { + +void AddClass(const int* elmv, int elmc, ZoneList* ranges, + Zone* zone) { elmc--; DCHECK_EQ(kRangeEndMarker, elmv[elmc]); for (int i = 0; i < elmc; i += 2) { @@ -1051,26 +1157,31 @@ static void AddClass(const int* elmv, int elmc, } } -static void AddClassNegated(const int* elmv, int elmc, - ZoneList* ranges, Zone* zone) { +void AddClassNegated(const int* elmv, int elmc, + ZoneList* ranges, Zone* zone) { elmc--; DCHECK_EQ(kRangeEndMarker, elmv[elmc]); DCHECK_NE(0x0000, elmv[0]); - DCHECK_NE(String::kMaxCodePoint, elmv[elmc - 1]); - uc16 last = 0x0000; + DCHECK_NE(kMaxCodePoint, elmv[elmc - 1]); + base::uc16 last = 0x0000; for (int i = 0; i < elmc; i += 2) { DCHECK(last <= elmv[i] - 1); DCHECK(elmv[i] < elmv[i + 1]); ranges->Add(CharacterRange::Range(last, elmv[i] - 1), zone); last = elmv[i + 1]; } - ranges->Add(CharacterRange::Range(last, String::kMaxCodePoint), zone); + ranges->Add(CharacterRange::Range(last, kMaxCodePoint), zone); } -void CharacterRange::AddClassEscape(char type, ZoneList* ranges, +} // namespace + +void CharacterRange::AddClassEscape(StandardCharacterSet standard_character_set, + ZoneList* ranges, bool add_unicode_case_equivalents, Zone* zone) { - if (add_unicode_case_equivalents && (type == 'w' || type == 'W')) { + if (add_unicode_case_equivalents && + (standard_character_set == StandardCharacterSet::kWord || + standard_character_set == StandardCharacterSet::kNotWord)) { // See #sec-runtime-semantics-wordcharacters-abstract-operation // In case of unicode and ignore_case, we need to create the closure over // case equivalent characters before negating. @@ -1078,7 +1189,7 @@ void CharacterRange::AddClassEscape(char type, ZoneList* ranges, zone->New>(2, zone); AddClass(kWordRanges, kWordRangeCount, new_ranges, zone); AddUnicodeCaseEquivalents(new_ranges, zone); - if (type == 'W') { + if (standard_character_set == StandardCharacterSet::kNotWord) { ZoneList* negated = zone->New>(2, zone); CharacterRange::Negate(new_ranges, negated, zone); @@ -1087,54 +1198,44 @@ void CharacterRange::AddClassEscape(char type, ZoneList* ranges, ranges->AddAll(*new_ranges, zone); return; } - AddClassEscape(type, ranges, zone); -} -void CharacterRange::AddClassEscape(char type, ZoneList* ranges, - Zone* zone) { - switch (type) { - case 's': + switch (standard_character_set) { + case StandardCharacterSet::kWhitespace: AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone); break; - case 'S': + case StandardCharacterSet::kNotWhitespace: AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone); break; - case 'w': + case StandardCharacterSet::kWord: AddClass(kWordRanges, kWordRangeCount, ranges, zone); break; - case 'W': + case StandardCharacterSet::kNotWord: AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone); break; - case 'd': + case StandardCharacterSet::kDigit: AddClass(kDigitRanges, kDigitRangeCount, ranges, zone); break; - case 'D': + case StandardCharacterSet::kNotDigit: AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone); break; - case '.': + // This is the set of characters matched by the $ and ^ symbols + // in multiline mode. + case StandardCharacterSet::kLineTerminator: + AddClass(kLineTerminatorRanges, kLineTerminatorRangeCount, ranges, zone); + break; + case StandardCharacterSet::kNotLineTerminator: AddClassNegated(kLineTerminatorRanges, kLineTerminatorRangeCount, ranges, zone); break; // This is not a character range as defined by the spec but a // convenient shorthand for a character class that matches any // character. - case '*': + case StandardCharacterSet::kEverything: ranges->Add(CharacterRange::Everything(), zone); break; - // This is the set of characters matched by the $ and ^ symbols - // in multiline mode. - case 'n': - AddClass(kLineTerminatorRanges, kLineTerminatorRangeCount, ranges, zone); - break; - default: - UNREACHABLE(); } } -Vector CharacterRange::GetWordBounds() { - return Vector(kWordRanges, kWordRangeCount - 1); -} - // static void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone, ZoneList* ranges, @@ -1145,14 +1246,14 @@ void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone, icu::UnicodeSet others; for (int i = 0; i < range_count; i++) { CharacterRange range = ranges->at(i); - uc32 from = range.from(); - if (from > String::kMaxUtf16CodeUnit) continue; - uc32 to = std::min({range.to(), String::kMaxUtf16CodeUnitU}); + base::uc32 from = range.from(); + if (from > kMaxUtf16CodeUnit) continue; + base::uc32 to = std::min({range.to(), kMaxUtf16CodeUnitU}); // Nothing to be done for surrogates. if (from >= kLeadSurrogateStart && to <= kTrailSurrogateEnd) continue; if (is_one_byte && !RangeContainsLatin1Equivalents(range)) { - if (from > String::kMaxOneByteCharCode) continue; - if (to > String::kMaxOneByteCharCode) to = String::kMaxOneByteCharCode; + if (from > kMaxOneByteCharCode) continue; + if (to > kMaxOneByteCharCode) to = kMaxOneByteCharCode; } others.add(from, to); } @@ -1188,21 +1289,21 @@ void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone, #else for (int i = 0; i < range_count; i++) { CharacterRange range = ranges->at(i); - uc32 bottom = range.from(); - if (bottom > String::kMaxUtf16CodeUnit) continue; - uc32 top = std::min({range.to(), String::kMaxUtf16CodeUnitU}); + base::uc32 bottom = range.from(); + if (bottom > kMaxUtf16CodeUnit) continue; + base::uc32 top = std::min({range.to(), kMaxUtf16CodeUnitU}); // Nothing to be done for surrogates. if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) continue; if (is_one_byte && !RangeContainsLatin1Equivalents(range)) { - if (bottom > String::kMaxOneByteCharCode) continue; - if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode; + if (bottom > kMaxOneByteCharCode) continue; + if (top > kMaxOneByteCharCode) top = kMaxOneByteCharCode; } unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; if (top == bottom) { // If this is a singleton we just expand the one character. int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars); for (int i = 0; i < length; i++) { - uc32 chr = chars[i]; + base::uc32 chr = chars[i]; if (chr != bottom) { ranges->Add(CharacterRange::Singleton(chars[i]), zone); } @@ -1225,11 +1326,11 @@ void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone, // block we do this for all the blocks covered by the range (handling // characters that is not in a block as a "singleton block"). unibrow::uchar equivalents[unibrow::Ecma262UnCanonicalize::kMaxWidth]; - uc32 pos = bottom; + base::uc32 pos = bottom; while (pos <= top) { int length = isolate->jsregexp_canonrange()->get(pos, '\0', equivalents); - uc32 block_end; + base::uc32 block_end; if (length == 0) { block_end = pos; } else { @@ -1240,9 +1341,9 @@ void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone, length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0', equivalents); for (int i = 0; i < length; i++) { - uc32 c = equivalents[i]; - uc32 range_from = c - (block_end - pos); - uc32 range_to = c - (block_end - end); + base::uc32 c = equivalents[i]; + base::uc32 range_from = c - (block_end - pos); + base::uc32 range_to = c - (block_end - end); if (!(bottom <= range_from && range_to <= top)) { ranges->Add(CharacterRange::Range(range_from, range_to), zone); } @@ -1258,7 +1359,7 @@ bool CharacterRange::IsCanonical(ZoneList* ranges) { DCHECK_NOT_NULL(ranges); int n = ranges->length(); if (n <= 1) return true; - uc32 max = ranges->at(0).to(); + base::uc32 max = ranges->at(0).to(); for (int i = 1; i < n; i++) { CharacterRange next_range = ranges->at(i); if (next_range.from() <= max + 1) return false; @@ -1270,15 +1371,17 @@ bool CharacterRange::IsCanonical(ZoneList* ranges) { ZoneList* CharacterSet::ranges(Zone* zone) { if (ranges_ == nullptr) { ranges_ = zone->New>(2, zone); - CharacterRange::AddClassEscape(standard_set_type_, ranges_, false, zone); + CharacterRange::AddClassEscape(standard_set_type_.value(), ranges_, false, + zone); } return ranges_; } +namespace { + // Move a number of elements in a zonelist to another position // in the same list. Handles overlapping source and target areas. -static void MoveRanges(ZoneList* list, int from, int to, - int count) { +void MoveRanges(ZoneList* list, int from, int to, int count) { // Ranges are potentially overlapping. if (from < to) { for (int i = count - 1; i >= 0; i--) { @@ -1291,15 +1394,15 @@ static void MoveRanges(ZoneList* list, int from, int to, } } -static int InsertRangeInCanonicalList(ZoneList* list, int count, - CharacterRange insert) { +int InsertRangeInCanonicalList(ZoneList* list, int count, + CharacterRange insert) { // Inserts a range into list[0..count[, which must be sorted // by from value and non-overlapping and non-adjacent, using at most // list[0..count] for the result. Returns the number of resulting // canonicalized ranges. Inserting a range may collapse existing ranges into // fewer ranges, so the return value can be anything in the range 1..count+1. - uc32 from = insert.from(); - uc32 to = insert.to(); + base::uc32 from = insert.from(); + base::uc32 to = insert.to(); int start_pos = 0; int end_pos = count; for (int i = count - 1; i >= 0; i--) { @@ -1347,6 +1450,8 @@ static int InsertRangeInCanonicalList(ZoneList* list, int count, return count - (end_pos - start_pos) + 1; } +} // namespace + void CharacterSet::Canonicalize() { // Special/default classes are always considered canonical. The result // of calling ranges() will be sorted. @@ -1354,12 +1459,13 @@ void CharacterSet::Canonicalize() { CharacterRange::Canonicalize(ranges_); } +// static void CharacterRange::Canonicalize(ZoneList* character_ranges) { if (character_ranges->length() <= 1) return; // Check whether ranges are already canonical (increasing, non-overlapping, // non-adjacent). int n = character_ranges->length(); - uc32 max = character_ranges->at(0).to(); + base::uc32 max = character_ranges->at(0).to(); int i = 1; while (i < n) { CharacterRange current = character_ranges->at(i); @@ -1389,13 +1495,14 @@ void CharacterRange::Canonicalize(ZoneList* character_ranges) { DCHECK(CharacterRange::IsCanonical(character_ranges)); } +// static void CharacterRange::Negate(ZoneList* ranges, ZoneList* negated_ranges, Zone* zone) { DCHECK(CharacterRange::IsCanonical(ranges)); DCHECK_EQ(0, negated_ranges->length()); int range_count = ranges->length(); - uc32 from = 0; + base::uc32 from = 0; int i = 0; if (range_count > 0 && ranges->at(0).from() == 0) { from = ranges->at(0).to() + 1; @@ -1407,12 +1514,34 @@ void CharacterRange::Negate(ZoneList* ranges, from = range.to() + 1; i++; } - if (from < String::kMaxCodePoint) { - negated_ranges->Add(CharacterRange::Range(from, String::kMaxCodePoint), - zone); + if (from < kMaxCodePoint) { + negated_ranges->Add(CharacterRange::Range(from, kMaxCodePoint), zone); } } +// static +void CharacterRange::ClampToOneByte(ZoneList* ranges) { + DCHECK(IsCanonical(ranges)); + + // Drop all ranges that don't contain one-byte code units, and clamp the last + // range s.t. it likewise only contains one-byte code units. Note this relies + // on `ranges` being canonicalized, i.e. sorted and non-overlapping. + + static constexpr base::uc32 max_char = String::kMaxOneByteCharCodeU; + int n = ranges->length(); + for (; n > 0; n--) { + CharacterRange& r = ranges->at(n - 1); + if (r.from() <= max_char) { + r.to_ = std::min(r.to_, max_char); + break; + } + } + + ranges->Rewind(n); +} + +namespace { + // Scoped object to keep track of how much we unroll quantifier loops in the // regexp graph generator. class RegExpExpansionLimiter { @@ -1450,6 +1579,8 @@ class RegExpExpansionLimiter { DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter); }; +} // namespace + RegExpNode* RegExpQuantifier::ToNode(int min, int max, bool is_greedy, RegExpTree* body, RegExpCompiler* compiler, RegExpNode* on_success, diff --git a/js/src/irregexp/imported/regexp-compiler.cc b/js/src/irregexp/imported/regexp-compiler.cc index 3d44ba6f16df..5de78811cdad 100644 --- a/js/src/irregexp/imported/regexp-compiler.cc +++ b/js/src/irregexp/imported/regexp-compiler.cc @@ -5,11 +5,9 @@ #include "irregexp/imported/regexp-compiler.h" #include "irregexp/imported/regexp-macro-assembler-arch.h" -#ifdef V8_INTL_SUPPORT -#include "irregexp/imported/special-case.h" -#endif // V8_INTL_SUPPORT #ifdef V8_INTL_SUPPORT +#include "irregexp/imported/special-case.h" #include "unicode/locid.h" #include "unicode/uniset.h" #include "unicode/utypes.h" @@ -171,17 +169,17 @@ using namespace regexp_compiler_constants; // NOLINT(build/namespaces) namespace { -constexpr uc32 MaxCodeUnit(const bool one_byte) { - STATIC_ASSERT(String::kMaxOneByteCharCodeU <= +constexpr base::uc32 MaxCodeUnit(const bool one_byte) { + static_assert(String::kMaxOneByteCharCodeU <= std::numeric_limits::max()); - STATIC_ASSERT(String::kMaxUtf16CodeUnitU <= + static_assert(String::kMaxUtf16CodeUnitU <= std::numeric_limits::max()); return one_byte ? String::kMaxOneByteCharCodeU : String::kMaxUtf16CodeUnitU; } constexpr uint32_t CharMask(const bool one_byte) { - STATIC_ASSERT(base::bits::IsPowerOfTwo(String::kMaxOneByteCharCodeU + 1)); - STATIC_ASSERT(base::bits::IsPowerOfTwo(String::kMaxUtf16CodeUnitU + 1)); + static_assert(base::bits::IsPowerOfTwo(String::kMaxOneByteCharCodeU + 1)); + static_assert(base::bits::IsPowerOfTwo(String::kMaxUtf16CodeUnitU + 1)); return MaxCodeUnit(one_byte); } @@ -235,12 +233,13 @@ class RecursionCheck { // Attempts to compile the regexp using an Irregexp code generator. Returns // a fixed array or a null handle depending on whether it succeeded. RegExpCompiler::RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count, - bool one_byte) + RegExpFlags flags, bool one_byte) : next_register_(JSRegExp::RegistersForCaptureCount(capture_count)), unicode_lookaround_stack_register_(kNoRegister), unicode_lookaround_position_register_(kNoRegister), work_list_(nullptr), recursion_depth_(0), + flags_(flags), one_byte_(one_byte), reg_exp_too_big_(false), limiting_recursion_(false), @@ -274,6 +273,9 @@ RegExpCompiler::CompilationResult RegExpCompiler::Assemble( if (!node->label()->is_bound()) node->Emit(this, &new_trace); } if (reg_exp_too_big_) { + if (FLAG_correctness_fuzzer_suppressions) { + FATAL("Aborting on excess zone allocation"); + } macro_assembler_->AbortedCodeGeneration(); return CompilationResult::RegExpTooBig(); } @@ -480,7 +482,6 @@ void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler, } default: UNREACHABLE(); - break; } } } @@ -734,7 +735,7 @@ namespace { #ifdef DEBUG bool ContainsOnlyUtf16CodeUnits(unibrow::uchar* chars, int length) { - STATIC_ASSERT(sizeof(unibrow::uchar) == 4); + static_assert(sizeof(unibrow::uchar) == 4); for (int i = 0; i < length; i++) { if (chars[i] > String::kMaxUtf16CodeUnit) return false; } @@ -742,14 +743,11 @@ bool ContainsOnlyUtf16CodeUnits(unibrow::uchar* chars, int length) { } #endif // DEBUG -} // namespace - // Returns the number of characters in the equivalence class, omitting those // that cannot occur in the source string because it is Latin1. -static int GetCaseIndependentLetters(Isolate* isolate, uc16 character, - bool one_byte_subject, - unibrow::uchar* letters, - int letter_length) { +int GetCaseIndependentLetters(Isolate* isolate, base::uc16 character, + bool one_byte_subject, unibrow::uchar* letters, + int letter_length) { #ifdef V8_INTL_SUPPORT if (RegExpCaseFolding::IgnoreSet().contains(character)) { letters[0] = character; @@ -809,10 +807,9 @@ static int GetCaseIndependentLetters(Isolate* isolate, uc16 character, #endif // V8_INTL_SUPPORT } -static inline bool EmitSimpleCharacter(Isolate* isolate, - RegExpCompiler* compiler, uc16 c, - Label* on_failure, int cp_offset, - bool check, bool preloaded) { +inline bool EmitSimpleCharacter(Isolate* isolate, RegExpCompiler* compiler, + base::uc16 c, Label* on_failure, int cp_offset, + bool check, bool preloaded) { RegExpMacroAssembler* assembler = compiler->macro_assembler(); bool bound_checked = false; if (!preloaded) { @@ -825,9 +822,9 @@ static inline bool EmitSimpleCharacter(Isolate* isolate, // Only emits non-letters (things that don't have case). Only used for case // independent matches. -static inline bool EmitAtomNonLetter(Isolate* isolate, RegExpCompiler* compiler, - uc16 c, Label* on_failure, int cp_offset, - bool check, bool preloaded) { +inline bool EmitAtomNonLetter(Isolate* isolate, RegExpCompiler* compiler, + base::uc16 c, Label* on_failure, int cp_offset, + bool check, bool preloaded) { RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); bool one_byte = compiler->one_byte(); unibrow::uchar chars[4]; @@ -854,28 +851,28 @@ static inline bool EmitAtomNonLetter(Isolate* isolate, RegExpCompiler* compiler, return checked; } -static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler, - bool one_byte, uc16 c1, uc16 c2, - Label* on_failure) { +bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler, + bool one_byte, base::uc16 c1, base::uc16 c2, + Label* on_failure) { const uint32_t char_mask = CharMask(one_byte); - uc16 exor = c1 ^ c2; + base::uc16 exor = c1 ^ c2; // Check whether exor has only one bit set. if (((exor - 1) & exor) == 0) { // If c1 and c2 differ only by one bit. // Ecma262UnCanonicalize always gives the highest number last. DCHECK(c2 > c1); - uc16 mask = char_mask ^ exor; + base::uc16 mask = char_mask ^ exor; macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure); return true; } DCHECK(c2 > c1); - uc16 diff = c2 - c1; + base::uc16 diff = c2 - c1; if (((diff - 1) & diff) == 0 && c1 >= diff) { // If the characters differ by 2^n but don't differ by one bit then // subtract the difference from the found character, then do the or // trick. We avoid the theoretical case where negative numbers are // involved in order to simplify code generation. - uc16 mask = char_mask ^ diff; + base::uc16 mask = char_mask ^ diff; macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff, diff, mask, on_failure); return true; @@ -885,9 +882,9 @@ static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler, // Only emits letters (things that have case). Only used for case independent // matches. -static inline bool EmitAtomLetter(Isolate* isolate, RegExpCompiler* compiler, - uc16 c, Label* on_failure, int cp_offset, - bool check, bool preloaded) { +inline bool EmitAtomLetter(Isolate* isolate, RegExpCompiler* compiler, + base::uc16 c, Label* on_failure, int cp_offset, + bool check, bool preloaded) { RegExpMacroAssembler* macro_assembler = compiler->macro_assembler(); bool one_byte = compiler->one_byte(); unibrow::uchar chars[4]; @@ -925,9 +922,9 @@ static inline bool EmitAtomLetter(Isolate* isolate, RegExpCompiler* compiler, return true; } -static void EmitBoundaryTest(RegExpMacroAssembler* masm, int border, - Label* fall_through, Label* above_or_equal, - Label* below) { +void EmitBoundaryTest(RegExpMacroAssembler* masm, int border, + Label* fall_through, Label* above_or_equal, + Label* below) { if (below != fall_through) { masm->CheckCharacterLT(border, below); if (above_or_equal != fall_through) masm->GoTo(above_or_equal); @@ -936,9 +933,9 @@ static void EmitBoundaryTest(RegExpMacroAssembler* masm, int border, } } -static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm, int first, - int last, Label* fall_through, - Label* in_range, Label* out_of_range) { +void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm, int first, int last, + Label* fall_through, Label* in_range, + Label* out_of_range) { if (in_range == fall_through) { if (first == last) { masm->CheckNotCharacter(first, out_of_range); @@ -957,15 +954,15 @@ static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm, int first, // even_label is for ranges[i] to ranges[i + 1] where i - start_index is even. // odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd. -static void EmitUseLookupTable(RegExpMacroAssembler* masm, - ZoneList* ranges, uint32_t start_index, - uint32_t end_index, uc32 min_char, - Label* fall_through, Label* even_label, - Label* odd_label) { +void EmitUseLookupTable(RegExpMacroAssembler* masm, + ZoneList* ranges, uint32_t start_index, + uint32_t end_index, base::uc32 min_char, + Label* fall_through, Label* even_label, + Label* odd_label) { static const uint32_t kSize = RegExpMacroAssembler::kTableSize; static const uint32_t kMask = RegExpMacroAssembler::kTableMask; - uc32 base = (min_char & ~kMask); + base::uc32 base = (min_char & ~kMask); USE(base); // Assert that everything is on one kTableSize page. @@ -1012,10 +1009,9 @@ static void EmitUseLookupTable(RegExpMacroAssembler* masm, if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear); } -static void CutOutRange(RegExpMacroAssembler* masm, ZoneList* ranges, - uint32_t start_index, uint32_t end_index, - uint32_t cut_index, Label* even_label, - Label* odd_label) { +void CutOutRange(RegExpMacroAssembler* masm, ZoneList* ranges, + uint32_t start_index, uint32_t end_index, uint32_t cut_index, + Label* even_label, Label* odd_label) { bool odd = (((cut_index - start_index) & 1) == 1); Label* in_range_label = odd ? odd_label : even_label; Label dummy; @@ -1036,14 +1032,14 @@ static void CutOutRange(RegExpMacroAssembler* masm, ZoneList* ranges, // Unicode case. Split the search space into kSize spaces that are handled // with recursion. -static void SplitSearchSpace(ZoneList* ranges, uint32_t start_index, - uint32_t end_index, uint32_t* new_start_index, - uint32_t* new_end_index, uc32* border) { +void SplitSearchSpace(ZoneList* ranges, uint32_t start_index, + uint32_t end_index, uint32_t* new_start_index, + uint32_t* new_end_index, base::uc32* border) { static const uint32_t kSize = RegExpMacroAssembler::kTableSize; static const uint32_t kMask = RegExpMacroAssembler::kTableMask; - uc32 first = ranges->at(start_index); - uc32 last = ranges->at(end_index) - 1; + base::uc32 first = ranges->at(start_index); + base::uc32 last = ranges->at(end_index) - 1; *new_start_index = start_index; *border = (ranges->at(start_index) & ~kMask) + kSize; @@ -1102,15 +1098,16 @@ static void SplitSearchSpace(ZoneList* ranges, uint32_t start_index, // know that the character is in the range of min_char to max_char inclusive. // Either label can be nullptr indicating backtracking. Either label can also // be equal to the fall_through label. -static void GenerateBranches(RegExpMacroAssembler* masm, ZoneList* ranges, - uint32_t start_index, uint32_t end_index, - uc32 min_char, uc32 max_char, Label* fall_through, - Label* even_label, Label* odd_label) { +void GenerateBranches(RegExpMacroAssembler* masm, ZoneList* ranges, + uint32_t start_index, uint32_t end_index, + base::uc32 min_char, base::uc32 max_char, + Label* fall_through, Label* even_label, + Label* odd_label) { DCHECK_LE(min_char, String::kMaxUtf16CodeUnit); DCHECK_LE(max_char, String::kMaxUtf16CodeUnit); - uc32 first = ranges->at(start_index); - uc32 last = ranges->at(end_index) - 1; + base::uc32 first = ranges->at(start_index); + base::uc32 last = ranges->at(end_index) - 1; DCHECK_LT(min_char, first); @@ -1170,7 +1167,7 @@ static void GenerateBranches(RegExpMacroAssembler* masm, ZoneList* ranges, uint32_t new_start_index = 0; uint32_t new_end_index = 0; - uc32 border = 0; + base::uc32 border = 0; SplitSearchSpace(ranges, start_index, end_index, &new_start_index, &new_end_index, &border); @@ -1213,24 +1210,19 @@ static void GenerateBranches(RegExpMacroAssembler* masm, ZoneList* ranges, } } -static void EmitCharClass(RegExpMacroAssembler* macro_assembler, - RegExpCharacterClass* cc, bool one_byte, - Label* on_failure, int cp_offset, bool check_offset, - bool preloaded, Zone* zone) { +void EmitCharClass(RegExpMacroAssembler* macro_assembler, + RegExpCharacterClass* cc, bool one_byte, Label* on_failure, + int cp_offset, bool check_offset, bool preloaded, + Zone* zone) { ZoneList* ranges = cc->ranges(zone); CharacterRange::Canonicalize(ranges); - const uc32 max_char = MaxCodeUnit(one_byte); - int range_count = ranges->length(); + // Now that all processing (like case-insensitivity) is done, clamp the + // ranges to the set of ranges that may actually occur in the subject string. + if (one_byte) CharacterRange::ClampToOneByte(ranges); - int last_valid_range = range_count - 1; - while (last_valid_range >= 0) { - CharacterRange& range = ranges->at(last_valid_range); - if (range.from() <= max_char) break; - last_valid_range--; - } - - if (last_valid_range < 0) { + const int ranges_length = ranges->length(); + if (ranges_length == 0) { if (!cc->is_negated()) { macro_assembler->GoTo(on_failure); } @@ -1240,7 +1232,8 @@ static void EmitCharClass(RegExpMacroAssembler* macro_assembler, return; } - if (last_valid_range == 0 && ranges->at(0).IsEverything(max_char)) { + const base::uc32 max_char = MaxCodeUnit(one_byte); + if (ranges_length == 1 && ranges->at(0).IsEverything(max_char)) { if (cc->is_negated()) { macro_assembler->GoTo(on_failure); } else { @@ -1261,18 +1254,33 @@ static void EmitCharClass(RegExpMacroAssembler* macro_assembler, return; } - // A new list with ascending entries. Each entry is a code unit - // where there is a boundary between code units that are part of - // the class and code units that are not. Normally we insert an - // entry at zero which goes to the failure label, but if there - // was already one there we fall through for success on that entry. - // Subsequent entries have alternating meaning (success/failure). - ZoneList* range_boundaries = - zone->New>(last_valid_range, zone); + static constexpr int kMaxRangesForInlineBranchGeneration = 16; + if (ranges_length > kMaxRangesForInlineBranchGeneration) { + // For large range sets, emit a more compact instruction sequence to avoid + // a potentially problematic increase in code size. + // Note the flipped logic below (we check InRange if negated, NotInRange if + // not negated); this is necessary since the method falls through on + // failure whereas we want to fall through on success. + if (cc->is_negated()) { + if (macro_assembler->CheckCharacterInRangeArray(ranges, on_failure)) { + return; + } + } else { + if (macro_assembler->CheckCharacterNotInRangeArray(ranges, on_failure)) { + return; + } + } + } + + // Generate a flat list of range boundaries for consumption by + // GenerateBranches. See the comment on that function for how the list should + // be structured + ZoneList* range_boundaries = + zone->New>(ranges_length * 2, zone); bool zeroth_entry_is_failure = !cc->is_negated(); - for (int i = 0; i <= last_valid_range; i++) { + for (int i = 0; i < ranges_length; i++) { CharacterRange& range = ranges->at(i); if (range.from() == 0) { DCHECK_EQ(i, 0); @@ -1280,6 +1288,8 @@ static void EmitCharClass(RegExpMacroAssembler* macro_assembler, } else { range_boundaries->Add(range.from(), zone); } + // `+ 1` to convert from inclusive to exclusive `to`. + // [from, to] == [from, to+1[. range_boundaries->Add(range.to() + 1, zone); } int end_index = range_boundaries->length() - 1; @@ -1298,6 +1308,8 @@ static void EmitCharClass(RegExpMacroAssembler* macro_assembler, macro_assembler->Bind(&fall_through); } +} // namespace + RegExpNode::~RegExpNode() = default; RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler, @@ -1385,8 +1397,10 @@ void NegativeLookaroundChoiceNode::GetQuickCheckDetails( return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start); } +namespace { + // Takes the left-most 1-bit and smears it out, setting all bits to its right. -static inline uint32_t SmearBitsRight(uint32_t v) { +inline uint32_t SmearBitsRight(uint32_t v) { v |= v >> 1; v |= v >> 2; v |= v >> 4; @@ -1395,6 +1409,8 @@ static inline uint32_t SmearBitsRight(uint32_t v) { return v; } +} // namespace + bool QuickCheckDetails::Rationalize(bool asc) { bool found_useful_op = false; const uint32_t char_mask = CharMask(asc); @@ -1574,12 +1590,12 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details, for (int k = 0; k < elements()->length(); k++) { TextElement elm = elements()->at(k); if (elm.text_type() == TextElement::ATOM) { - Vector quarks = elm.atom()->data(); + base::Vector quarks = elm.atom()->data(); for (int i = 0; i < characters && i < quarks.length(); i++) { QuickCheckDetails::Position* pos = details->positions(characters_filled_in); - uc16 c = quarks[i]; - if (elm.atom()->ignore_case()) { + base::uc16 c = quarks[i]; + if (IsIgnoreCase(compiler->flags())) { unibrow::uchar chars[4]; int length = GetCaseIndependentLetters( isolate, c, compiler->one_byte(), chars, 4); @@ -1640,12 +1656,14 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details, details->positions(characters_filled_in); RegExpCharacterClass* tree = elm.char_class(); ZoneList* ranges = tree->ranges(zone()); - DCHECK(!ranges->is_empty()); - if (tree->is_negated()) { + if (tree->is_negated() || ranges->is_empty()) { // A quick check uses multi-character mask and compare. There is no // useful way to incorporate a negative char class into this scheme // so we just conservatively create a mask and value that will always // succeed. + // Likewise for empty ranges (empty ranges can occur e.g. when + // compiling for one-byte subjects and impossible (non-one-byte) ranges + // have been removed). pos->mask = 0; pos->value = 0; } else { @@ -1659,8 +1677,9 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details, } } CharacterRange range = ranges->at(first_range); - const uc32 first_from = range.from(); - const uc32 first_to = (range.to() > char_mask) ? char_mask : range.to(); + const base::uc32 first_from = range.from(); + const base::uc32 first_to = + (range.to() > char_mask) ? char_mask : range.to(); const uint32_t differing_bits = (first_from ^ first_to); // A mask and compare is only perfect if the differing bits form a // number like 00011111 with one single block of trailing 1s. @@ -1671,10 +1690,11 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details, uint32_t common_bits = ~SmearBitsRight(differing_bits); uint32_t bits = (first_from & common_bits); for (int i = first_range + 1; i < ranges->length(); i++) { - CharacterRange range = ranges->at(i); - const uc32 from = range.from(); + range = ranges->at(i); + const base::uc32 from = range.from(); if (from > char_mask) continue; - const uc32 to = (range.to() > char_mask) ? char_mask : range.to(); + const base::uc32 to = + (range.to() > char_mask) ? char_mask : range.to(); // Here we are combining more ranges into the mask and compare // value. With each new range the mask becomes more sparse and // so the chances of a false positive rise. A character class @@ -1685,8 +1705,8 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details, new_common_bits = ~SmearBitsRight(new_common_bits); common_bits &= new_common_bits; bits &= new_common_bits; - uint32_t differing_bits = (from & common_bits) ^ bits; - common_bits ^= differing_bits; + uint32_t new_differing_bits = (from & common_bits) ^ bits; + common_bits ^= new_differing_bits; bits &= common_bits; } pos->mask = common_bits; @@ -1807,16 +1827,16 @@ class IterationDecrementer { LoopChoiceNode* node_; }; -RegExpNode* SeqRegExpNode::FilterOneByte(int depth) { +RegExpNode* SeqRegExpNode::FilterOneByte(int depth, RegExpFlags flags) { if (info()->replacement_calculated) return replacement(); if (depth < 0) return this; DCHECK(!info()->visited); VisitMarker marker(info()); - return FilterSuccessor(depth - 1); + return FilterSuccessor(depth - 1, flags); } -RegExpNode* SeqRegExpNode::FilterSuccessor(int depth) { - RegExpNode* next = on_success_->FilterOneByte(depth - 1); +RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, RegExpFlags flags) { + RegExpNode* next = on_success_->FilterOneByte(depth - 1, flags); if (next == nullptr) return set_replacement(nullptr); on_success_ = next; return set_replacement(this); @@ -1829,7 +1849,9 @@ bool RangeContainsLatin1Equivalents(CharacterRange range) { range.Contains(0x0178); } -static bool RangesContainLatin1Equivalents(ZoneList* ranges) { +namespace { + +bool RangesContainLatin1Equivalents(ZoneList* ranges) { for (int i = 0; i < ranges->length(); i++) { // TODO(dcarney): this could be a lot more efficient. if (RangeContainsLatin1Equivalents(ranges->at(i))) return true; @@ -1837,7 +1859,9 @@ static bool RangesContainLatin1Equivalents(ZoneList* ranges) { return false; } -RegExpNode* TextNode::FilterOneByte(int depth) { +} // namespace + +RegExpNode* TextNode::FilterOneByte(int depth, RegExpFlags flags) { if (info()->replacement_calculated) return replacement(); if (depth < 0) return this; DCHECK(!info()->visited); @@ -1846,15 +1870,15 @@ RegExpNode* TextNode::FilterOneByte(int depth) { for (int i = 0; i < element_count; i++) { TextElement elm = elements()->at(i); if (elm.text_type() == TextElement::ATOM) { - Vector quarks = elm.atom()->data(); + base::Vector quarks = elm.atom()->data(); for (int j = 0; j < quarks.length(); j++) { - uc16 c = quarks[j]; - if (elm.atom()->ignore_case()) { + base::uc16 c = quarks[j]; + if (IsIgnoreCase(flags)) { c = unibrow::Latin1::TryConvertToLatin1(c); } if (c > unibrow::Latin1::kMaxChar) return set_replacement(nullptr); // Replace quark in case we converted to Latin-1. - uc16* writable_quarks = const_cast(quarks.begin()); + base::uc16* writable_quarks = const_cast(quarks.begin()); writable_quarks[j] = c; } } else { @@ -1868,8 +1892,7 @@ RegExpNode* TextNode::FilterOneByte(int depth) { if (range_count != 0 && ranges->at(0).from() == 0 && ranges->at(0).to() >= String::kMaxOneByteCharCode) { // This will be handled in a later filter. - if (IgnoreCase(cc->flags()) && - RangesContainLatin1Equivalents(ranges)) { + if (IsIgnoreCase(flags) && RangesContainLatin1Equivalents(ranges)) { continue; } return set_replacement(nullptr); @@ -1878,8 +1901,7 @@ RegExpNode* TextNode::FilterOneByte(int depth) { if (range_count == 0 || ranges->at(0).from() > String::kMaxOneByteCharCode) { // This will be handled in a later filter. - if (IgnoreCase(cc->flags()) && - RangesContainLatin1Equivalents(ranges)) { + if (IsIgnoreCase(flags) && RangesContainLatin1Equivalents(ranges)) { continue; } return set_replacement(nullptr); @@ -1887,26 +1909,27 @@ RegExpNode* TextNode::FilterOneByte(int depth) { } } } - return FilterSuccessor(depth - 1); + return FilterSuccessor(depth - 1, flags); } -RegExpNode* LoopChoiceNode::FilterOneByte(int depth) { +RegExpNode* LoopChoiceNode::FilterOneByte(int depth, RegExpFlags flags) { if (info()->replacement_calculated) return replacement(); if (depth < 0) return this; if (info()->visited) return this; { VisitMarker marker(info()); - RegExpNode* continue_replacement = continue_node_->FilterOneByte(depth - 1); + RegExpNode* continue_replacement = + continue_node_->FilterOneByte(depth - 1, flags); // If we can't continue after the loop then there is no sense in doing the // loop. if (continue_replacement == nullptr) return set_replacement(nullptr); } - return ChoiceNode::FilterOneByte(depth - 1); + return ChoiceNode::FilterOneByte(depth - 1, flags); } -RegExpNode* ChoiceNode::FilterOneByte(int depth) { +RegExpNode* ChoiceNode::FilterOneByte(int depth, RegExpFlags flags) { if (info()->replacement_calculated) return replacement(); if (depth < 0) return this; if (info()->visited) return this; @@ -1926,7 +1949,8 @@ RegExpNode* ChoiceNode::FilterOneByte(int depth) { RegExpNode* survivor = nullptr; for (int i = 0; i < choice_count; i++) { GuardedAlternative alternative = alternatives_->at(i); - RegExpNode* replacement = alternative.node()->FilterOneByte(depth - 1); + RegExpNode* replacement = + alternative.node()->FilterOneByte(depth - 1, flags); DCHECK(replacement != this); // No missing EMPTY_MATCH_CHECK. if (replacement != nullptr) { alternatives_->at(i).set_node(replacement); @@ -1946,7 +1970,7 @@ RegExpNode* ChoiceNode::FilterOneByte(int depth) { zone()->New>(surviving, zone()); for (int i = 0; i < choice_count; i++) { RegExpNode* replacement = - alternatives_->at(i).node()->FilterOneByte(depth - 1); + alternatives_->at(i).node()->FilterOneByte(depth - 1, flags); if (replacement != nullptr) { alternatives_->at(i).set_node(replacement); new_alternatives->Add(alternatives_->at(i), zone()); @@ -1956,7 +1980,8 @@ RegExpNode* ChoiceNode::FilterOneByte(int depth) { return this; } -RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth) { +RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth, + RegExpFlags flags) { if (info()->replacement_calculated) return replacement(); if (depth < 0) return this; if (info()->visited) return this; @@ -1964,12 +1989,12 @@ RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth) { // Alternative 0 is the negative lookahead, alternative 1 is what comes // afterwards. RegExpNode* node = continue_node(); - RegExpNode* replacement = node->FilterOneByte(depth - 1); + RegExpNode* replacement = node->FilterOneByte(depth - 1, flags); if (replacement == nullptr) return set_replacement(nullptr); alternatives_->at(kContinueIndex).set_node(replacement); RegExpNode* neg_node = lookaround_node(); - RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1); + RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1, flags); // If the negative lookahead is always going to fail then // we don't need to check it. if (neg_replacement == nullptr) return set_replacement(replacement); @@ -2062,7 +2087,8 @@ namespace { void EmitWordCheck(RegExpMacroAssembler* assembler, Label* word, Label* non_word, bool fall_through_on_word) { if (assembler->CheckSpecialCharacterClass( - fall_through_on_word ? 'w' : 'W', + fall_through_on_word ? StandardCharacterSet::kWord + : StandardCharacterSet::kNotWord, fall_through_on_word ? non_word : word)) { // Optimized implementation available. return; @@ -2108,7 +2134,8 @@ void EmitHat(RegExpCompiler* compiler, RegExpNode* on_success, Trace* trace) { const bool can_skip_bounds_check = !may_be_at_or_before_subject_string_start; assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, new_trace.backtrack(), can_skip_bounds_check); - if (!assembler->CheckSpecialCharacterClass('n', new_trace.backtrack())) { + if (!assembler->CheckSpecialCharacterClass( + StandardCharacterSet::kLineTerminator, new_trace.backtrack())) { // Newline means \n, \r, 0x2028 or 0x2029. if (!compiler->one_byte()) { assembler->CheckCharacterAfterAnd(0x2028, 0xFFFE, &ok); @@ -2253,18 +2280,22 @@ void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) { on_success()->Emit(compiler, trace); } -static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) { +namespace { + +bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) { if (quick_check == nullptr) return false; if (offset >= quick_check->characters()) return false; return quick_check->positions(offset)->determines_perfectly; } -static void UpdateBoundsCheck(int index, int* checked_up_to) { +void UpdateBoundsCheck(int index, int* checked_up_to) { if (index > *checked_up_to) { *checked_up_to = index; } } +} // namespace + // We call this repeatedly to generate code for each pass over the text node. // The passes are in increasing order of difficulty because we hope one // of the first passes will fail in which case we are saved the work of the @@ -2308,13 +2339,13 @@ void TextNode::TextEmitPass(RegExpCompiler* compiler, TextEmitPassType pass, TextElement elm = elements()->at(i); int cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset; if (elm.text_type() == TextElement::ATOM) { - if (SkipPass(pass, elm.atom()->ignore_case())) continue; - Vector quarks = elm.atom()->data(); + if (SkipPass(pass, IsIgnoreCase(compiler->flags()))) continue; + base::Vector quarks = elm.atom()->data(); for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) { if (first_element_checked && i == 0 && j == 0) continue; if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue; - uc16 quark = quarks[j]; - if (elm.atom()->ignore_case()) { + base::uc16 quark = quarks[j]; + if (IsIgnoreCase(compiler->flags())) { // Everywhere else we assume that a non-Latin-1 character cannot match // a Latin-1 character. Avoid the cases where this is assumption is // invalid by using the Latin1 equivalent instead. @@ -2383,29 +2414,38 @@ bool TextNode::SkipPass(TextEmitPassType pass, bool ignore_case) { TextNode* TextNode::CreateForCharacterRanges(Zone* zone, ZoneList* ranges, bool read_backward, - RegExpNode* on_success, - JSRegExp::Flags flags) { + RegExpNode* on_success) { DCHECK_NOT_NULL(ranges); - ZoneList* elms = zone->New>(1, zone); + // TODO(jgruber): There's no fundamental need to create this + // RegExpCharacterClass; we could refactor to avoid the allocation. + return zone->New(zone->New(zone, ranges), + read_backward, on_success); +} + +TextNode* TextNode::CreateForSurrogatePair( + Zone* zone, CharacterRange lead, ZoneList* trail_ranges, + bool read_backward, RegExpNode* on_success) { + ZoneList* lead_ranges = CharacterRange::List(zone, lead); + ZoneList* elms = zone->New>(2, zone); elms->Add(TextElement::CharClass( - zone->New(zone, ranges, flags)), + zone->New(zone, lead_ranges)), + zone); + elms->Add(TextElement::CharClass( + zone->New(zone, trail_ranges)), zone); return zone->New(elms, read_backward, on_success); } -TextNode* TextNode::CreateForSurrogatePair(Zone* zone, CharacterRange lead, - CharacterRange trail, - bool read_backward, - RegExpNode* on_success, - JSRegExp::Flags flags) { - ZoneList* lead_ranges = CharacterRange::List(zone, lead); +TextNode* TextNode::CreateForSurrogatePair( + Zone* zone, ZoneList* lead_ranges, CharacterRange trail, + bool read_backward, RegExpNode* on_success) { ZoneList* trail_ranges = CharacterRange::List(zone, trail); ZoneList* elms = zone->New>(2, zone); elms->Add(TextElement::CharClass( - zone->New(zone, lead_ranges, flags)), + zone->New(zone, lead_ranges)), zone); elms->Add(TextElement::CharClass( - zone->New(zone, trail_ranges, flags)), + zone->New(zone, trail_ranges)), zone); return zone->New(elms, read_backward, on_success); } @@ -2479,26 +2519,23 @@ void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) { bound_checked_up_to_ = std::max(0, bound_checked_up_to_ - by); } -void TextNode::MakeCaseIndependent(Isolate* isolate, bool is_one_byte) { +void TextNode::MakeCaseIndependent(Isolate* isolate, bool is_one_byte, + RegExpFlags flags) { + if (!IsIgnoreCase(flags)) return; +#ifdef V8_INTL_SUPPORT + if (NeedsUnicodeCaseEquivalents(flags)) return; +#endif + int element_count = elements()->length(); for (int i = 0; i < element_count; i++) { TextElement elm = elements()->at(i); if (elm.text_type() == TextElement::CHAR_CLASS) { RegExpCharacterClass* cc = elm.char_class(); -#ifdef V8_INTL_SUPPORT - bool case_equivalents_already_added = - NeedsUnicodeCaseEquivalents(cc->flags()); -#else - bool case_equivalents_already_added = false; -#endif - if (IgnoreCase(cc->flags()) && !case_equivalents_already_added) { - // None of the standard character classes is different in the case - // independent case and it slows us down if we don't know that. - if (cc->is_standard(zone())) continue; - ZoneList* ranges = cc->ranges(zone()); - CharacterRange::AddCaseEquivalents(isolate, zone(), ranges, - is_one_byte); - } + // None of the standard character classes is different in the case + // independent case and it slows us down if we don't know that. + if (cc->is_standard(zone())) continue; + ZoneList* ranges = cc->ranges(zone()); + CharacterRange::AddCaseEquivalents(isolate, zone(), ranges, is_one_byte); } } } @@ -2518,7 +2555,7 @@ RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode( return ranges->length() == 0 ? on_success() : nullptr; } if (ranges->length() != 1) return nullptr; - const uc32 max_char = MaxCodeUnit(compiler->one_byte()); + const base::uc32 max_char = MaxCodeUnit(compiler->one_byte()); return ranges->at(0).IsEverything(max_char) ? on_success() : nullptr; } @@ -2681,7 +2718,7 @@ ContainedInLattice AddRange(ContainedInLattice containment, const int* ranges, } int BitsetFirstSetBit(BoyerMoorePositionInfo::Bitset bitset) { - STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize == + static_assert(BoyerMoorePositionInfo::kMapSize == 2 * kInt64Size * kBitsPerByte); // Slight fiddling is needed here, since the bitset is of length 128 while @@ -2692,7 +2729,7 @@ int BitsetFirstSetBit(BoyerMoorePositionInfo::Bitset bitset) { { static constexpr BoyerMoorePositionInfo::Bitset mask(~uint64_t{0}); BoyerMoorePositionInfo::Bitset masked_bitset = bitset & mask; - STATIC_ASSERT(kInt64Size >= sizeof(decltype(masked_bitset.to_ullong()))); + static_assert(kInt64Size >= sizeof(decltype(masked_bitset.to_ullong()))); uint64_t lsb = masked_bitset.to_ullong(); if (lsb != 0) return base::bits::CountTrailingZeros(lsb); } @@ -3436,7 +3473,7 @@ void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) { RecursionCheck rc(compiler); DCHECK_EQ(start_reg_ + 1, end_reg_); - if (IgnoreCase(flags_)) { + if (IsIgnoreCase(flags_)) { bool unicode = IsUnicode(flags_); assembler->CheckNotBackReferenceIgnoreCase(start_reg_, read_backward(), unicode, trace->backtrack()); @@ -3626,9 +3663,10 @@ class EatsAtLeastPropagator : public AllStatic { template class Analysis : public NodeVisitor { public: - Analysis(Isolate* isolate, bool is_one_byte) + Analysis(Isolate* isolate, bool is_one_byte, RegExpFlags flags) : isolate_(isolate), is_one_byte_(is_one_byte), + flags_(flags), error_(RegExpError::kNone) {} void EnsureAnalyzed(RegExpNode* that) { @@ -3669,7 +3707,7 @@ class Analysis : public NodeVisitor { } while (false) void VisitText(TextNode* that) override { - that->MakeCaseIndependent(isolate(), is_one_byte_); + that->MakeCaseIndependent(isolate(), is_one_byte_, flags_); EnsureAnalyzed(that->on_success()); if (has_failed()) return; that->CalculateOffsets(); @@ -3736,16 +3774,17 @@ class Analysis : public NodeVisitor { private: Isolate* isolate_; - bool is_one_byte_; + const bool is_one_byte_; + const RegExpFlags flags_; RegExpError error_; DISALLOW_IMPLICIT_CONSTRUCTORS(Analysis); }; -RegExpError AnalyzeRegExp(Isolate* isolate, bool is_one_byte, +RegExpError AnalyzeRegExp(Isolate* isolate, bool is_one_byte, RegExpFlags flags, RegExpNode* node) { - Analysis analysis(isolate, - is_one_byte); + Analysis analysis( + isolate, is_one_byte, flags); DCHECK_EQ(node->info()->been_analyzed, false); analysis.EnsureAnalyzed(node); DCHECK_IMPLIES(analysis.has_failed(), analysis.error() != RegExpError::kNone); @@ -3761,7 +3800,7 @@ void BackReferenceNode::FillInBMInfo(Isolate* isolate, int offset, int budget, SaveBMInfo(bm, not_at_start, offset); } -STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize == +static_assert(BoyerMoorePositionInfo::kMapSize == RegExpMacroAssembler::kTableSize); void ChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget, @@ -3798,14 +3837,14 @@ void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget, if (initial_offset == 0) set_bm_info(not_at_start, bm); return; } - uc16 character = atom->data()[j]; - if (IgnoreCase(atom->flags())) { + base::uc16 character = atom->data()[j]; + if (IsIgnoreCase(bm->compiler()->flags())) { unibrow::uchar chars[4]; int length = GetCaseIndependentLetters( isolate, character, bm->max_char() == String::kMaxOneByteCharCode, chars, 4); - for (int j = 0; j < length; j++) { - bm->Set(offset, chars[j]); + for (int k = 0; k < length; k++) { + bm->Set(offset, chars[k]); } } else { if (character <= max_char) bm->Set(offset, character); @@ -3838,7 +3877,7 @@ void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget, } RegExpNode* RegExpCompiler::OptionallyStepBackToLeadSurrogate( - RegExpNode* on_success, JSRegExp::Flags flags) { + RegExpNode* on_success) { DCHECK(!read_backward()); ZoneList* lead_surrogates = CharacterRange::List( zone(), CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd)); @@ -3850,11 +3889,11 @@ RegExpNode* RegExpCompiler::OptionallyStepBackToLeadSurrogate( int stack_register = UnicodeLookaroundStackRegister(); int position_register = UnicodeLookaroundPositionRegister(); RegExpNode* step_back = TextNode::CreateForCharacterRanges( - zone(), lead_surrogates, true, on_success, flags); + zone(), lead_surrogates, true, on_success); RegExpLookaround::Builder builder(true, step_back, stack_register, position_register); RegExpNode* match_trail = TextNode::CreateForCharacterRanges( - zone(), trail_surrogates, false, builder.on_match_success(), flags); + zone(), trail_surrogates, false, builder.on_match_success()); optional_step_back->AddAlternative( GuardedAlternative(builder.ForMatch(match_trail))); @@ -3864,7 +3903,7 @@ RegExpNode* RegExpCompiler::OptionallyStepBackToLeadSurrogate( } RegExpNode* RegExpCompiler::PreprocessRegExp(RegExpCompileData* data, - JSRegExp::Flags flags, + RegExpFlags flags, bool is_one_byte) { // Wrap the body of the regexp in capture #0. RegExpNode* captured_body = @@ -3873,11 +3912,10 @@ RegExpNode* RegExpCompiler::PreprocessRegExp(RegExpCompileData* data, if (!data->tree->IsAnchoredAtStart() && !IsSticky(flags)) { // Add a .*? at the beginning, outside the body capture, unless // this expression is anchored at the beginning or sticky. - JSRegExp::Flags default_flags = JSRegExp::Flags(); RegExpNode* loop_node = RegExpQuantifier::ToNode( 0, RegExpTree::kInfinity, false, - zone()->New('*', default_flags), this, - captured_body, data->contains_anchor); + zone()->New(StandardCharacterSet::kEverything), + this, captured_body, data->contains_anchor); if (data->contains_anchor) { // Unroll loop once, to take care of the case that might start @@ -3885,27 +3923,33 @@ RegExpNode* RegExpCompiler::PreprocessRegExp(RegExpCompileData* data, ChoiceNode* first_step_node = zone()->New(2, zone()); first_step_node->AddAlternative(GuardedAlternative(captured_body)); first_step_node->AddAlternative(GuardedAlternative(zone()->New( - zone()->New('*', default_flags), false, - loop_node))); + zone()->New(StandardCharacterSet::kEverything), + false, loop_node))); node = first_step_node; } else { node = loop_node; } } if (is_one_byte) { - node = node->FilterOneByte(RegExpCompiler::kMaxRecursion); + node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, flags); // Do it again to propagate the new nodes to places where they were not // put because they had not been calculated yet. if (node != nullptr) { - node = node->FilterOneByte(RegExpCompiler::kMaxRecursion); + node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, flags); } } else if (IsUnicode(flags) && (IsGlobal(flags) || IsSticky(flags))) { - node = OptionallyStepBackToLeadSurrogate(node, flags); + node = OptionallyStepBackToLeadSurrogate(node); } if (node == nullptr) node = zone()->New(EndNode::BACKTRACK, zone()); return node; } +void RegExpCompiler::ToNodeCheckForStackOverflow() { + if (StackLimitCheck{isolate()}.HasOverflowed()) { + V8::FatalProcessOutOfMemory(isolate(), "RegExpCompiler"); + } +} + } // namespace internal } // namespace v8 diff --git a/js/src/irregexp/imported/regexp-compiler.h b/js/src/irregexp/imported/regexp-compiler.h index 5499a7115b76..2dd8a6f0b68c 100644 --- a/js/src/irregexp/imported/regexp-compiler.h +++ b/js/src/irregexp/imported/regexp-compiler.h @@ -20,7 +20,7 @@ namespace regexp_compiler_constants { // The '2' variant is has inclusive from and exclusive to. // This covers \s as defined in ECMA-262 5.1, 15.10.2.12, // which include WhiteSpace (7.2) or LineTerminator (7.3) values. -constexpr uc32 kRangeEndMarker = 0x110000; +constexpr base::uc32 kRangeEndMarker = 0x110000; constexpr int kSpaceRanges[] = { '\t', '\r' + 1, ' ', ' ' + 1, 0x00A0, 0x00A1, 0x1680, 0x1681, 0x2000, 0x200B, 0x2028, 0x202A, 0x202F, 0x2030, @@ -47,34 +47,10 @@ constexpr int kPatternTooShortForBoyerMoore = 2; } // namespace regexp_compiler_constants -inline bool IgnoreCase(JSRegExp::Flags flags) { - return (flags & JSRegExp::kIgnoreCase) != 0; -} - -inline bool IsUnicode(JSRegExp::Flags flags) { - return (flags & JSRegExp::kUnicode) != 0; -} - -inline bool IsSticky(JSRegExp::Flags flags) { - return (flags & JSRegExp::kSticky) != 0; -} - -inline bool IsGlobal(JSRegExp::Flags flags) { - return (flags & JSRegExp::kGlobal) != 0; -} - -inline bool DotAll(JSRegExp::Flags flags) { - return (flags & JSRegExp::kDotAll) != 0; -} - -inline bool Multiline(JSRegExp::Flags flags) { - return (flags & JSRegExp::kMultiline) != 0; -} - -inline bool NeedsUnicodeCaseEquivalents(JSRegExp::Flags flags) { +inline bool NeedsUnicodeCaseEquivalents(RegExpFlags flags) { // Both unicode and ignore_case flags are set. We need to use ICU to find // the closure over case equivalents. - return IsUnicode(flags) && IgnoreCase(flags); + return IsUnicode(flags) && IsIgnoreCase(flags); } // Details of a quick mask-compare check that can look ahead in the @@ -95,8 +71,8 @@ class QuickCheckDetails { void set_cannot_match() { cannot_match_ = true; } struct Position { Position() : mask(0), value(0), determines_perfectly(false) {} - uc32 mask; - uc32 value; + base::uc32 mask; + base::uc32 value; bool determines_perfectly; }; int characters() { return characters_; } @@ -422,7 +398,8 @@ struct PreloadState { // Analysis performs assertion propagation and computes eats_at_least_ values. // See the comments on AssertionPropagator and EatsAtLeastPropagator for more // details. -RegExpError AnalyzeRegExp(Isolate* isolate, bool is_one_byte, RegExpNode* node); +RegExpError AnalyzeRegExp(Isolate* isolate, bool is_one_byte, RegExpFlags flags, + RegExpNode* node); class FrequencyCollator { public: @@ -472,7 +449,7 @@ class FrequencyCollator { class RegExpCompiler { public: RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count, - bool is_one_byte); + RegExpFlags flags, bool is_one_byte); int AllocateRegister() { if (next_register_ >= RegExpMacroAssembler::kMaxRegister) { @@ -524,13 +501,12 @@ class RegExpCompiler { // - Inserting the implicit .* before/after the regexp if necessary. // - If the input is a one-byte string, filtering out nodes that can't match. // - Fixing up regexp matches that start within a surrogate pair. - RegExpNode* PreprocessRegExp(RegExpCompileData* data, JSRegExp::Flags flags, + RegExpNode* PreprocessRegExp(RegExpCompileData* data, RegExpFlags flags, bool is_one_byte); // If the regexp matching starts within a surrogate pair, step back to the // lead surrogate and start matching from there. - RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpNode* on_success, - JSRegExp::Flags flags); + RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpNode* on_success); inline void AddWork(RegExpNode* node) { if (!node->on_work_list() && !node->label()->is_bound()) { @@ -551,6 +527,8 @@ class RegExpCompiler { inline void IncrementRecursionDepth() { recursion_depth_++; } inline void DecrementRecursionDepth() { recursion_depth_--; } + RegExpFlags flags() const { return flags_; } + void SetRegExpTooBig() { reg_exp_too_big_ = true; } inline bool one_byte() { return one_byte_; } @@ -569,6 +547,18 @@ class RegExpCompiler { current_expansion_factor_ = value; } + // The recursive nature of ToNode node generation means we may run into stack + // overflow issues. We introduce periodic checks to detect these, and the + // tick counter helps limit overhead of these checks. + // TODO(jgruber): This is super hacky and should be replaced by an abort + // mechanism or iterative node generation. + void ToNodeMaybeCheckForStackOverflow() { + if ((to_node_overflow_check_ticks_++ % 16 == 0)) { + ToNodeCheckForStackOverflow(); + } + } + void ToNodeCheckForStackOverflow(); + Isolate* isolate() const { return isolate_; } Zone* zone() const { return zone_; } @@ -581,10 +571,12 @@ class RegExpCompiler { int unicode_lookaround_position_register_; ZoneVector* work_list_; int recursion_depth_; + const RegExpFlags flags_; RegExpMacroAssembler* macro_assembler_; bool one_byte_; bool reg_exp_too_big_; bool limiting_recursion_; + int to_node_overflow_check_ticks_ = 0; bool optimize_; bool read_backward_; int current_expansion_factor_; diff --git a/js/src/irregexp/imported/regexp-dotprinter.cc b/js/src/irregexp/imported/regexp-dotprinter.cc index 4fecf7afd37d..260d18ab93a3 100644 --- a/js/src/irregexp/imported/regexp-dotprinter.cc +++ b/js/src/irregexp/imported/regexp-dotprinter.cc @@ -127,9 +127,9 @@ void DotPrinterImpl::VisitText(TextNode* that) { TextElement elm = that->elements()->at(i); switch (elm.text_type()) { case TextElement::ATOM: { - Vector data = elm.atom()->data(); - for (int i = 0; i < data.length(); i++) { - os_ << static_cast(data[i]); + base::Vector data = elm.atom()->data(); + for (int j = 0; j < data.length(); j++) { + os_ << static_cast(data[j]); } break; } diff --git a/js/src/irregexp/imported/regexp-error.h b/js/src/irregexp/imported/regexp-error.h index efa8715ad5a3..c47ab4f53ab3 100644 --- a/js/src/irregexp/imported/regexp-error.h +++ b/js/src/irregexp/imported/regexp-error.h @@ -52,6 +52,11 @@ enum class RegExpError : uint32_t { V8_EXPORT_PRIVATE const char* RegExpErrorString(RegExpError error); +inline constexpr bool RegExpErrorIsStackOverflow(RegExpError error) { + return error == RegExpError::kStackOverflow || + error == RegExpError::kAnalysisStackOverflow; +} + } // namespace internal } // namespace v8 diff --git a/js/src/irregexp/imported/regexp-interpreter.cc b/js/src/irregexp/imported/regexp-interpreter.cc index 4a5a8d4962c8..2d8ffafc7858 100644 --- a/js/src/irregexp/imported/regexp-interpreter.cc +++ b/js/src/irregexp/imported/regexp-interpreter.cc @@ -28,12 +28,13 @@ namespace internal { namespace { bool BackRefMatchesNoCase(Isolate* isolate, int from, int current, int len, - Vector subject, bool unicode) { + base::Vector subject, + bool unicode) { Address offset_a = - reinterpret_cast
(const_cast(&subject.at(from))); + reinterpret_cast
(const_cast(&subject.at(from))); Address offset_b = - reinterpret_cast
(const_cast(&subject.at(current))); - size_t length = len * kUC16Size; + reinterpret_cast
(const_cast(&subject.at(current))); + size_t length = len * base::kUC16Size; bool result = unicode ? RegExpMacroAssembler::CaseInsensitiveCompareUnicode( @@ -44,7 +45,7 @@ bool BackRefMatchesNoCase(Isolate* isolate, int from, int current, int len, } bool BackRefMatchesNoCase(Isolate* isolate, int from, int current, int len, - Vector subject, bool unicode) { + base::Vector subject, bool unicode) { // For Latin1 characters the unicode flag makes no difference. for (int i = 0; i < len; i++) { unsigned int old_char = subject[from++]; @@ -170,7 +171,7 @@ class InterpreterRegisters { output_register_count_(output_register_count) { // TODO(jgruber): Use int32_t consistently for registers. Currently, CSA // uses int32_t while runtime uses int. - STATIC_ASSERT(sizeof(int) == sizeof(int32_t)); + static_assert(sizeof(int) == sizeof(int32_t)); DCHECK_GE(output_register_count, 2); // At least 2 for the match itself. DCHECK_GE(total_register_count, output_register_count); DCHECK_LE(total_register_count, RegExpMacroAssembler::kMaxRegisterCount); @@ -222,7 +223,7 @@ void UpdateCodeAndSubjectReferences( Isolate* isolate, Handle code_array, Handle subject_string, ByteArray* code_array_out, const byte** code_base_out, const byte** pc_out, String* subject_string_out, - Vector* subject_string_vector_out) { + base::Vector* subject_string_vector_out) { DisallowGarbageCollection no_gc; if (*code_base_out != code_array->GetDataStartAddress()) { @@ -244,7 +245,7 @@ template IrregexpInterpreter::Result HandleInterrupts( Isolate* isolate, RegExp::CallOrigin call_origin, ByteArray* code_array_out, String* subject_string_out, const byte** code_base_out, - Vector* subject_string_vector_out, const byte** pc_out) { + base::Vector* subject_string_vector_out, const byte** pc_out) { DisallowGarbageCollection no_gc; StackLimitCheck check(isolate); @@ -282,8 +283,8 @@ IrregexpInterpreter::Result HandleInterrupts( return IrregexpInterpreter::EXCEPTION; } - // If we changed between a LATIN1 and a UC16 string, we need to restart - // regexp matching with the appropriate template instantiation of + // If we changed between a LATIN1 and a UC16 string, we need to + // restart regexp matching with the appropriate template instantiation of // RawMatch. if (String::IsOneByteRepresentationUnderneath(*subject_handle) != was_one_byte) { @@ -373,7 +374,7 @@ bool IndexIsInBounds(int index, int length) { template IrregexpInterpreter::Result RawMatch( Isolate* isolate, ByteArray code_array, String subject_string, - Vector subject, int* output_registers, + base::Vector subject, int* output_registers, int output_register_count, int total_register_count, int current, uint32_t current_char, RegExp::CallOrigin call_origin, const uint32_t backtrack_limit) { @@ -414,8 +415,8 @@ IrregexpInterpreter::Result RawMatch( base::bits::RoundUpToPowerOfTwo32(kRegExpBytecodeCount)); // Make sure every bytecode we get by using BYTECODE_MASK is well defined. - STATIC_ASSERT(kRegExpBytecodeCount <= kRegExpPaddedBytecodeCount); - STATIC_ASSERT(kRegExpBytecodeCount + kRegExpBytecodeFillerCount == + static_assert(kRegExpBytecodeCount <= kRegExpPaddedBytecodeCount); + static_assert(kRegExpBytecodeCount + kRegExpBytecodeFillerCount == kRegExpPaddedBytecodeCount); #define DECLARE_DISPATCH_TABLE_ENTRY(name, ...) &&BC_##name, @@ -512,7 +513,7 @@ IrregexpInterpreter::Result RawMatch( DISPATCH(); } BYTECODE(POP_BT) { - STATIC_ASSERT(JSRegExp::kNoBacktrackLimit == 0); + static_assert(JSRegExp::kNoBacktrackLimit == 0); if (++backtrack_count == backtrack_limit) { int return_code = LoadPacked24Signed(insn); return static_cast(return_code); @@ -1050,12 +1051,12 @@ IrregexpInterpreter::Result IrregexpInterpreter::Match( if (FLAG_regexp_tier_up) regexp.TierUpTick(); bool is_one_byte = String::IsOneByteRepresentationUnderneath(subject_string); - ByteArray code_array = ByteArray::cast(regexp.Bytecode(is_one_byte)); - int total_register_count = regexp.MaxRegisterCount(); + ByteArray code_array = ByteArray::cast(regexp.bytecode(is_one_byte)); + int total_register_count = regexp.max_register_count(); return MatchInternal(isolate, code_array, subject_string, output_registers, output_register_count, total_register_count, - start_position, call_origin, regexp.BacktrackLimit()); + start_position, call_origin, regexp.backtrack_limit()); } IrregexpInterpreter::Result IrregexpInterpreter::MatchInternal( @@ -1065,6 +1066,9 @@ IrregexpInterpreter::Result IrregexpInterpreter::MatchInternal( uint32_t backtrack_limit) { DCHECK(subject_string.IsFlat()); + // TODO(chromium:1262676): Remove this CHECK once fixed. + CHECK(code_array.IsByteArray()); + // Note: Heap allocation *is* allowed in two situations if calling from // Runtime: // 1. When creating & throwing a stack overflow exception. The interpreter @@ -1073,10 +1077,15 @@ IrregexpInterpreter::Result IrregexpInterpreter::MatchInternal( // after interrupts have run. DisallowGarbageCollection no_gc; - uc16 previous_char = '\n'; + base::uc16 previous_char = '\n'; String::FlatContent subject_content = subject_string.GetFlatContent(no_gc); + // Because interrupts can result in GC and string content relocation, the + // checksum verification in FlatContent may fail even though this code is + // safe. See (2) above. + subject_content.UnsafeDisableChecksumVerification(); if (subject_content.IsOneByte()) { - Vector subject_vector = subject_content.ToOneByteVector(); + base::Vector subject_vector = + subject_content.ToOneByteVector(); if (start_position != 0) previous_char = subject_vector[start_position - 1]; return RawMatch(isolate, code_array, subject_string, subject_vector, output_registers, output_register_count, @@ -1084,7 +1093,8 @@ IrregexpInterpreter::Result IrregexpInterpreter::MatchInternal( call_origin, backtrack_limit); } else { DCHECK(subject_content.IsTwoByte()); - Vector subject_vector = subject_content.ToUC16Vector(); + base::Vector subject_vector = + subject_content.ToUC16Vector(); if (start_position != 0) previous_char = subject_vector[start_position - 1]; return RawMatch(isolate, code_array, subject_string, subject_vector, output_registers, output_register_count, @@ -1099,7 +1109,7 @@ IrregexpInterpreter::Result IrregexpInterpreter::MatchInternal( // builtin. IrregexpInterpreter::Result IrregexpInterpreter::MatchForCallFromJs( Address subject, int32_t start_position, Address, Address, - int* output_registers, int32_t output_register_count, Address, + int* output_registers, int32_t output_register_count, RegExp::CallOrigin call_origin, Isolate* isolate, Address regexp) { DCHECK_NOT_NULL(isolate); DCHECK_NOT_NULL(output_registers); diff --git a/js/src/irregexp/imported/regexp-interpreter.h b/js/src/irregexp/imported/regexp-interpreter.h index be1106732838..bc55be2b8cf9 100644 --- a/js/src/irregexp/imported/regexp-interpreter.h +++ b/js/src/irregexp/imported/regexp-interpreter.h @@ -12,6 +12,8 @@ namespace v8 { namespace internal { +class ByteArray; + class V8_EXPORT_PRIVATE IrregexpInterpreter : public AllStatic { public: enum Result { @@ -34,9 +36,8 @@ class V8_EXPORT_PRIVATE IrregexpInterpreter : public AllStatic { // RETRY is returned if a retry through the runtime is needed (e.g. when // interrupts have been scheduled or the regexp is marked for tier-up). // - // Arguments input_start, input_end and backtrack_stack are - // unused. They are only passed to match the signature of the native irregex - // code. + // Arguments input_start and input_end are unused. They are only passed to + // match the signature of the native irregex code. // // Arguments output_registers and output_register_count describe the results // array, which will contain register values of all captures if SUCCESS is @@ -45,7 +46,6 @@ class V8_EXPORT_PRIVATE IrregexpInterpreter : public AllStatic { Address input_start, Address input_end, int* output_registers, int32_t output_register_count, - Address backtrack_stack, RegExp::CallOrigin call_origin, Isolate* isolate, Address regexp); diff --git a/js/src/irregexp/imported/regexp-macro-assembler-tracer.cc b/js/src/irregexp/imported/regexp-macro-assembler-tracer.cc index cf1da256cf9b..ce887b9fbd2d 100644 --- a/js/src/irregexp/imported/regexp-macro-assembler-tracer.cc +++ b/js/src/irregexp/imported/regexp-macro-assembler-tracer.cc @@ -170,9 +170,11 @@ void RegExpMacroAssemblerTracer::LoadCurrentCharacterImpl( characters, eats_at_least); } +namespace { + class PrintablePrinter { public: - explicit PrintablePrinter(uc16 character) : character_(character) { } + explicit PrintablePrinter(base::uc16 character) : character_(character) {} const char* operator*() { if (character_ >= ' ' && character_ <= '~') { @@ -187,12 +189,14 @@ class PrintablePrinter { } private: - uc16 character_; + base::uc16 character_; char buffer_[4]; }; +} // namespace -void RegExpMacroAssemblerTracer::CheckCharacterLT(uc16 limit, Label* on_less) { +void RegExpMacroAssemblerTracer::CheckCharacterLT(base::uc16 limit, + Label* on_less) { PrintablePrinter printable(limit); PrintF(" CheckCharacterLT(c=0x%04x%s, label[%08x]);\n", limit, @@ -201,8 +205,7 @@ void RegExpMacroAssemblerTracer::CheckCharacterLT(uc16 limit, Label* on_less) { assembler_->CheckCharacterLT(limit, on_less); } - -void RegExpMacroAssemblerTracer::CheckCharacterGT(uc16 limit, +void RegExpMacroAssemblerTracer::CheckCharacterGT(base::uc16 limit, Label* on_greater) { PrintablePrinter printable(limit); PrintF(" CheckCharacterGT(c=0x%04x%s, label[%08x]);\n", @@ -212,7 +215,6 @@ void RegExpMacroAssemblerTracer::CheckCharacterGT(uc16 limit, assembler_->CheckCharacterGT(limit, on_greater); } - void RegExpMacroAssemblerTracer::CheckCharacter(unsigned c, Label* on_equal) { PrintablePrinter printable(c); PrintF(" CheckCharacter(c=0x%04x%s, label[%08x]);\n", @@ -275,12 +277,8 @@ void RegExpMacroAssemblerTracer::CheckNotCharacterAfterAnd( assembler_->CheckNotCharacterAfterAnd(c, mask, on_not_equal); } - void RegExpMacroAssemblerTracer::CheckNotCharacterAfterMinusAnd( - uc16 c, - uc16 minus, - uc16 mask, - Label* on_not_equal) { + base::uc16 c, base::uc16 minus, base::uc16 mask, Label* on_not_equal) { PrintF(" CheckNotCharacterAfterMinusAnd(c=0x%04x, minus=%04x, mask=0x%04x, " "label[%08x]);\n", c, @@ -290,11 +288,9 @@ void RegExpMacroAssemblerTracer::CheckNotCharacterAfterMinusAnd( assembler_->CheckNotCharacterAfterMinusAnd(c, minus, mask, on_not_equal); } - -void RegExpMacroAssemblerTracer::CheckCharacterInRange( - uc16 from, - uc16 to, - Label* on_not_in_range) { +void RegExpMacroAssemblerTracer::CheckCharacterInRange(base::uc16 from, + base::uc16 to, + Label* on_not_in_range) { PrintablePrinter printable_from(from); PrintablePrinter printable_to(to); PrintF(" CheckCharacterInRange(from=0x%04x%s, to=0x%04x%s, label[%08x]);\n", @@ -306,11 +302,9 @@ void RegExpMacroAssemblerTracer::CheckCharacterInRange( assembler_->CheckCharacterInRange(from, to, on_not_in_range); } - -void RegExpMacroAssemblerTracer::CheckCharacterNotInRange( - uc16 from, - uc16 to, - Label* on_in_range) { +void RegExpMacroAssemblerTracer::CheckCharacterNotInRange(base::uc16 from, + base::uc16 to, + Label* on_in_range) { PrintablePrinter printable_from(from); PrintablePrinter printable_to(to); PrintF( @@ -323,6 +317,40 @@ void RegExpMacroAssemblerTracer::CheckCharacterNotInRange( assembler_->CheckCharacterNotInRange(from, to, on_in_range); } +namespace { + +void PrintRangeArray(const ZoneList* ranges) { + for (int i = 0; i < ranges->length(); i++) { + base::uc16 from = ranges->at(i).from(); + base::uc16 to = ranges->at(i).to(); + PrintablePrinter printable_from(from); + PrintablePrinter printable_to(to); + PrintF(" [from=0x%04x%s, to=%04x%s],\n", from, *printable_from, to, + *printable_to); + } +} + +} // namespace + +bool RegExpMacroAssemblerTracer::CheckCharacterInRangeArray( + const ZoneList* ranges, Label* on_in_range) { + PrintF( + " CheckCharacterInRangeArray(\n" + " label[%08x]);\n", + LabelToInt(on_in_range)); + PrintRangeArray(ranges); + return assembler_->CheckCharacterInRangeArray(ranges, on_in_range); +} + +bool RegExpMacroAssemblerTracer::CheckCharacterNotInRangeArray( + const ZoneList* ranges, Label* on_not_in_range) { + PrintF( + " CheckCharacterNotInRangeArray(\n" + " label[%08x]);\n", + LabelToInt(on_not_in_range)); + PrintRangeArray(ranges); + return assembler_->CheckCharacterNotInRangeArray(ranges, on_not_in_range); +} void RegExpMacroAssemblerTracer::CheckBitInTable( Handle table, Label* on_bit_set) { @@ -362,20 +390,16 @@ void RegExpMacroAssemblerTracer::CheckPosition(int cp_offset, assembler_->CheckPosition(cp_offset, on_outside_input); } - bool RegExpMacroAssemblerTracer::CheckSpecialCharacterClass( - uc16 type, - Label* on_no_match) { + StandardCharacterSet type, Label* on_no_match) { bool supported = assembler_->CheckSpecialCharacterClass(type, on_no_match); PrintF(" CheckSpecialCharacterClass(type='%c', label[%08x]): %s;\n", - type, - LabelToInt(on_no_match), + static_cast(type), LabelToInt(on_no_match), supported ? "true" : "false"); return supported; } - void RegExpMacroAssemblerTracer::IfRegisterLT(int register_index, int comparand, Label* if_lt) { PrintF(" IfRegisterLT(register=%d, number=%d, label[%08x]);\n", diff --git a/js/src/irregexp/imported/regexp-macro-assembler-tracer.h b/js/src/irregexp/imported/regexp-macro-assembler-tracer.h index 8db175495ac6..9ccd36edf9d8 100644 --- a/js/src/irregexp/imported/regexp-macro-assembler-tracer.h +++ b/js/src/irregexp/imported/regexp-macro-assembler-tracer.h @@ -17,7 +17,9 @@ class RegExpMacroAssemblerTracer: public RegExpMacroAssembler { ~RegExpMacroAssemblerTracer() override; void AbortedCodeGeneration() override; int stack_limit_slack() override { return assembler_->stack_limit_slack(); } - bool CanReadUnaligned() override { return assembler_->CanReadUnaligned(); } + bool CanReadUnaligned() const override { + return assembler_->CanReadUnaligned(); + } void AdvanceCurrentPosition(int by) override; // Signed cp change. void AdvanceRegister(int reg, int by) override; // r[reg] += by. void Backtrack() override; @@ -25,8 +27,8 @@ class RegExpMacroAssemblerTracer: public RegExpMacroAssembler { void CheckCharacter(unsigned c, Label* on_equal) override; void CheckCharacterAfterAnd(unsigned c, unsigned and_with, Label* on_equal) override; - void CheckCharacterGT(uc16 limit, Label* on_greater) override; - void CheckCharacterLT(uc16 limit, Label* on_less) override; + void CheckCharacterGT(base::uc16 limit, Label* on_greater) override; + void CheckCharacterLT(base::uc16 limit, Label* on_less) override; void CheckGreedyLoop(Label* on_tos_equals_current_position) override; void CheckAtStart(int cp_offset, Label* on_at_start) override; void CheckNotAtStart(int cp_offset, Label* on_not_at_start) override; @@ -38,14 +40,21 @@ class RegExpMacroAssemblerTracer: public RegExpMacroAssembler { void CheckNotCharacter(unsigned c, Label* on_not_equal) override; void CheckNotCharacterAfterAnd(unsigned c, unsigned and_with, Label* on_not_equal) override; - void CheckNotCharacterAfterMinusAnd(uc16 c, uc16 minus, uc16 and_with, + void CheckNotCharacterAfterMinusAnd(base::uc16 c, base::uc16 minus, + base::uc16 and_with, Label* on_not_equal) override; - void CheckCharacterInRange(uc16 from, uc16 to, Label* on_in_range) override; - void CheckCharacterNotInRange(uc16 from, uc16 to, + void CheckCharacterInRange(base::uc16 from, base::uc16 to, + Label* on_in_range) override; + void CheckCharacterNotInRange(base::uc16 from, base::uc16 to, Label* on_not_in_range) override; + bool CheckCharacterInRangeArray(const ZoneList* ranges, + Label* on_in_range) override; + bool CheckCharacterNotInRangeArray(const ZoneList* ranges, + Label* on_not_in_range) override; void CheckBitInTable(Handle table, Label* on_bit_set) override; void CheckPosition(int cp_offset, Label* on_outside_input) override; - bool CheckSpecialCharacterClass(uc16 type, Label* on_no_match) override; + bool CheckSpecialCharacterClass(StandardCharacterSet type, + Label* on_no_match) override; void Fail() override; Handle GetCode(Handle source) override; void GoTo(Label* label) override; diff --git a/js/src/irregexp/imported/regexp-macro-assembler.cc b/js/src/irregexp/imported/regexp-macro-assembler.cc index 3d65831af11d..1de290817c5d 100644 --- a/js/src/irregexp/imported/regexp-macro-assembler.cc +++ b/js/src/irregexp/imported/regexp-macro-assembler.cc @@ -17,12 +17,16 @@ namespace internal { RegExpMacroAssembler::RegExpMacroAssembler(Isolate* isolate, Zone* zone) : slow_safe_compiler_(false), + backtrack_limit_(JSRegExp::kNoBacktrackLimit), global_mode_(NOT_GLOBAL), isolate_(isolate), zone_(zone) {} -RegExpMacroAssembler::~RegExpMacroAssembler() = default; +bool RegExpMacroAssembler::has_backtrack_limit() const { + return backtrack_limit_ != JSRegExp::kNoBacktrackLimit; +} +// static int RegExpMacroAssembler::CaseInsensitiveCompareNonUnicode(Address byte_offset1, Address byte_offset2, size_t byte_length, @@ -34,8 +38,8 @@ int RegExpMacroAssembler::CaseInsensitiveCompareNonUnicode(Address byte_offset1, DisallowGarbageCollection no_gc; DCHECK_EQ(0, byte_length % 2); size_t length = byte_length / 2; - uc16* substring1 = reinterpret_cast(byte_offset1); - uc16* substring2 = reinterpret_cast(byte_offset2); + base::uc16* substring1 = reinterpret_cast(byte_offset1); + base::uc16* substring2 = reinterpret_cast(byte_offset2); for (size_t i = 0; i < length; i++) { UChar32 c1 = RegExpCaseFolding::Canonicalize(substring1[i]); @@ -51,6 +55,7 @@ int RegExpMacroAssembler::CaseInsensitiveCompareNonUnicode(Address byte_offset1, #endif } +// static int RegExpMacroAssembler::CaseInsensitiveCompareUnicode(Address byte_offset1, Address byte_offset2, size_t byte_length, @@ -68,8 +73,8 @@ int RegExpMacroAssembler::CaseInsensitiveCompareUnicode(Address byte_offset1, return uni_str_1.caseCompare(reinterpret_cast(byte_offset2), length, U_FOLD_CASE_DEFAULT) == 0; #else - uc16* substring1 = reinterpret_cast(byte_offset1); - uc16* substring2 = reinterpret_cast(byte_offset2); + base::uc16* substring1 = reinterpret_cast(byte_offset1); + base::uc16* substring2 = reinterpret_cast(byte_offset2); size_t length = byte_length >> 1; DCHECK_NOT_NULL(isolate); unibrow::Mapping* canonicalize = @@ -93,6 +98,130 @@ int RegExpMacroAssembler::CaseInsensitiveCompareUnicode(Address byte_offset1, #endif // V8_INTL_SUPPORT } +namespace { + +uint32_t Hash(const ZoneList* ranges) { + size_t seed = 0; + for (int i = 0; i < ranges->length(); i++) { + const CharacterRange& r = ranges->at(i); + seed = base::hash_combine(seed, r.from(), r.to()); + } + return static_cast(seed); +} + +constexpr base::uc32 MaskEndOfRangeMarker(base::uc32 c) { + // CharacterRanges may use 0x10ffff as the end-of-range marker irrespective + // of whether the regexp IsUnicode or not; translate the marker value here. + DCHECK_IMPLIES(c > kMaxUInt16, c == String::kMaxCodePoint); + return c & 0xffff; +} + +int RangeArrayLengthFor(const ZoneList* ranges) { + const int ranges_length = ranges->length(); + return MaskEndOfRangeMarker(ranges->at(ranges_length - 1).to()) == kMaxUInt16 + ? ranges_length * 2 - 1 + : ranges_length * 2; +} + +bool Equals(const ZoneList* lhs, const Handle& rhs) { + DCHECK_EQ(rhs->length() % kUInt16Size, 0); // uc16 elements. + const int rhs_length = rhs->length() / kUInt16Size; + if (rhs_length != RangeArrayLengthFor(lhs)) return false; + for (int i = 0; i < lhs->length(); i++) { + const CharacterRange& r = lhs->at(i); + if (rhs->get_uint16(i * 2 + 0) != r.from()) return false; + if (i * 2 + 1 == rhs_length) break; + if (rhs->get_uint16(i * 2 + 1) != r.to() + 1) return false; + } + return true; +} + +Handle MakeRangeArray(Isolate* isolate, + const ZoneList* ranges) { + const int ranges_length = ranges->length(); + const int byte_array_length = RangeArrayLengthFor(ranges); + const int size_in_bytes = byte_array_length * kUInt16Size; + Handle range_array = + isolate->factory()->NewByteArray(size_in_bytes); + for (int i = 0; i < ranges_length; i++) { + const CharacterRange& r = ranges->at(i); + DCHECK_LE(r.from(), kMaxUInt16); + range_array->set_uint16(i * 2 + 0, r.from()); + const base::uc32 to = MaskEndOfRangeMarker(r.to()); + if (i == ranges_length - 1 && to == kMaxUInt16) { + DCHECK_EQ(byte_array_length, ranges_length * 2 - 1); + break; // Avoid overflow by leaving the last range open-ended. + } + DCHECK_LT(to, kMaxUInt16); + range_array->set_uint16(i * 2 + 1, to + 1); // Exclusive. + } + return range_array; +} + +} // namespace + +Handle NativeRegExpMacroAssembler::GetOrAddRangeArray( + const ZoneList* ranges) { + const uint32_t hash = Hash(ranges); + + if (range_array_cache_.count(hash) != 0) { + Handle range_array = range_array_cache_[hash]; + if (Equals(ranges, range_array)) return range_array; + } + + Handle range_array = MakeRangeArray(isolate(), ranges); + range_array_cache_[hash] = range_array; + return range_array; +} + +// static +uint32_t RegExpMacroAssembler::IsCharacterInRangeArray(uint32_t current_char, + Address raw_byte_array, + Isolate* isolate) { + // Use uint32_t to avoid complexity around bool return types (which may be + // optimized to use only the least significant byte). + static constexpr uint32_t kTrue = 1; + static constexpr uint32_t kFalse = 0; + + ByteArray ranges = ByteArray::cast(Object(raw_byte_array)); + + DCHECK_EQ(ranges.length() % kUInt16Size, 0); // uc16 elements. + const int length = ranges.length() / kUInt16Size; + DCHECK_GE(length, 1); + + // Shortcut for fully out of range chars. + if (current_char < ranges.get_uint16(0)) return kFalse; + if (current_char >= ranges.get_uint16(length - 1)) { + // The last range may be open-ended. + return (length % 2) == 0 ? kFalse : kTrue; + } + + // Binary search for the matching range. `ranges` is encoded as + // [from0, to0, from1, to1, ..., fromN, toN], or + // [from0, to0, from1, to1, ..., fromN] (open-ended last interval). + + int mid, lower = 0, upper = length; + do { + mid = lower + (upper - lower) / 2; + const base::uc16 elem = ranges.get_uint16(mid); + if (current_char < elem) { + upper = mid; + } else if (current_char > elem) { + lower = mid + 1; + } else { + DCHECK_EQ(current_char, elem); + break; + } + } while (lower < upper); + + const bool current_char_ge_last_elem = current_char >= ranges.get_uint16(mid); + const int current_range_start_index = + current_char_ge_last_elem ? mid : mid - 1; + + // Ranges start at even indices and end at odd indices. + return (current_range_start_index % 2) == 0 ? kTrue : kFalse; +} + void RegExpMacroAssembler::CheckNotInSurrogatePair(int cp_offset, Label* on_failure) { Label ok; @@ -124,17 +253,6 @@ void RegExpMacroAssembler::LoadCurrentCharacter(int cp_offset, eats_at_least); } -bool RegExpMacroAssembler::CheckSpecialCharacterClass(uc16 type, - Label* on_no_match) { - return false; -} - -NativeRegExpMacroAssembler::NativeRegExpMacroAssembler(Isolate* isolate, - Zone* zone) - : RegExpMacroAssembler(isolate, zone) {} - -NativeRegExpMacroAssembler::~NativeRegExpMacroAssembler() = default; - void NativeRegExpMacroAssembler::LoadCurrentCharacterImpl( int cp_offset, Label* on_end_of_input, bool check_bounds, int characters, int eats_at_least) { @@ -153,13 +271,14 @@ void NativeRegExpMacroAssembler::LoadCurrentCharacterImpl( LoadCurrentCharacterUnchecked(cp_offset, characters); } -bool NativeRegExpMacroAssembler::CanReadUnaligned() { +bool NativeRegExpMacroAssembler::CanReadUnaligned() const { return FLAG_enable_regexp_unaligned_accesses && !slow_safe(); } #ifndef COMPILING_IRREGEXP_FOR_EXTERNAL_EMBEDDER // This method may only be called after an interrupt. +// static int NativeRegExpMacroAssembler::CheckStackGuardState( Isolate* isolate, int start_index, RegExp::CallOrigin call_origin, Address* return_address, Code re_code, Address* subject, @@ -287,6 +406,15 @@ int NativeRegExpMacroAssembler::Match(Handle regexp, offsets_vector_length, isolate, *regexp); } +// static +int NativeRegExpMacroAssembler::ExecuteForTesting( + String input, int start_offset, const byte* input_start, + const byte* input_end, int* output, int output_size, Isolate* isolate, + JSRegExp regexp) { + return Execute(input, start_offset, input_start, input_end, output, + output_size, isolate, regexp); +} + // Returns a {Result} sentinel, or the number of successful matches. // TODO(pthier): The JSRegExp object is passed to native irregexp code to match // the signature of the interpreter. We should get rid of JS objects passed to @@ -295,23 +423,21 @@ int NativeRegExpMacroAssembler::Execute( String input, // This needs to be the unpacked (sliced, cons) string. int start_offset, const byte* input_start, const byte* input_end, int* output, int output_size, Isolate* isolate, JSRegExp regexp) { - // Ensure that the minimum stack has been allocated. RegExpStackScope stack_scope(isolate); - Address stack_base = stack_scope.stack()->stack_base(); bool is_one_byte = String::IsOneByteRepresentationUnderneath(input); - Code code = Code::cast(regexp.Code(is_one_byte)); + Code code = FromCodeT(CodeT::cast(regexp.code(is_one_byte))); RegExp::CallOrigin call_origin = RegExp::CallOrigin::kFromRuntime; - using RegexpMatcherSig = int( - Address input_string, int start_offset, const byte* input_start, - const byte* input_end, int* output, int output_size, Address stack_base, - int call_origin, Isolate* isolate, Address regexp); + using RegexpMatcherSig = + // NOLINTNEXTLINE(readability/casting) + int(Address input_string, int start_offset, const byte* input_start, + const byte* input_end, int* output, int output_size, int call_origin, + Isolate* isolate, Address regexp); auto fn = GeneratedCode::FromCode(code); - int result = - fn.Call(input.ptr(), start_offset, input_start, input_end, output, - output_size, stack_base, call_origin, isolate, regexp.ptr()); + int result = fn.Call(input.ptr(), start_offset, input_start, input_end, + output, output_size, call_origin, isolate, regexp.ptr()); DCHECK_GE(result, SMALLEST_REGEXP_RESULT); if (result == EXCEPTION && !isolate->has_pending_exception()) { @@ -371,22 +497,24 @@ const byte NativeRegExpMacroAssembler::word_character_map[] = { }; // clang-format on -Address NativeRegExpMacroAssembler::GrowStack(Address stack_pointer, - Address* stack_base, - Isolate* isolate) { +// static +Address NativeRegExpMacroAssembler::GrowStack(Isolate* isolate) { + DisallowGarbageCollection no_gc; + RegExpStack* regexp_stack = isolate->regexp_stack(); - size_t size = regexp_stack->stack_capacity(); - Address old_stack_base = regexp_stack->stack_base(); - DCHECK(old_stack_base == *stack_base); - DCHECK(stack_pointer <= old_stack_base); - DCHECK(static_cast(old_stack_base - stack_pointer) <= size); - Address new_stack_base = regexp_stack->EnsureCapacity(size * 2); - if (new_stack_base == kNullAddress) { - return kNullAddress; - } - *stack_base = new_stack_base; - intptr_t stack_content_size = old_stack_base - stack_pointer; - return new_stack_base - stack_content_size; + const size_t old_size = regexp_stack->memory_size(); + +#ifdef DEBUG + const Address old_stack_top = regexp_stack->memory_top(); + const Address old_stack_pointer = regexp_stack->stack_pointer(); + CHECK_LE(old_stack_pointer, old_stack_top); + CHECK_LE(static_cast(old_stack_top - old_stack_pointer), old_size); +#endif // DEBUG + + Address new_stack_base = regexp_stack->EnsureCapacity(old_size * 2); + if (new_stack_base == kNullAddress) return kNullAddress; + + return regexp_stack->stack_pointer(); } } // namespace internal diff --git a/js/src/irregexp/imported/regexp-macro-assembler.h b/js/src/irregexp/imported/regexp-macro-assembler.h index 2f11d1e7eacb..1700f6e66d6e 100644 --- a/js/src/irregexp/imported/regexp-macro-assembler.h +++ b/js/src/irregexp/imported/regexp-macro-assembler.h @@ -12,18 +12,17 @@ namespace v8 { namespace internal { -static const uc32 kLeadSurrogateStart = 0xd800; -static const uc32 kLeadSurrogateEnd = 0xdbff; -static const uc32 kTrailSurrogateStart = 0xdc00; -static const uc32 kTrailSurrogateEnd = 0xdfff; -static const uc32 kNonBmpStart = 0x10000; -static const uc32 kNonBmpEnd = 0x10ffff; - -struct DisjunctDecisionRow { - RegExpCharacterClass cc; - Label* on_match; -}; +class ByteArray; +class JSRegExp; +class Label; +class String; +static const base::uc32 kLeadSurrogateStart = 0xd800; +static const base::uc32 kLeadSurrogateEnd = 0xdbff; +static const base::uc32 kTrailSurrogateStart = 0xdc00; +static const base::uc32 kTrailSurrogateEnd = 0xdfff; +static const base::uc32 kNonBmpStart = 0x10000; +static const base::uc32 kNonBmpEnd = 0x10ffff; class RegExpMacroAssembler { public: @@ -39,11 +38,134 @@ class RegExpMacroAssembler { static constexpr int kUseCharactersValue = -1; + RegExpMacroAssembler(Isolate* isolate, Zone* zone); + virtual ~RegExpMacroAssembler() = default; + + virtual Handle GetCode(Handle source) = 0; + + // This function is called when code generation is aborted, so that + // the assembler could clean up internal data structures. + virtual void AbortedCodeGeneration() {} + // The maximal number of pushes between stack checks. Users must supply + // kCheckStackLimit flag to push operations (instead of kNoStackLimitCheck) + // at least once for every stack_limit() pushes that are executed. + virtual int stack_limit_slack() = 0; + virtual bool CanReadUnaligned() const = 0; + + virtual void AdvanceCurrentPosition(int by) = 0; // Signed cp change. + virtual void AdvanceRegister(int reg, int by) = 0; // r[reg] += by. + // Continues execution from the position pushed on the top of the backtrack + // stack by an earlier PushBacktrack(Label*). + virtual void Backtrack() = 0; + virtual void Bind(Label* label) = 0; + // Dispatch after looking the current character up in a 2-bits-per-entry + // map. The destinations vector has up to 4 labels. + virtual void CheckCharacter(unsigned c, Label* on_equal) = 0; + // Bitwise and the current character with the given constant and then + // check for a match with c. + virtual void CheckCharacterAfterAnd(unsigned c, + unsigned and_with, + Label* on_equal) = 0; + virtual void CheckCharacterGT(base::uc16 limit, Label* on_greater) = 0; + virtual void CheckCharacterLT(base::uc16 limit, Label* on_less) = 0; + virtual void CheckGreedyLoop(Label* on_tos_equals_current_position) = 0; + virtual void CheckAtStart(int cp_offset, Label* on_at_start) = 0; + virtual void CheckNotAtStart(int cp_offset, Label* on_not_at_start) = 0; + virtual void CheckNotBackReference(int start_reg, bool read_backward, + Label* on_no_match) = 0; + virtual void CheckNotBackReferenceIgnoreCase(int start_reg, + bool read_backward, bool unicode, + Label* on_no_match) = 0; + // Check the current character for a match with a literal character. If we + // fail to match then goto the on_failure label. End of input always + // matches. If the label is nullptr then we should pop a backtrack address + // off the stack and go to that. + virtual void CheckNotCharacter(unsigned c, Label* on_not_equal) = 0; + virtual void CheckNotCharacterAfterAnd(unsigned c, + unsigned and_with, + Label* on_not_equal) = 0; + // Subtract a constant from the current character, then and with the given + // constant and then check for a match with c. + virtual void CheckNotCharacterAfterMinusAnd(base::uc16 c, base::uc16 minus, + base::uc16 and_with, + Label* on_not_equal) = 0; + virtual void CheckCharacterInRange(base::uc16 from, + base::uc16 to, // Both inclusive. + Label* on_in_range) = 0; + virtual void CheckCharacterNotInRange(base::uc16 from, + base::uc16 to, // Both inclusive. + Label* on_not_in_range) = 0; + // Returns true if the check was emitted, false otherwise. + virtual bool CheckCharacterInRangeArray( + const ZoneList* ranges, Label* on_in_range) = 0; + virtual bool CheckCharacterNotInRangeArray( + const ZoneList* ranges, Label* on_not_in_range) = 0; + + // The current character (modulus the kTableSize) is looked up in the byte + // array, and if the found byte is non-zero, we jump to the on_bit_set label. + virtual void CheckBitInTable(Handle table, Label* on_bit_set) = 0; + + // Checks whether the given offset from the current position is before + // the end of the string. May overwrite the current character. + virtual void CheckPosition(int cp_offset, Label* on_outside_input); + // Check whether a standard/default character class matches the current + // character. Returns false if the type of special character class does + // not have custom support. + // May clobber the current loaded character. + virtual bool CheckSpecialCharacterClass(StandardCharacterSet type, + Label* on_no_match) { + return false; + } + + // Control-flow integrity: + // Define a jump target and bind a label. + virtual void BindJumpTarget(Label* label) { Bind(label); } + + virtual void Fail() = 0; + virtual void GoTo(Label* label) = 0; + // Check whether a register is >= a given constant and go to a label if it + // is. Backtracks instead if the label is nullptr. + virtual void IfRegisterGE(int reg, int comparand, Label* if_ge) = 0; + // Check whether a register is < a given constant and go to a label if it is. + // Backtracks instead if the label is nullptr. + virtual void IfRegisterLT(int reg, int comparand, Label* if_lt) = 0; + // Check whether a register is == to the current position and go to a + // label if it is. + virtual void IfRegisterEqPos(int reg, Label* if_eq) = 0; + V8_EXPORT_PRIVATE void LoadCurrentCharacter( + int cp_offset, Label* on_end_of_input, bool check_bounds = true, + int characters = 1, int eats_at_least = kUseCharactersValue); + virtual void LoadCurrentCharacterImpl(int cp_offset, Label* on_end_of_input, + bool check_bounds, int characters, + int eats_at_least) = 0; + virtual void PopCurrentPosition() = 0; + virtual void PopRegister(int register_index) = 0; + // Pushes the label on the backtrack stack, so that a following Backtrack + // will go to this label. Always checks the backtrack stack limit. + virtual void PushBacktrack(Label* label) = 0; + virtual void PushCurrentPosition() = 0; + enum StackCheckFlag { kNoStackLimitCheck = false, kCheckStackLimit = true }; + virtual void PushRegister(int register_index, + StackCheckFlag check_stack_limit) = 0; + virtual void ReadCurrentPositionFromRegister(int reg) = 0; + virtual void ReadStackPointerFromRegister(int reg) = 0; + virtual void SetCurrentPositionFromEnd(int by) = 0; + virtual void SetRegister(int register_index, int to) = 0; + // Return whether the matching (with a global regexp) will be restarted. + virtual bool Succeed() = 0; + virtual void WriteCurrentPositionToRegister(int reg, int cp_offset) = 0; + virtual void ClearRegisters(int reg_from, int reg_to) = 0; + virtual void WriteStackPointerToRegister(int reg) = 0; + + // Check that we are not in the middle of a surrogate pair. + void CheckNotInSurrogatePair(int cp_offset, Label* on_failure); + #define IMPLEMENTATIONS_LIST(V) \ V(IA32) \ V(ARM) \ V(ARM64) \ V(MIPS) \ + V(LOONG64) \ V(RISCV) \ V(S390) \ V(PPC) \ @@ -65,123 +187,11 @@ class RegExpMacroAssembler { return kNames[impl]; } #undef IMPLEMENTATIONS_LIST - - enum StackCheckFlag { - kNoStackLimitCheck = false, - kCheckStackLimit = true - }; - - RegExpMacroAssembler(Isolate* isolate, Zone* zone); - virtual ~RegExpMacroAssembler(); - // This function is called when code generation is aborted, so that - // the assembler could clean up internal data structures. - virtual void AbortedCodeGeneration() {} - // The maximal number of pushes between stack checks. Users must supply - // kCheckStackLimit flag to push operations (instead of kNoStackLimitCheck) - // at least once for every stack_limit() pushes that are executed. - virtual int stack_limit_slack() = 0; - virtual bool CanReadUnaligned() = 0; - virtual void AdvanceCurrentPosition(int by) = 0; // Signed cp change. - virtual void AdvanceRegister(int reg, int by) = 0; // r[reg] += by. - // Continues execution from the position pushed on the top of the backtrack - // stack by an earlier PushBacktrack(Label*). - virtual void Backtrack() = 0; - virtual void Bind(Label* label) = 0; - // Dispatch after looking the current character up in a 2-bits-per-entry - // map. The destinations vector has up to 4 labels. - virtual void CheckCharacter(unsigned c, Label* on_equal) = 0; - // Bitwise and the current character with the given constant and then - // check for a match with c. - virtual void CheckCharacterAfterAnd(unsigned c, - unsigned and_with, - Label* on_equal) = 0; - virtual void CheckCharacterGT(uc16 limit, Label* on_greater) = 0; - virtual void CheckCharacterLT(uc16 limit, Label* on_less) = 0; - virtual void CheckGreedyLoop(Label* on_tos_equals_current_position) = 0; - virtual void CheckAtStart(int cp_offset, Label* on_at_start) = 0; - virtual void CheckNotAtStart(int cp_offset, Label* on_not_at_start) = 0; - virtual void CheckNotBackReference(int start_reg, bool read_backward, - Label* on_no_match) = 0; - virtual void CheckNotBackReferenceIgnoreCase(int start_reg, - bool read_backward, bool unicode, - Label* on_no_match) = 0; - // Check the current character for a match with a literal character. If we - // fail to match then goto the on_failure label. End of input always - // matches. If the label is nullptr then we should pop a backtrack address - // off the stack and go to that. - virtual void CheckNotCharacter(unsigned c, Label* on_not_equal) = 0; - virtual void CheckNotCharacterAfterAnd(unsigned c, - unsigned and_with, - Label* on_not_equal) = 0; - // Subtract a constant from the current character, then and with the given - // constant and then check for a match with c. - virtual void CheckNotCharacterAfterMinusAnd(uc16 c, - uc16 minus, - uc16 and_with, - Label* on_not_equal) = 0; - virtual void CheckCharacterInRange(uc16 from, - uc16 to, // Both inclusive. - Label* on_in_range) = 0; - virtual void CheckCharacterNotInRange(uc16 from, - uc16 to, // Both inclusive. - Label* on_not_in_range) = 0; - - // The current character (modulus the kTableSize) is looked up in the byte - // array, and if the found byte is non-zero, we jump to the on_bit_set label. - virtual void CheckBitInTable(Handle table, Label* on_bit_set) = 0; - - // Checks whether the given offset from the current position is before - // the end of the string. May overwrite the current character. - virtual void CheckPosition(int cp_offset, Label* on_outside_input); - // Check whether a standard/default character class matches the current - // character. Returns false if the type of special character class does - // not have custom support. - // May clobber the current loaded character. - virtual bool CheckSpecialCharacterClass(uc16 type, Label* on_no_match); - - // Control-flow integrity: - // Define a jump target and bind a label. - virtual void BindJumpTarget(Label* label) { Bind(label); } - - virtual void Fail() = 0; - virtual Handle GetCode(Handle source) = 0; - virtual void GoTo(Label* label) = 0; - // Check whether a register is >= a given constant and go to a label if it - // is. Backtracks instead if the label is nullptr. - virtual void IfRegisterGE(int reg, int comparand, Label* if_ge) = 0; - // Check whether a register is < a given constant and go to a label if it is. - // Backtracks instead if the label is nullptr. - virtual void IfRegisterLT(int reg, int comparand, Label* if_lt) = 0; - // Check whether a register is == to the current position and go to a - // label if it is. - virtual void IfRegisterEqPos(int reg, Label* if_eq) = 0; virtual IrregexpImplementation Implementation() = 0; - V8_EXPORT_PRIVATE void LoadCurrentCharacter( - int cp_offset, Label* on_end_of_input, bool check_bounds = true, - int characters = 1, int eats_at_least = kUseCharactersValue); - virtual void LoadCurrentCharacterImpl(int cp_offset, Label* on_end_of_input, - bool check_bounds, int characters, - int eats_at_least) = 0; - virtual void PopCurrentPosition() = 0; - virtual void PopRegister(int register_index) = 0; - // Pushes the label on the backtrack stack, so that a following Backtrack - // will go to this label. Always checks the backtrack stack limit. - virtual void PushBacktrack(Label* label) = 0; - virtual void PushCurrentPosition() = 0; - virtual void PushRegister(int register_index, - StackCheckFlag check_stack_limit) = 0; - virtual void ReadCurrentPositionFromRegister(int reg) = 0; - virtual void ReadStackPointerFromRegister(int reg) = 0; - virtual void SetCurrentPositionFromEnd(int by) = 0; - virtual void SetRegister(int register_index, int to) = 0; - // Return whether the matching (with a global regexp) will be restarted. - virtual bool Succeed() = 0; - virtual void WriteCurrentPositionToRegister(int reg, int cp_offset) = 0; - virtual void ClearRegisters(int reg_from, int reg_to) = 0; - virtual void WriteStackPointerToRegister(int reg) = 0; // Compare two-byte strings case insensitively. - // Called from generated RegExp code. + // + // Called from generated code. static int CaseInsensitiveCompareNonUnicode(Address byte_offset1, Address byte_offset2, size_t byte_length, @@ -191,12 +201,23 @@ class RegExpMacroAssembler { size_t byte_length, Isolate* isolate); - // Check that we are not in the middle of a surrogate pair. - void CheckNotInSurrogatePair(int cp_offset, Label* on_failure); + // `raw_byte_array` is a ByteArray containing a set of character ranges, + // where ranges are encoded as uint16_t elements: + // + // [from0, to0, from1, to1, ..., fromN, toN], or + // [from0, to0, from1, to1, ..., fromN] (open-ended last interval). + // + // fromN is inclusive, toN is exclusive. Returns zero if not in a range, + // non-zero otherwise. + // + // Called from generated code. + static uint32_t IsCharacterInRangeArray(uint32_t current_char, + Address raw_byte_array, + Isolate* isolate); // Controls the generation of large inlined constants in the code. void set_slow_safe(bool ssc) { slow_safe_compiler_ = ssc; } - bool slow_safe() { return slow_safe_compiler_; } + bool slow_safe() const { return slow_safe_compiler_; } // Controls after how many backtracks irregexp should abort execution. If it // can fall back to the experimental engine (see `set_can_fallback`), it will @@ -220,30 +241,28 @@ class RegExpMacroAssembler { // Set whether the regular expression has the global flag. Exiting due to // a failure in a global regexp may still mean success overall. inline void set_global_mode(GlobalMode mode) { global_mode_ = mode; } - inline bool global() { return global_mode_ != NOT_GLOBAL; } - inline bool global_with_zero_length_check() { + inline bool global() const { return global_mode_ != NOT_GLOBAL; } + inline bool global_with_zero_length_check() const { return global_mode_ == GLOBAL || global_mode_ == GLOBAL_UNICODE; } - inline bool global_unicode() { return global_mode_ == GLOBAL_UNICODE; } + inline bool global_unicode() const { return global_mode_ == GLOBAL_UNICODE; } Isolate* isolate() const { return isolate_; } Zone* zone() const { return zone_; } protected: - bool has_backtrack_limit() const { - return backtrack_limit_ != JSRegExp::kNoBacktrackLimit; - } + bool has_backtrack_limit() const; uint32_t backtrack_limit() const { return backtrack_limit_; } bool can_fallback() const { return can_fallback_; } private: bool slow_safe_compiler_; - uint32_t backtrack_limit_ = JSRegExp::kNoBacktrackLimit; + uint32_t backtrack_limit_; bool can_fallback_ = false; GlobalMode global_mode_; - Isolate* isolate_; - Zone* zone_; + Isolate* const isolate_; + Zone* const zone_; }; class NativeRegExpMacroAssembler: public RegExpMacroAssembler { @@ -271,44 +290,24 @@ class NativeRegExpMacroAssembler: public RegExpMacroAssembler { SMALLEST_REGEXP_RESULT = RegExp::kInternalRegExpSmallestResult, }; - NativeRegExpMacroAssembler(Isolate* isolate, Zone* zone); - ~NativeRegExpMacroAssembler() override; - bool CanReadUnaligned() override; + NativeRegExpMacroAssembler(Isolate* isolate, Zone* zone) + : RegExpMacroAssembler(isolate, zone), range_array_cache_(zone) {} + ~NativeRegExpMacroAssembler() override = default; // Returns a {Result} sentinel, or the number of successful matches. static int Match(Handle regexp, Handle subject, int* offsets_vector, int offsets_vector_length, int previous_index, Isolate* isolate); - // Called from RegExp if the backtrack stack limit is hit. - // Tries to expand the stack. Returns the new stack-pointer if - // successful, and updates the stack_top address, or returns 0 if unable - // to grow the stack. - // This function must not trigger a garbage collection. - static Address GrowStack(Address stack_pointer, Address* stack_top, - Isolate* isolate); + V8_EXPORT_PRIVATE static int ExecuteForTesting(String input, int start_offset, + const byte* input_start, + const byte* input_end, + int* output, int output_size, + Isolate* isolate, + JSRegExp regexp); - static int CheckStackGuardState(Isolate* isolate, int start_index, - RegExp::CallOrigin call_origin, - Address* return_address, Code re_code, - Address* subject, const byte** input_start, - const byte** input_end); + bool CanReadUnaligned() const override; - // Byte map of one byte characters with a 0xff if the character is a word - // character (digit, letter or underscore) and 0x00 otherwise. - // Used by generated RegExp code. - static const byte word_character_map[256]; - - static Address word_character_map_address() { - return reinterpret_cast
(&word_character_map[0]); - } - - // Returns a {Result} sentinel, or the number of successful matches. - V8_EXPORT_PRIVATE static int Execute(String input, int start_offset, - const byte* input_start, - const byte* input_end, int* output, - int output_size, Isolate* isolate, - JSRegExp regexp); void LoadCurrentCharacterImpl(int cp_offset, Label* on_end_of_input, bool check_bounds, int characters, int eats_at_least) override; @@ -316,6 +315,41 @@ class NativeRegExpMacroAssembler: public RegExpMacroAssembler { // current position, into the current-character register. virtual void LoadCurrentCharacterUnchecked(int cp_offset, int character_count) = 0; + + // Called from RegExp if the backtrack stack limit is hit. Tries to expand + // the stack. Returns the new stack-pointer if successful, or returns 0 if + // unable to grow the stack. + // This function must not trigger a garbage collection. + // + // Called from generated code. + static Address GrowStack(Isolate* isolate); + + // Called from generated code. + static int CheckStackGuardState(Isolate* isolate, int start_index, + RegExp::CallOrigin call_origin, + Address* return_address, Code re_code, + Address* subject, const byte** input_start, + const byte** input_end); + + static Address word_character_map_address() { + return reinterpret_cast
(&word_character_map[0]); + } + + protected: + // Byte map of one byte characters with a 0xff if the character is a word + // character (digit, letter or underscore) and 0x00 otherwise. + // Used by generated RegExp code. + static const byte word_character_map[256]; + + Handle GetOrAddRangeArray(const ZoneList* ranges); + + private: + // Returns a {Result} sentinel, or the number of successful matches. + static int Execute(String input, int start_offset, const byte* input_start, + const byte* input_end, int* output, int output_size, + Isolate* isolate, JSRegExp regexp); + + ZoneUnorderedMap> range_array_cache_; }; } // namespace internal diff --git a/js/src/irregexp/imported/regexp-nodes.h b/js/src/irregexp/imported/regexp-nodes.h index 45841363f2f2..220b60f2b450 100644 --- a/js/src/irregexp/imported/regexp-nodes.h +++ b/js/src/irregexp/imported/regexp-nodes.h @@ -13,7 +13,6 @@ namespace internal { class AlternativeGenerationList; class BoyerMooreLookahead; class GreedyLoopState; -class Label; class NodeVisitor; class QuickCheckDetails; class RegExpCompiler; @@ -204,7 +203,9 @@ class RegExpNode : public ZoneObject { // If we know that the input is one-byte then there are some nodes that can // never match. This method returns a node that can be substituted for // itself, or nullptr if the node can never match. - virtual RegExpNode* FilterOneByte(int depth) { return this; } + virtual RegExpNode* FilterOneByte(int depth, RegExpFlags flags) { + return this; + } // Helper for FilterOneByte. RegExpNode* replacement() { DCHECK(info()->replacement_calculated); @@ -293,7 +294,7 @@ class SeqRegExpNode : public RegExpNode { : RegExpNode(on_success->zone()), on_success_(on_success) {} RegExpNode* on_success() { return on_success_; } void set_on_success(RegExpNode* node) { on_success_ = node; } - RegExpNode* FilterOneByte(int depth) override; + RegExpNode* FilterOneByte(int depth, RegExpFlags flags) override; void FillInBMInfo(Isolate* isolate, int offset, int budget, BoyerMooreLookahead* bm, bool not_at_start) override { on_success_->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start); @@ -301,7 +302,7 @@ class SeqRegExpNode : public RegExpNode { } protected: - RegExpNode* FilterSuccessor(int depth); + RegExpNode* FilterSuccessor(int depth, RegExpFlags flags); private: RegExpNode* on_success_; @@ -405,15 +406,17 @@ class TextNode : public SeqRegExpNode { static TextNode* CreateForCharacterRanges(Zone* zone, ZoneList* ranges, bool read_backward, - RegExpNode* on_success, - JSRegExp::Flags flags); - // Create TextNode for a surrogate pair with a range given for the - // lead and the trail surrogate each. - static TextNode* CreateForSurrogatePair(Zone* zone, CharacterRange lead, + RegExpNode* on_success); + // Create TextNode for a surrogate pair (i.e. match a sequence of two uc16 + // code unit ranges). + static TextNode* CreateForSurrogatePair( + Zone* zone, CharacterRange lead, ZoneList* trail_ranges, + bool read_backward, RegExpNode* on_success); + static TextNode* CreateForSurrogatePair(Zone* zone, + ZoneList* lead_ranges, CharacterRange trail, bool read_backward, - RegExpNode* on_success, - JSRegExp::Flags flags); + RegExpNode* on_success); void Accept(NodeVisitor* visitor) override; void Emit(RegExpCompiler* compiler, Trace* trace) override; void GetQuickCheckDetails(QuickCheckDetails* details, @@ -421,14 +424,15 @@ class TextNode : public SeqRegExpNode { bool not_at_start) override; ZoneList* elements() { return elms_; } bool read_backward() { return read_backward_; } - void MakeCaseIndependent(Isolate* isolate, bool is_one_byte); + void MakeCaseIndependent(Isolate* isolate, bool is_one_byte, + RegExpFlags flags); int GreedyLoopTextLength() override; RegExpNode* GetSuccessorOfOmnivorousTextNode( RegExpCompiler* compiler) override; void FillInBMInfo(Isolate* isolate, int offset, int budget, BoyerMooreLookahead* bm, bool not_at_start) override; void CalculateOffsets(); - RegExpNode* FilterOneByte(int depth) override; + RegExpNode* FilterOneByte(int depth, RegExpFlags flags) override; int Length(); private: @@ -496,7 +500,7 @@ class AssertionNode : public SeqRegExpNode { class BackReferenceNode : public SeqRegExpNode { public: - BackReferenceNode(int start_reg, int end_reg, JSRegExp::Flags flags, + BackReferenceNode(int start_reg, int end_reg, RegExpFlags flags, bool read_backward, RegExpNode* on_success) : SeqRegExpNode(on_success), start_reg_(start_reg), @@ -519,7 +523,7 @@ class BackReferenceNode : public SeqRegExpNode { private: int start_reg_; int end_reg_; - JSRegExp::Flags flags_; + RegExpFlags flags_; bool read_backward_; }; @@ -621,7 +625,7 @@ class ChoiceNode : public RegExpNode { virtual bool try_to_emit_quick_check_for_alternative(bool is_first) { return true; } - RegExpNode* FilterOneByte(int depth) override; + RegExpNode* FilterOneByte(int depth, RegExpFlags flags) override; virtual bool read_backward() { return false; } protected: @@ -693,7 +697,7 @@ class NegativeLookaroundChoiceNode : public ChoiceNode { return !is_first; } void Accept(NodeVisitor* visitor) override; - RegExpNode* FilterOneByte(int depth) override; + RegExpNode* FilterOneByte(int depth, RegExpFlags flags) override; }; class LoopChoiceNode : public ChoiceNode { @@ -726,7 +730,7 @@ class LoopChoiceNode : public ChoiceNode { int min_loop_iterations() const { return min_loop_iterations_; } bool read_backward() override { return read_backward_; } void Accept(NodeVisitor* visitor) override; - RegExpNode* FilterOneByte(int depth) override; + RegExpNode* FilterOneByte(int depth, RegExpFlags flags) override; private: // AddAlternative is made private for loop nodes because alternatives diff --git a/js/src/irregexp/imported/regexp-parser.cc b/js/src/irregexp/imported/regexp-parser.cc index 67ff58e4a212..06ca20715a5a 100644 --- a/js/src/irregexp/imported/regexp-parser.cc +++ b/js/src/irregexp/imported/regexp-parser.cc @@ -4,9 +4,8 @@ #include "irregexp/imported/regexp-parser.h" -#include - #include "irregexp/imported/property-sequences.h" +#include "irregexp/imported/regexp-ast.h" #include "irregexp/imported/regexp-macro-assembler.h" #include "irregexp/imported/regexp.h" @@ -17,14 +16,347 @@ namespace v8 { namespace internal { -RegExpParser::RegExpParser(FlatStringReader* in, JSRegExp::Flags flags, - Isolate* isolate, Zone* zone) - : isolate_(isolate), - zone_(zone), +namespace { + +// Whether we're currently inside the ClassEscape production +// (tc39.es/ecma262/#prod-annexB-CharacterEscape). +enum class InClassEscapeState { + kInClass, + kNotInClass, +}; + +// Accumulates RegExp atoms and assertions into lists of terms and alternatives. +class RegExpBuilder { + public: + RegExpBuilder(Zone* zone, RegExpFlags flags) + : zone_(zone), + flags_(flags), + terms_(ZoneAllocator{zone}), + text_(ZoneAllocator{zone}), + alternatives_(ZoneAllocator{zone}) {} + void AddCharacter(base::uc16 character); + void AddUnicodeCharacter(base::uc32 character); + void AddEscapedUnicodeCharacter(base::uc32 character); + // "Adds" an empty expression. Does nothing except consume a + // following quantifier + void AddEmpty(); + void AddCharacterClass(RegExpCharacterClass* cc); + void AddCharacterClassForDesugaring(base::uc32 c); + void AddAtom(RegExpTree* tree); + void AddTerm(RegExpTree* tree); + void AddAssertion(RegExpTree* tree); + void NewAlternative(); // '|' + bool AddQuantifierToAtom(int min, int max, + RegExpQuantifier::QuantifierType type); + void FlushText(); + RegExpTree* ToRegExp(); + RegExpFlags flags() const { return flags_; } + + bool ignore_case() const { return IsIgnoreCase(flags_); } + bool multiline() const { return IsMultiline(flags_); } + bool dotall() const { return IsDotAll(flags_); } + + private: + static const base::uc16 kNoPendingSurrogate = 0; + void AddLeadSurrogate(base::uc16 lead_surrogate); + void AddTrailSurrogate(base::uc16 trail_surrogate); + void FlushPendingSurrogate(); + void FlushCharacters(); + void FlushTerms(); + bool NeedsDesugaringForUnicode(RegExpCharacterClass* cc); + bool NeedsDesugaringForIgnoreCase(base::uc32 c); + Zone* zone() const { return zone_; } + bool unicode() const { return IsUnicode(flags_); } + + Zone* const zone_; + bool pending_empty_ = false; + const RegExpFlags flags_; + ZoneList* characters_ = nullptr; + base::uc16 pending_surrogate_ = kNoPendingSurrogate; + + using SmallRegExpTreeVector = + base::SmallVector>; + SmallRegExpTreeVector terms_; + SmallRegExpTreeVector text_; + SmallRegExpTreeVector alternatives_; +#ifdef DEBUG + enum { + ADD_NONE, + ADD_CHAR, + ADD_TERM, + ADD_ASSERT, + ADD_ATOM + } last_added_ = ADD_NONE; +#define LAST(x) last_added_ = x; +#else +#define LAST(x) +#endif +}; + +enum SubexpressionType { + INITIAL, + CAPTURE, // All positive values represent captures. + POSITIVE_LOOKAROUND, + NEGATIVE_LOOKAROUND, + GROUPING +}; + +class RegExpParserState : public ZoneObject { + public: + // Push a state on the stack. + RegExpParserState(RegExpParserState* previous_state, + SubexpressionType group_type, + RegExpLookaround::Type lookaround_type, + int disjunction_capture_index, + const ZoneVector* capture_name, + RegExpFlags flags, Zone* zone) + : previous_state_(previous_state), + builder_(zone, flags), + group_type_(group_type), + lookaround_type_(lookaround_type), + disjunction_capture_index_(disjunction_capture_index), + capture_name_(capture_name) {} + // Parser state of containing expression, if any. + RegExpParserState* previous_state() const { return previous_state_; } + bool IsSubexpression() { return previous_state_ != nullptr; } + // RegExpBuilder building this regexp's AST. + RegExpBuilder* builder() { return &builder_; } + // Type of regexp being parsed (parenthesized group or entire regexp). + SubexpressionType group_type() const { return group_type_; } + // Lookahead or Lookbehind. + RegExpLookaround::Type lookaround_type() const { return lookaround_type_; } + // Index in captures array of first capture in this sub-expression, if any. + // Also the capture index of this sub-expression itself, if group_type + // is CAPTURE. + int capture_index() const { return disjunction_capture_index_; } + // The name of the current sub-expression, if group_type is CAPTURE. Only + // used for named captures. + const ZoneVector* capture_name() const { return capture_name_; } + + bool IsNamedCapture() const { return capture_name_ != nullptr; } + + // Check whether the parser is inside a capture group with the given index. + bool IsInsideCaptureGroup(int index) const { + for (const RegExpParserState* s = this; s != nullptr; + s = s->previous_state()) { + if (s->group_type() != CAPTURE) continue; + // Return true if we found the matching capture index. + if (index == s->capture_index()) return true; + // Abort if index is larger than what has been parsed up till this state. + if (index > s->capture_index()) return false; + } + return false; + } + + // Check whether the parser is inside a capture group with the given name. + bool IsInsideCaptureGroup(const ZoneVector* name) const { + DCHECK_NOT_NULL(name); + for (const RegExpParserState* s = this; s != nullptr; + s = s->previous_state()) { + if (s->capture_name() == nullptr) continue; + if (*s->capture_name() == *name) return true; + } + return false; + } + + private: + // Linked list implementation of stack of states. + RegExpParserState* const previous_state_; + // Builder for the stored disjunction. + RegExpBuilder builder_; + // Stored disjunction type (capture, look-ahead or grouping), if any. + const SubexpressionType group_type_; + // Stored read direction. + const RegExpLookaround::Type lookaround_type_; + // Stored disjunction's capture index (if any). + const int disjunction_capture_index_; + // Stored capture name (if any). + const ZoneVector* const capture_name_; +}; + +template +class RegExpParserImpl final { + private: + RegExpParserImpl(const CharT* input, int input_length, RegExpFlags flags, + uintptr_t stack_limit, Zone* zone, + const DisallowGarbageCollection& no_gc); + + bool Parse(RegExpCompileData* result); + + RegExpTree* ParsePattern(); + RegExpTree* ParseDisjunction(); + RegExpTree* ParseGroup(); + + // Parses a {...,...} quantifier and stores the range in the given + // out parameters. + bool ParseIntervalQuantifier(int* min_out, int* max_out); + + // Checks whether the following is a length-digit hexadecimal number, + // and sets the value if it is. + bool ParseHexEscape(int length, base::uc32* value); + bool ParseUnicodeEscape(base::uc32* value); + bool ParseUnlimitedLengthHexNumber(int max_value, base::uc32* value); + + bool ParsePropertyClassName(ZoneVector* name_1, + ZoneVector* name_2); + bool AddPropertyClassRange(ZoneList* add_to, bool negate, + const ZoneVector& name_1, + const ZoneVector& name_2); + + RegExpTree* ParseCharacterClass(const RegExpBuilder* state); + + base::uc32 ParseOctalLiteral(); + + // Tries to parse the input as a back reference. If successful it + // stores the result in the output parameter and returns true. If + // it fails it will push back the characters read so the same characters + // can be reparsed. + bool ParseBackReferenceIndex(int* index_out); + + // Parse inside a class. Either add escaped class to the range, or return + // false and pass parsed single character through |char_out|. + void ParseClassEscape(ZoneList* ranges, Zone* zone, + bool add_unicode_case_equivalents, base::uc32* char_out, + bool* is_class_escape); + // Returns true iff parsing was successful. + bool TryParseCharacterClassEscape(base::uc32 next, + InClassEscapeState in_class_escape_state, + ZoneList* ranges, + Zone* zone, + bool add_unicode_case_equivalents); + // Parses and returns a single escaped character. + base::uc32 ParseCharacterEscape(InClassEscapeState in_class_escape_state, + bool* is_escaped_unicode_character); + + RegExpTree* ReportError(RegExpError error); + void Advance(); + void Advance(int dist); + void RewindByOneCodepoint(); // Rewinds to before the previous Advance(). + void Reset(int pos); + + // Reports whether the pattern might be used as a literal search string. + // Only use if the result of the parse is a single atom node. + bool simple() const { return simple_; } + bool contains_anchor() const { return contains_anchor_; } + void set_contains_anchor() { contains_anchor_ = true; } + int captures_started() const { return captures_started_; } + int position() const { return next_pos_ - 1; } + bool failed() const { return failed_; } + bool unicode() const { return IsUnicode(top_level_flags_) || force_unicode_; } + + static bool IsSyntaxCharacterOrSlash(base::uc32 c); + + static const base::uc32 kEndMarker = (1 << 21); + + private: + // Return the 1-indexed RegExpCapture object, allocate if necessary. + RegExpCapture* GetCapture(int index); + + // Creates a new named capture at the specified index. Must be called exactly + // once for each named capture. Fails if a capture with the same name is + // encountered. + bool CreateNamedCaptureAtIndex(const ZoneVector* name, int index); + + // Parses the name of a capture group (?pattern). The name must adhere + // to IdentifierName in the ECMAScript standard. + const ZoneVector* ParseCaptureGroupName(); + + bool ParseNamedBackReference(RegExpBuilder* builder, + RegExpParserState* state); + RegExpParserState* ParseOpenParenthesis(RegExpParserState* state); + + // After the initial parsing pass, patch corresponding RegExpCapture objects + // into all RegExpBackReferences. This is done after initial parsing in order + // to avoid complicating cases in which references comes before the capture. + void PatchNamedBackReferences(); + + ZoneVector* GetNamedCaptures() const; + + // Returns true iff the pattern contains named captures. May call + // ScanForCaptures to look ahead at the remaining pattern. + bool HasNamedCaptures(InClassEscapeState in_class_escape_state); + + Zone* zone() const { return zone_; } + + base::uc32 current() const { return current_; } + bool has_more() const { return has_more_; } + bool has_next() const { return next_pos_ < input_length(); } + base::uc32 Next(); + template + base::uc32 ReadNext(); + CharT InputAt(int index) const { + DCHECK(0 <= index && index < input_length()); + return input_[index]; + } + int input_length() const { return input_length_; } + void ScanForCaptures(InClassEscapeState in_class_escape_state); + + struct RegExpCaptureNameLess { + bool operator()(const RegExpCapture* lhs, const RegExpCapture* rhs) const { + DCHECK_NOT_NULL(lhs); + DCHECK_NOT_NULL(rhs); + return *lhs->name() < *rhs->name(); + } + }; + + class ForceUnicodeScope final { + public: + explicit ForceUnicodeScope(RegExpParserImpl* parser) + : parser_(parser) { + DCHECK(!parser_->force_unicode_); + parser_->force_unicode_ = true; + } + ~ForceUnicodeScope() { + DCHECK(parser_->force_unicode_); + parser_->force_unicode_ = false; + } + + private: + RegExpParserImpl* const parser_; + }; + + const DisallowGarbageCollection no_gc_; + Zone* const zone_; + RegExpError error_ = RegExpError::kNone; + int error_pos_ = 0; + ZoneList* captures_; + ZoneSet* named_captures_; + ZoneList* named_back_references_; + const CharT* const input_; + const int input_length_; + base::uc32 current_; + const RegExpFlags top_level_flags_; + bool force_unicode_ = false; // Force parser to act as if unicode were set. + int next_pos_; + int captures_started_; + int capture_count_; // Only valid after we have scanned for captures. + bool has_more_; + bool simple_; + bool contains_anchor_; + bool is_scanned_for_captures_; + bool has_named_captures_; // Only valid after we have scanned for captures. + bool failed_; + const uintptr_t stack_limit_; + + friend bool RegExpParser::ParseRegExpFromHeapString(Isolate*, Zone*, + Handle, + RegExpFlags, + RegExpCompileData*); + friend bool RegExpParser::VerifyRegExpSyntax( + Zone*, uintptr_t, const CharT*, int, RegExpFlags, RegExpCompileData*, + const DisallowGarbageCollection&); +}; + +template +RegExpParserImpl::RegExpParserImpl( + const CharT* input, int input_length, RegExpFlags flags, + uintptr_t stack_limit, Zone* zone, const DisallowGarbageCollection& no_gc) + : zone_(zone), captures_(nullptr), named_captures_(nullptr), named_back_references_(nullptr), - in_(in), + input_(input), + input_length_(input_length), current_(kEndMarker), top_level_flags_(flags), next_pos_(0), @@ -35,30 +367,44 @@ RegExpParser::RegExpParser(FlatStringReader* in, JSRegExp::Flags flags, contains_anchor_(false), is_scanned_for_captures_(false), has_named_captures_(false), - failed_(false) { + failed_(false), + stack_limit_(stack_limit) { Advance(); } +template <> template -inline uc32 RegExpParser::ReadNext() { +inline base::uc32 RegExpParserImpl::ReadNext() { int position = next_pos_; - uc32 c0 = in()->Get(position); + base::uc16 c0 = InputAt(position); position++; - // Read the whole surrogate pair in case of unicode flag, if possible. - if (unicode() && position < in()->length() && - unibrow::Utf16::IsLeadSurrogate(static_cast(c0))) { - uc16 c1 = in()->Get(position); - if (unibrow::Utf16::IsTrailSurrogate(c1)) { - c0 = unibrow::Utf16::CombineSurrogatePair(static_cast(c0), c1); - position++; - } - } + DCHECK(!unibrow::Utf16::IsLeadSurrogate(c0)); if (update_position) next_pos_ = position; return c0; } +template <> +template +inline base::uc32 RegExpParserImpl::ReadNext() { + int position = next_pos_; + base::uc16 c0 = InputAt(position); + base::uc32 result = c0; + position++; + // Read the whole surrogate pair in case of unicode flag, if possible. + if (unicode() && position < input_length() && + unibrow::Utf16::IsLeadSurrogate(c0)) { + base::uc16 c1 = InputAt(position); + if (unibrow::Utf16::IsTrailSurrogate(c1)) { + result = unibrow::Utf16::CombineSurrogatePair(c0, c1); + position++; + } + } + if (update_position) next_pos_ = position; + return result; +} -uc32 RegExpParser::Next() { +template +base::uc32 RegExpParserImpl::Next() { if (has_next()) { return ReadNext(); } else { @@ -66,19 +412,14 @@ uc32 RegExpParser::Next() { } } -void RegExpParser::Advance() { +template +void RegExpParserImpl::Advance() { if (has_next()) { - StackLimitCheck check(isolate()); - if (check.HasOverflowed()) { + if (GetCurrentStackPosition() < stack_limit_) { if (FLAG_correctness_fuzzer_suppressions) { FATAL("Aborting on stack overflow"); } ReportError(RegExpError::kStackOverflow); - } else if (zone()->excess_allocation()) { - if (FLAG_correctness_fuzzer_suppressions) { - FATAL("Aborting on excess zone allocation"); - } - ReportError(RegExpError::kTooLarge); } else { current_ = ReadNext(); } @@ -86,27 +427,37 @@ void RegExpParser::Advance() { current_ = kEndMarker; // Advance so that position() points to 1-after-the-last-character. This is // important so that Reset() to this position works correctly. - next_pos_ = in()->length() + 1; + next_pos_ = input_length() + 1; has_more_ = false; } } +template +void RegExpParserImpl::RewindByOneCodepoint() { + if (current() == kEndMarker) return; + // Rewinds by one code point, i.e.: two code units if `current` is outside + // the basic multilingual plane (= composed of a lead and trail surrogate), + // or one code unit otherwise. + const int rewind_by = + current() > unibrow::Utf16::kMaxNonSurrogateCharCode ? -2 : -1; + Advance(rewind_by); // Undo the last Advance. +} -void RegExpParser::Reset(int pos) { +template +void RegExpParserImpl::Reset(int pos) { next_pos_ = pos; - has_more_ = (pos < in()->length()); + has_more_ = (pos < input_length()); Advance(); } -void RegExpParser::Advance(int dist) { +template +void RegExpParserImpl::Advance(int dist) { next_pos_ += dist - 1; Advance(); } - -bool RegExpParser::simple() { return simple_; } - -bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) { +template +bool RegExpParserImpl::IsSyntaxCharacterOrSlash(base::uc32 c) { switch (c) { case '^': case '$': @@ -130,14 +481,15 @@ bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) { return false; } -RegExpTree* RegExpParser::ReportError(RegExpError error) { +template +RegExpTree* RegExpParserImpl::ReportError(RegExpError error) { if (failed_) return nullptr; // Do not overwrite any existing error. failed_ = true; error_ = error; error_pos_ = position(); // Zip to the end to make sure no more input is read. current_ = kEndMarker; - next_pos_ = in()->length(); + next_pos_ = input_length(); return nullptr; } @@ -147,19 +499,19 @@ RegExpTree* RegExpParser::ReportError(RegExpError error) { // Pattern :: // Disjunction -RegExpTree* RegExpParser::ParsePattern() { +template +RegExpTree* RegExpParserImpl::ParsePattern() { RegExpTree* result = ParseDisjunction(CHECK_FAILED); PatchNamedBackReferences(CHECK_FAILED); DCHECK(!has_more()); // If the result of parsing is a literal string atom, and it has the // same length as the input, then the atom is identical to the input. - if (result->IsAtom() && result->AsAtom()->length() == in()->length()) { + if (result->IsAtom() && result->AsAtom()->length() == input_length()) { simple_ = true; } return result; } - // Disjunction :: // Alternative // Alternative | Disjunction @@ -170,7 +522,8 @@ RegExpTree* RegExpParser::ParsePattern() { // Assertion // Atom // Atom Quantifier -RegExpTree* RegExpParser::ParseDisjunction() { +template +RegExpTree* RegExpParserImpl::ParseDisjunction() { // Used to store current state while parsing subexpressions. RegExpParserState initial_state(nullptr, INITIAL, RegExpLookaround::LOOKAHEAD, 0, nullptr, top_level_flags_, zone()); @@ -180,6 +533,7 @@ RegExpTree* RegExpParser::ParseDisjunction() { while (true) { switch (current()) { case kEndMarker: + if (failed()) return nullptr; // E.g. the initial Advance failed. if (state->IsSubexpression()) { // Inside a parenthesized group when hitting end of input. return ReportError(RegExpError::kUnterminatedGroup); @@ -213,12 +567,12 @@ RegExpTree* RegExpParser::ParseDisjunction() { capture->set_body(body); body = capture; } else if (group_type == GROUPING) { - body = zone()->New(body); + body = zone()->template New(body); } else { DCHECK(group_type == POSITIVE_LOOKAROUND || group_type == NEGATIVE_LOOKAROUND); bool is_positive = (group_type == POSITIVE_LOOKAROUND); - body = zone()->New( + body = zone()->template New( body, is_positive, end_capture_index - capture_index, capture_index, state->lookaround_type()); } @@ -243,40 +597,38 @@ RegExpTree* RegExpParser::ParseDisjunction() { return ReportError(RegExpError::kNothingToRepeat); case '^': { Advance(); - if (builder->multiline()) { - builder->AddAssertion(zone()->New( - RegExpAssertion::START_OF_LINE, builder->flags())); - } else { - builder->AddAssertion(zone()->New( - RegExpAssertion::START_OF_INPUT, builder->flags())); - set_contains_anchor(); - } + builder->AddAssertion(zone()->template New( + builder->multiline() ? RegExpAssertion::Type::START_OF_LINE + : RegExpAssertion::Type::START_OF_INPUT)); + set_contains_anchor(); continue; } case '$': { Advance(); - RegExpAssertion::AssertionType assertion_type = - builder->multiline() ? RegExpAssertion::END_OF_LINE - : RegExpAssertion::END_OF_INPUT; + RegExpAssertion::Type assertion_type = + builder->multiline() ? RegExpAssertion::Type::END_OF_LINE + : RegExpAssertion::Type::END_OF_INPUT; builder->AddAssertion( - zone()->New(assertion_type, builder->flags())); + zone()->template New(assertion_type)); continue; } case '.': { Advance(); ZoneList* ranges = - zone()->New>(2, zone()); + zone()->template New>(2, zone()); if (builder->dotall()) { // Everything. - CharacterRange::AddClassEscape('*', ranges, false, zone()); + CharacterRange::AddClassEscape(StandardCharacterSet::kEverything, + ranges, false, zone()); } else { - // Everything except \x0A, \x0D, \u2028 and \u2029 - CharacterRange::AddClassEscape('.', ranges, false, zone()); + // Everything except \x0A, \x0D, \u2028 and \u2029. + CharacterRange::AddClassEscape( + StandardCharacterSet::kNotLineTerminator, ranges, false, zone()); } RegExpCharacterClass* cc = - zone()->New(zone(), ranges, builder->flags()); + zone()->template New(zone(), ranges); builder->AddCharacterClass(cc); break; } @@ -296,68 +648,19 @@ RegExpTree* RegExpParser::ParseDisjunction() { switch (Next()) { case kEndMarker: return ReportError(RegExpError::kEscapeAtEndOfPattern); - case 'b': - Advance(2); - builder->AddAssertion(zone()->New( - RegExpAssertion::BOUNDARY, builder->flags())); - continue; - case 'B': - Advance(2); - builder->AddAssertion(zone()->New( - RegExpAssertion::NON_BOUNDARY, builder->flags())); - continue; // AtomEscape :: - // CharacterClassEscape + // [+UnicodeMode] DecimalEscape + // [~UnicodeMode] DecimalEscape but only if the CapturingGroupNumber + // of DecimalEscape is ≤ NcapturingParens + // CharacterEscape (some cases of this mixed in too) // - // CharacterClassEscape :: one of - // d D s S w W - case 'd': - case 'D': - case 's': - case 'S': - case 'w': - case 'W': { - uc32 c = Next(); - Advance(2); - ZoneList* ranges = - zone()->New>(2, zone()); - CharacterRange::AddClassEscape( - c, ranges, unicode() && builder->ignore_case(), zone()); - RegExpCharacterClass* cc = zone()->New( - zone(), ranges, builder->flags()); - builder->AddCharacterClass(cc); - break; - } - case 'p': - case 'P': { - uc32 p = Next(); - Advance(2); - if (unicode()) { - ZoneList* ranges = - zone()->New>(2, zone()); - ZoneVector name_1(zone()); - ZoneVector name_2(zone()); - if (ParsePropertyClassName(&name_1, &name_2)) { - if (AddPropertyClassRange(ranges, p == 'P', name_1, name_2)) { - RegExpCharacterClass* cc = zone()->New( - zone(), ranges, builder->flags()); - builder->AddCharacterClass(cc); - break; - } - if (p == 'p' && name_2.empty()) { - RegExpTree* sequence = GetPropertySequence(name_1); - if (sequence != nullptr) { - builder->AddAtom(sequence); - break; - } - } - } - return ReportError(RegExpError::kInvalidPropertyName); - } else { - builder->AddCharacter(p); - } - break; - } + // TODO(jgruber): It may make sense to disentangle all the different + // cases and make the structure mirror the spec, e.g. for AtomEscape: + // + // if (TryParseDecimalEscape(...)) return; + // if (TryParseCharacterClassEscape(...)) return; + // if (TryParseCharacterEscape(...)) return; + // if (TryParseGroupName(...)) return; case '1': case '2': case '3': @@ -368,7 +671,8 @@ RegExpTree* RegExpParser::ParseDisjunction() { case '8': case '9': { int index = 0; - bool is_backref = ParseBackReferenceIndex(&index CHECK_FAILED); + const bool is_backref = + ParseBackReferenceIndex(&index CHECK_FAILED); if (is_backref) { if (state->IsInsideCaptureGroup(index)) { // The back reference is inside the capture group it refers to. @@ -379,8 +683,8 @@ RegExpTree* RegExpParser::ParseDisjunction() { builder->AddEmpty(); } else { RegExpCapture* capture = GetCapture(index); - RegExpTree* atom = - zone()->New(capture, builder->flags()); + RegExpTree* atom = zone()->template New( + capture, builder->flags()); builder->AddAtom(atom); } break; @@ -390,7 +694,7 @@ RegExpTree* RegExpParser::ParseDisjunction() { if (unicode()) { return ReportError(RegExpError::kInvalidEscape); } - uc32 first_digit = Next(); + base::uc32 first_digit = Next(); if (first_digit == '8' || first_digit == '9') { builder->AddCharacter(first_digit); Advance(2); @@ -404,103 +708,81 @@ RegExpTree* RegExpParser::ParseDisjunction() { // With /u, decimal escape with leading 0 are not parsed as octal. return ReportError(RegExpError::kInvalidDecimalEscape); } - uc32 octal = ParseOctalLiteral(); + base::uc32 octal = ParseOctalLiteral(); builder->AddCharacter(octal); break; } - // ControlEscape :: one of - // f n r t v - case 'f': + case 'b': Advance(2); - builder->AddCharacter('\f'); - break; - case 'n': + builder->AddAssertion(zone()->template New( + RegExpAssertion::Type::BOUNDARY)); + continue; + case 'B': Advance(2); - builder->AddCharacter('\n'); - break; - case 'r': - Advance(2); - builder->AddCharacter('\r'); - break; - case 't': - Advance(2); - builder->AddCharacter('\t'); - break; - case 'v': - Advance(2); - builder->AddCharacter('\v'); - break; - case 'c': { - Advance(); - uc32 controlLetter = Next(); - // Special case if it is an ASCII letter. - // Convert lower case letters to uppercase. - uc32 letter = controlLetter & ~('a' ^ 'A'); - if (letter < 'A' || 'Z' < letter) { - // controlLetter is not in range 'A'-'Z' or 'a'-'z'. - // Read the backslash as a literal character instead of as - // starting an escape. - // ES#prod-annexB-ExtendedPatternCharacter - if (unicode()) { - // With /u, invalid escapes are not treated as identity escapes. - return ReportError(RegExpError::kInvalidUnicodeEscape); - } - builder->AddCharacter('\\'); + builder->AddAssertion(zone()->template New( + RegExpAssertion::Type::NON_BOUNDARY)); + continue; + // AtomEscape :: + // CharacterClassEscape + case 'd': + case 'D': + case 's': + case 'S': + case 'w': + case 'W': + case 'p': + case 'P': { + base::uc32 next = Next(); + ZoneList* ranges = + zone()->template New>(2, zone()); + bool add_unicode_case_equivalents = + unicode() && builder->ignore_case(); + bool parsed_character_class_escape = TryParseCharacterClassEscape( + next, InClassEscapeState::kNotInClass, ranges, zone(), + add_unicode_case_equivalents CHECK_FAILED); + + if (parsed_character_class_escape) { + RegExpCharacterClass* cc = + zone()->template New(zone(), ranges); + builder->AddCharacterClass(cc); } else { + CHECK(!unicode()); Advance(2); - builder->AddCharacter(controlLetter & 0x1F); + builder->AddCharacter(next); // IdentityEscape. } break; } - case 'x': { - Advance(2); - uc32 value; - if (ParseHexEscape(2, &value)) { - builder->AddCharacter(value); - } else if (!unicode()) { - builder->AddCharacter('x'); - } else { - // With /u, invalid escapes are not treated as identity escapes. - return ReportError(RegExpError::kInvalidEscape); - } - break; - } - case 'u': { - Advance(2); - uc32 value; - if (ParseUnicodeEscape(&value)) { - builder->AddEscapedUnicodeCharacter(value); - } else if (!unicode()) { - builder->AddCharacter('u'); - } else { - // With /u, invalid escapes are not treated as identity escapes. - return ReportError(RegExpError::kInvalidUnicodeEscape); - } - break; - } - case 'k': + // AtomEscape :: + // k GroupName + case 'k': { // Either an identity escape or a named back-reference. The two // interpretations are mutually exclusive: '\k' is interpreted as // an identity escape for non-Unicode patterns without named // capture groups, and as the beginning of a named back-reference // in all other cases. - if (unicode() || HasNamedCaptures()) { + const bool has_named_captures = + HasNamedCaptures(InClassEscapeState::kNotInClass CHECK_FAILED); + if (unicode() || has_named_captures) { Advance(2); ParseNamedBackReference(builder, state CHECK_FAILED); break; } + } V8_FALLTHROUGH; - default: - Advance(); - // With /u, no identity escapes except for syntax characters - // are allowed. Otherwise, all identity escapes are allowed. - if (!unicode() || IsSyntaxCharacterOrSlash(current())) { - builder->AddCharacter(current()); - Advance(); + // AtomEscape :: + // CharacterEscape + default: { + bool is_escaped_unicode_character = false; + base::uc32 c = ParseCharacterEscape( + InClassEscapeState::kNotInClass, + &is_escaped_unicode_character CHECK_FAILED); + if (is_escaped_unicode_character) { + builder->AddEscapedUnicodeCharacter(c); } else { - return ReportError(RegExpError::kInvalidEscape); + builder->AddCharacter(c); } break; + } } break; case '{': { @@ -573,13 +855,12 @@ RegExpTree* RegExpParser::ParseDisjunction() { } } -RegExpParser::RegExpParserState* RegExpParser::ParseOpenParenthesis( +template +RegExpParserState* RegExpParserImpl::ParseOpenParenthesis( RegExpParserState* state) { RegExpLookaround::Type lookaround_type = state->lookaround_type(); bool is_named_capture = false; - JSRegExp::Flags switch_on = JSRegExp::kNone; - JSRegExp::Flags switch_off = JSRegExp::kNone; - const ZoneVector* capture_name = nullptr; + const ZoneVector* capture_name = nullptr; SubexpressionType subexpr_type = CAPTURE; Advance(); if (current() == '?') { @@ -598,68 +879,6 @@ RegExpParser::RegExpParserState* RegExpParser::ParseOpenParenthesis( lookaround_type = RegExpLookaround::LOOKAHEAD; subexpr_type = NEGATIVE_LOOKAROUND; break; - case '-': - case 'i': - case 's': - case 'm': { - if (!FLAG_regexp_mode_modifiers) { - ReportError(RegExpError::kInvalidGroup); - return nullptr; - } - Advance(); - bool flags_sense = true; // Switching on flags. - while (subexpr_type != GROUPING) { - switch (current()) { - case '-': - if (!flags_sense) { - ReportError(RegExpError::kMultipleFlagDashes); - return nullptr; - } - flags_sense = false; - Advance(); - continue; - case 's': - case 'i': - case 'm': { - JSRegExp::Flags bit = JSRegExp::kUnicode; - if (current() == 'i') bit = JSRegExp::kIgnoreCase; - if (current() == 'm') bit = JSRegExp::kMultiline; - if (current() == 's') bit = JSRegExp::kDotAll; - if (((switch_on | switch_off) & bit) != 0) { - ReportError(RegExpError::kRepeatedFlag); - return nullptr; - } - if (flags_sense) { - switch_on |= bit; - } else { - switch_off |= bit; - } - Advance(); - continue; - } - case ')': { - Advance(); - state->builder() - ->FlushText(); // Flush pending text using old flags. - // These (?i)-style flag switches don't put us in a subexpression - // at all, they just modify the flags in the rest of the current - // subexpression. - JSRegExp::Flags flags = - (state->builder()->flags() | switch_on) & ~switch_off; - state->builder()->set_flags(flags); - return state; - } - case ':': - Advance(); - subexpr_type = GROUPING; // Will break us out of the outer loop. - continue; - default: - ReportError(RegExpError::kInvalidFlagGroup); - return nullptr; - } - } - break; - } case '<': Advance(); if (Next() == '=') { @@ -683,7 +902,7 @@ RegExpParser::RegExpParserState* RegExpParser::ParseOpenParenthesis( } } if (subexpr_type == CAPTURE) { - if (captures_started_ >= JSRegExp::kMaxCaptures) { + if (captures_started_ >= RegExpMacroAssembler::kMaxRegisterCount) { ReportError(RegExpError::kTooManyCaptures); return nullptr; } @@ -693,16 +912,16 @@ RegExpParser::RegExpParserState* RegExpParser::ParseOpenParenthesis( capture_name = ParseCaptureGroupName(CHECK_FAILED); } } - JSRegExp::Flags flags = (state->builder()->flags() | switch_on) & ~switch_off; // Store current state and begin new disjunction parsing. - return zone()->New(state, subexpr_type, lookaround_type, - captures_started_, capture_name, flags, - zone()); + return zone()->template New( + state, subexpr_type, lookaround_type, captures_started_, capture_name, + state->builder()->flags(), zone()); } #ifdef DEBUG -// Currently only used in an DCHECK. -static bool IsSpecialClassEscape(uc32 c) { +namespace { + +bool IsSpecialClassEscape(base::uc32 c) { switch (c) { case 'd': case 'D': @@ -715,8 +934,9 @@ static bool IsSpecialClassEscape(uc32 c) { return false; } } -#endif +} // namespace +#endif // In order to know whether an escape is a backreference or not we have to scan // the entire regexp and find the number of capturing parentheses. However we @@ -724,11 +944,28 @@ static bool IsSpecialClassEscape(uc32 c) { // is called when needed. It can see the difference between capturing and // noncapturing parentheses and can skip character classes and backslash-escaped // characters. -void RegExpParser::ScanForCaptures() { +// +// Important: The scanner has to be in a consistent state when calling +// ScanForCaptures, e.g. not in the middle of an escape sequence '\['. +template +void RegExpParserImpl::ScanForCaptures( + InClassEscapeState in_class_escape_state) { DCHECK(!is_scanned_for_captures_); const int saved_position = position(); // Start with captures started previous to current position int capture_count = captures_started(); + // When we start inside a character class, skip everything inside the class. + if (in_class_escape_state == InClassEscapeState::kInClass) { + int c; + while ((c = current()) != kEndMarker) { + Advance(); + if (c == '\\') { + Advance(); + } else { + if (c == ']') break; + } + } + } // Add count of captures after this position. int n; while ((n = current()) != kEndMarker) { @@ -778,8 +1015,8 @@ void RegExpParser::ScanForCaptures() { Reset(saved_position); } - -bool RegExpParser::ParseBackReferenceIndex(int* index_out) { +template +bool RegExpParserImpl::ParseBackReferenceIndex(int* index_out) { DCHECK_EQ('\\', current()); DCHECK('1' <= Next() && Next() <= '9'); // Try to parse a decimal literal that is no greater than the total number @@ -788,10 +1025,10 @@ bool RegExpParser::ParseBackReferenceIndex(int* index_out) { int value = Next() - '0'; Advance(2); while (true) { - uc32 c = current(); + base::uc32 c = current(); if (IsDecimalDigit(c)) { value = 10 * value + (c - '0'); - if (value > JSRegExp::kMaxCaptures) { + if (value > RegExpMacroAssembler::kMaxRegisterCount) { Reset(start); return false; } @@ -801,7 +1038,8 @@ bool RegExpParser::ParseBackReferenceIndex(int* index_out) { } } if (value > captures_started()) { - if (!is_scanned_for_captures_) ScanForCaptures(); + if (!is_scanned_for_captures_) + ScanForCaptures(InClassEscapeState::kNotInClass); if (value > capture_count_) { Reset(start); return false; @@ -811,7 +1049,9 @@ bool RegExpParser::ParseBackReferenceIndex(int* index_out) { return true; } -static void push_code_unit(ZoneVector* v, uint32_t code_unit) { +namespace { + +void push_code_unit(ZoneVector* v, uint32_t code_unit) { if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) { v->push_back(code_unit); } else { @@ -820,53 +1060,83 @@ static void push_code_unit(ZoneVector* v, uint32_t code_unit) { } } -const ZoneVector* RegExpParser::ParseCaptureGroupName() { - ZoneVector* name = zone()->New>(zone()); +} // namespace - bool at_start = true; - while (true) { - uc32 c = current(); - Advance(); +template +const ZoneVector* RegExpParserImpl::ParseCaptureGroupName() { + // Due to special Advance requirements (see the next comment), rewind by one + // such that names starting with a surrogate pair are parsed correctly for + // patterns where the unicode flag is unset. + // + // Note that we use this odd pattern of rewinding the last advance in order + // to adhere to the common parser behavior of expecting `current` to point at + // the first candidate character for a function (e.g. when entering ParseFoo, + // `current` should point at the first character of Foo). + RewindByOneCodepoint(); - // Convert unicode escapes. - if (c == '\\' && current() == 'u') { + ZoneVector* name = + zone()->template New>(zone()); + + { + // Advance behavior inside this function is tricky since + // RegExpIdentifierName explicitly enables unicode (in spec terms, sets +U) + // and thus allows surrogate pairs and \u{}-style escapes even in + // non-unicode patterns. Therefore Advance within the capture group name + // has to force-enable unicode, and outside the name revert to default + // behavior. + ForceUnicodeScope force_unicode(this); + + bool at_start = true; + while (true) { Advance(); - if (!ParseUnicodeEscape(&c)) { - ReportError(RegExpError::kInvalidUnicodeEscape); - return nullptr; + base::uc32 c = current(); + + // Convert unicode escapes. + if (c == '\\' && Next() == 'u') { + Advance(2); + if (!ParseUnicodeEscape(&c)) { + ReportError(RegExpError::kInvalidUnicodeEscape); + return nullptr; + } + RewindByOneCodepoint(); } - } - // The backslash char is misclassified as both ID_Start and ID_Continue. - if (c == '\\') { - ReportError(RegExpError::kInvalidCaptureGroupName); - return nullptr; - } - - if (at_start) { - if (!IsIdentifierStart(c)) { + // The backslash char is misclassified as both ID_Start and ID_Continue. + if (c == '\\') { ReportError(RegExpError::kInvalidCaptureGroupName); return nullptr; } - push_code_unit(name, c); - at_start = false; - } else { - if (c == '>') { - break; - } else if (IsIdentifierPart(c)) { + + if (at_start) { + if (!IsIdentifierStart(c)) { + ReportError(RegExpError::kInvalidCaptureGroupName); + return nullptr; + } push_code_unit(name, c); + at_start = false; } else { - ReportError(RegExpError::kInvalidCaptureGroupName); - return nullptr; + if (c == '>') { + break; + } else if (IsIdentifierPart(c)) { + push_code_unit(name, c); + } else { + ReportError(RegExpError::kInvalidCaptureGroupName); + return nullptr; + } } } } + // This final advance goes back into the state of pointing at the next + // relevant char, which the rest of the parser expects. See also the previous + // comments in this function. + Advance(); return name; } -bool RegExpParser::CreateNamedCaptureAtIndex(const ZoneVector* name, - int index) { +template +bool RegExpParserImpl::CreateNamedCaptureAtIndex( + const ZoneVector* name, int index) { DCHECK(0 < index && index <= captures_started_); DCHECK_NOT_NULL(name); @@ -877,7 +1147,8 @@ bool RegExpParser::CreateNamedCaptureAtIndex(const ZoneVector* name, if (named_captures_ == nullptr) { named_captures_ = - zone_->New>(zone()); + zone_->template New>( + zone()); } else { // Check for duplicates and bail if we find any. @@ -893,8 +1164,9 @@ bool RegExpParser::CreateNamedCaptureAtIndex(const ZoneVector* name, return true; } -bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder, - RegExpParserState* state) { +template +bool RegExpParserImpl::ParseNamedBackReference( + RegExpBuilder* builder, RegExpParserState* state) { // The parser is assumed to be on the '<' in \k. if (current() != '<') { ReportError(RegExpError::kInvalidNamedReference); @@ -902,7 +1174,7 @@ bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder, } Advance(); - const ZoneVector* name = ParseCaptureGroupName(); + const ZoneVector* name = ParseCaptureGroupName(); if (name == nullptr) { return false; } @@ -911,14 +1183,14 @@ bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder, builder->AddEmpty(); } else { RegExpBackReference* atom = - zone()->New(builder->flags()); + zone()->template New(builder->flags()); atom->set_name(name); builder->AddAtom(atom); if (named_back_references_ == nullptr) { named_back_references_ = - zone()->New>(1, zone()); + zone()->template New>(1, zone()); } named_back_references_->Add(atom, zone()); } @@ -926,7 +1198,8 @@ bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder, return true; } -void RegExpParser::PatchNamedBackReferences() { +template +void RegExpParserImpl::PatchNamedBackReferences() { if (named_back_references_ == nullptr) return; if (named_captures_ == nullptr) { @@ -942,7 +1215,8 @@ void RegExpParser::PatchNamedBackReferences() { // Capture used to search the named_captures_ by name, index of the // capture is never used. static const int kInvalidIndex = 0; - RegExpCapture* search_capture = zone()->New(kInvalidIndex); + RegExpCapture* search_capture = + zone()->template New(kInvalidIndex); DCHECK_NULL(search_capture->name()); search_capture->set_name(ref->name()); @@ -959,100 +1233,46 @@ void RegExpParser::PatchNamedBackReferences() { } } -RegExpCapture* RegExpParser::GetCapture(int index) { +template +RegExpCapture* RegExpParserImpl::GetCapture(int index) { // The index for the capture groups are one-based. Its index in the list is // zero-based. - int know_captures = + const int known_captures = is_scanned_for_captures_ ? capture_count_ : captures_started_; - DCHECK(index <= know_captures); + DCHECK(index <= known_captures); if (captures_ == nullptr) { - captures_ = zone()->New>(know_captures, zone()); + captures_ = + zone()->template New>(known_captures, zone()); } - while (captures_->length() < know_captures) { - captures_->Add(zone()->New(captures_->length() + 1), zone()); + while (captures_->length() < known_captures) { + captures_->Add(zone()->template New(captures_->length() + 1), + zone()); } return captures_->at(index - 1); } -namespace { - -struct RegExpCaptureIndexLess { - bool operator()(const RegExpCapture* lhs, const RegExpCapture* rhs) const { - DCHECK_NOT_NULL(lhs); - DCHECK_NOT_NULL(rhs); - return lhs->index() < rhs->index(); - } -}; - -} // namespace - -Handle RegExpParser::CreateCaptureNameMap() { +template +ZoneVector* RegExpParserImpl::GetNamedCaptures() const { if (named_captures_ == nullptr || named_captures_->empty()) { - return Handle(); + return nullptr; } - // Named captures are sorted by name (because the set is used to ensure - // name uniqueness). But the capture name map must to be sorted by index. - - ZoneVector sorted_named_captures( + return zone()->template New>( named_captures_->begin(), named_captures_->end(), zone()); - std::sort(sorted_named_captures.begin(), sorted_named_captures.end(), - RegExpCaptureIndexLess{}); - DCHECK_EQ(sorted_named_captures.size(), named_captures_->size()); - - Factory* factory = isolate()->factory(); - - int len = static_cast(sorted_named_captures.size()) * 2; - Handle array = factory->NewFixedArray(len); - - int i = 0; - for (const auto& capture : sorted_named_captures) { - Vector capture_name(capture->name()->data(), - capture->name()->size()); - // CSA code in ConstructNewResultFromMatchInfo requires these strings to be - // internalized so they can be used as property names in the 'exec' results. - Handle name = factory->InternalizeString(capture_name); - array->set(i * 2, *name); - array->set(i * 2 + 1, Smi::FromInt(capture->index())); - - i++; - } - DCHECK_EQ(i * 2, len); - - return array; } -bool RegExpParser::HasNamedCaptures() { +template +bool RegExpParserImpl::HasNamedCaptures( + InClassEscapeState in_class_escape_state) { if (has_named_captures_ || is_scanned_for_captures_) { return has_named_captures_; } - ScanForCaptures(); + ScanForCaptures(in_class_escape_state); DCHECK(is_scanned_for_captures_); return has_named_captures_; } -bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) { - for (RegExpParserState* s = this; s != nullptr; s = s->previous_state()) { - if (s->group_type() != CAPTURE) continue; - // Return true if we found the matching capture index. - if (index == s->capture_index()) return true; - // Abort if index is larger than what has been parsed up till this state. - if (index > s->capture_index()) return false; - } - return false; -} - -bool RegExpParser::RegExpParserState::IsInsideCaptureGroup( - const ZoneVector* name) { - DCHECK_NOT_NULL(name); - for (RegExpParserState* s = this; s != nullptr; s = s->previous_state()) { - if (s->capture_name() == nullptr) continue; - if (*s->capture_name() == *name) return true; - } - return false; -} - // QuantifierPrefix :: // { DecimalDigits } // { DecimalDigits , } @@ -1060,7 +1280,9 @@ bool RegExpParser::RegExpParserState::IsInsideCaptureGroup( // // Returns true if parsing succeeds, and set the min_out and max_out // values. Values are truncated to RegExpTree::kInfinity if they overflow. -bool RegExpParser::ParseIntervalQuantifier(int* min_out, int* max_out) { +template +bool RegExpParserImpl::ParseIntervalQuantifier(int* min_out, + int* max_out) { DCHECK_EQ(current(), '{'); int start = position(); Advance(); @@ -1119,13 +1341,13 @@ bool RegExpParser::ParseIntervalQuantifier(int* min_out, int* max_out) { return true; } - -uc32 RegExpParser::ParseOctalLiteral() { +template +base::uc32 RegExpParserImpl::ParseOctalLiteral() { DCHECK(('0' <= current() && current() <= '7') || current() == kEndMarker); // For compatibility with some other browsers (not all), we parse // up to three octal digits with a value below 256. // ES#prod-annexB-LegacyOctalEscapeSequence - uc32 value = current() - '0'; + base::uc32 value = current() - '0'; Advance(); if ('0' <= current() && current() <= '7') { value = value * 8 + current() - '0'; @@ -1138,13 +1360,13 @@ uc32 RegExpParser::ParseOctalLiteral() { return value; } - -bool RegExpParser::ParseHexEscape(int length, uc32* value) { +template +bool RegExpParserImpl::ParseHexEscape(int length, base::uc32* value) { int start = position(); - uc32 val = 0; + base::uc32 val = 0; for (int i = 0; i < length; ++i) { - uc32 c = current(); - int d = HexValue(c); + base::uc32 c = current(); + int d = base::HexValue(c); if (d < 0) { Reset(start); return false; @@ -1157,7 +1379,8 @@ bool RegExpParser::ParseHexEscape(int length, uc32* value) { } // This parses RegExpUnicodeEscapeSequence as described in ECMA262. -bool RegExpParser::ParseUnicodeEscape(uc32* value) { +template +bool RegExpParserImpl::ParseUnicodeEscape(base::uc32* value) { // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are // allowed). In the latter case, the number of hex digits between { } is // arbitrary. \ and u have already been read. @@ -1181,11 +1404,11 @@ bool RegExpParser::ParseUnicodeEscape(uc32* value) { int start = position(); if (Next() == 'u') { Advance(2); - uc32 trail; + base::uc32 trail; if (ParseHexEscape(4, &trail) && unibrow::Utf16::IsTrailSurrogate(trail)) { - *value = unibrow::Utf16::CombineSurrogatePair(static_cast(*value), - static_cast(trail)); + *value = unibrow::Utf16::CombineSurrogatePair( + static_cast(*value), static_cast(trail)); return true; } } @@ -1370,10 +1593,11 @@ bool IsUnicodePropertyValueCharacter(char c) { return (c == '_'); } -} // anonymous namespace +} // namespace -bool RegExpParser::ParsePropertyClassName(ZoneVector* name_1, - ZoneVector* name_2) { +template +bool RegExpParserImpl::ParsePropertyClassName(ZoneVector* name_1, + ZoneVector* name_2) { DCHECK(name_1->empty()); DCHECK(name_2->empty()); // Parse the property class as follows: @@ -1410,10 +1634,10 @@ bool RegExpParser::ParsePropertyClassName(ZoneVector* name_1, return true; } -bool RegExpParser::AddPropertyClassRange(ZoneList* add_to, - bool negate, - const ZoneVector& name_1, - const ZoneVector& name_2) { +template +bool RegExpParserImpl::AddPropertyClassRange( + ZoneList* add_to, bool negate, + const ZoneVector& name_1, const ZoneVector& name_2) { if (name_2.empty()) { // First attempt to interpret as general category property value name. const char* name = name_1.data(); @@ -1450,119 +1674,58 @@ bool RegExpParser::AddPropertyClassRange(ZoneList* add_to, } } -RegExpTree* RegExpParser::GetPropertySequence(const ZoneVector& name_1) { - if (!FLAG_harmony_regexp_sequence) return nullptr; - const char* name = name_1.data(); - const uc32* sequence_list = nullptr; - JSRegExp::Flags flags = JSRegExp::kUnicode; - if (NameEquals(name, "Emoji_Flag_Sequence")) { - sequence_list = UnicodePropertySequences::kEmojiFlagSequences; - } else if (NameEquals(name, "Emoji_Tag_Sequence")) { - sequence_list = UnicodePropertySequences::kEmojiTagSequences; - } else if (NameEquals(name, "Emoji_ZWJ_Sequence")) { - sequence_list = UnicodePropertySequences::kEmojiZWJSequences; - } - if (sequence_list != nullptr) { - // TODO(yangguo): this creates huge regexp code. Alternative to this is - // to create a new operator that checks for these sequences at runtime. - RegExpBuilder builder(zone(), flags); - while (true) { // Iterate through list of sequences. - while (*sequence_list != 0) { // Iterate through sequence. - builder.AddUnicodeCharacter(*sequence_list); - sequence_list++; - } - sequence_list++; - if (*sequence_list == 0) break; - builder.NewAlternative(); - } - return builder.ToRegExp(); - } - - if (NameEquals(name, "Emoji_Keycap_Sequence")) { - // https://unicode.org/reports/tr51/#def_emoji_keycap_sequence - // emoji_keycap_sequence := [0-9#*] \x{FE0F 20E3} - RegExpBuilder builder(zone(), flags); - ZoneList* prefix_ranges = - zone()->New>(2, zone()); - prefix_ranges->Add(CharacterRange::Range('0', '9'), zone()); - prefix_ranges->Add(CharacterRange::Singleton('#'), zone()); - prefix_ranges->Add(CharacterRange::Singleton('*'), zone()); - builder.AddCharacterClass( - zone()->New(zone(), prefix_ranges, flags)); - builder.AddCharacter(0xFE0F); - builder.AddCharacter(0x20E3); - return builder.ToRegExp(); - } else if (NameEquals(name, "Emoji_Modifier_Sequence")) { - // https://unicode.org/reports/tr51/#def_emoji_modifier_sequence - // emoji_modifier_sequence := emoji_modifier_base emoji_modifier - RegExpBuilder builder(zone(), flags); - ZoneList* modifier_base_ranges = - zone()->New>(2, zone()); - LookupPropertyValueName(UCHAR_EMOJI_MODIFIER_BASE, "Y", false, - modifier_base_ranges, zone()); - builder.AddCharacterClass( - zone()->New(zone(), modifier_base_ranges, flags)); - ZoneList* modifier_ranges = - zone()->New>(2, zone()); - LookupPropertyValueName(UCHAR_EMOJI_MODIFIER, "Y", false, modifier_ranges, - zone()); - builder.AddCharacterClass( - zone()->New(zone(), modifier_ranges, flags)); - return builder.ToRegExp(); - } - - return nullptr; -} - #else // V8_INTL_SUPPORT -bool RegExpParser::ParsePropertyClassName(ZoneVector* name_1, - ZoneVector* name_2) { +template +bool RegExpParserImpl::ParsePropertyClassName(ZoneVector* name_1, + ZoneVector* name_2) { return false; } -bool RegExpParser::AddPropertyClassRange(ZoneList* add_to, - bool negate, - const ZoneVector& name_1, - const ZoneVector& name_2) { +template +bool RegExpParserImpl::AddPropertyClassRange( + ZoneList* add_to, bool negate, + const ZoneVector& name_1, const ZoneVector& name_2) { return false; } -RegExpTree* RegExpParser::GetPropertySequence(const ZoneVector& name) { - return nullptr; -} - #endif // V8_INTL_SUPPORT -bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) { - uc32 x = 0; - int d = HexValue(current()); +template +bool RegExpParserImpl::ParseUnlimitedLengthHexNumber(int max_value, + base::uc32* value) { + base::uc32 x = 0; + int d = base::HexValue(current()); if (d < 0) { return false; } while (d >= 0) { x = x * 16 + d; - if (x > static_cast(max_value)) { + if (x > static_cast(max_value)) { return false; } Advance(); - d = HexValue(current()); + d = base::HexValue(current()); } *value = x; return true; } - -uc32 RegExpParser::ParseClassCharacterEscape() { +// https://tc39.es/ecma262/#prod-CharacterEscape +template +base::uc32 RegExpParserImpl::ParseCharacterEscape( + InClassEscapeState in_class_escape_state, + bool* is_escaped_unicode_character) { DCHECK_EQ('\\', current()); DCHECK(has_next() && !IsSpecialClassEscape(Next())); + Advance(); - switch (current()) { - case 'b': - Advance(); - return '\b'; - // ControlEscape :: one of - // f n r t v + + const base::uc32 c = current(); + switch (c) { + // CharacterEscape :: + // ControlEscape :: one of + // f n r t v case 'f': Advance(); return '\f'; @@ -1578,12 +1741,11 @@ uc32 RegExpParser::ParseClassCharacterEscape() { case 'v': Advance(); return '\v'; + // CharacterEscape :: + // c ControlLetter case 'c': { - uc32 controlLetter = Next(); - uc32 letter = controlLetter & ~('A' ^ 'a'); - // Inside a character class, we also accept digits and underscore as - // control characters, unless with /u. See Annex B: - // ES#prod-annexB-ClassControlLetter + base::uc32 controlLetter = Next(); + base::uc32 letter = controlLetter & ~('A' ^ 'a'); if (letter >= 'A' && letter <= 'Z') { Advance(2); // Control letters mapped to ASCII control characters in the range @@ -1592,22 +1754,29 @@ uc32 RegExpParser::ParseClassCharacterEscape() { } if (unicode()) { // With /u, invalid escapes are not treated as identity escapes. - ReportError(RegExpError::kInvalidClassEscape); + ReportError(RegExpError::kInvalidUnicodeEscape); return 0; } - if ((controlLetter >= '0' && controlLetter <= '9') || - controlLetter == '_') { - Advance(2); - return controlLetter & 0x1F; + if (in_class_escape_state == InClassEscapeState::kInClass) { + // Inside a character class, we also accept digits and underscore as + // control characters, unless with /u. See Annex B: + // ES#prod-annexB-ClassControlLetter + if ((controlLetter >= '0' && controlLetter <= '9') || + controlLetter == '_') { + Advance(2); + return controlLetter & 0x1F; + } } // We match JSC in reading the backslash as a literal // character instead of as starting an escape. - // TODO(v8:6201): Not yet covered by the spec. return '\\'; } + // CharacterEscape :: + // 0 [lookahead ∉ DecimalDigit] + // [~UnicodeMode] LegacyOctalEscapeSequence case '0': - // With /u, \0 is interpreted as NUL if not followed by another digit. - if (unicode() && !(Next() >= '0' && Next() <= '9')) { + // \0 is interpreted as NUL if not followed by another digit. + if (Next() < '0' || Next() > '9') { Advance(); return 0; } @@ -1629,9 +1798,11 @@ uc32 RegExpParser::ParseClassCharacterEscape() { return 0; } return ParseOctalLiteral(); + // CharacterEscape :: + // HexEscapeSequence case 'x': { Advance(); - uc32 value; + base::uc32 value; if (ParseHexEscape(2, &value)) return value; if (unicode()) { // With /u, invalid escapes are not treated as identity escapes. @@ -1642,10 +1813,15 @@ uc32 RegExpParser::ParseClassCharacterEscape() { // as an identity escape. return 'x'; } + // CharacterEscape :: + // RegExpUnicodeEscapeSequence [?UnicodeMode] case 'u': { Advance(); - uc32 value; - if (ParseUnicodeEscape(&value)) return value; + base::uc32 value; + if (ParseUnicodeEscape(&value)) { + *is_escaped_unicode_character = true; + return value; + } if (unicode()) { // With /u, invalid escapes are not treated as identity escapes. ReportError(RegExpError::kInvalidUnicodeEscape); @@ -1655,71 +1831,131 @@ uc32 RegExpParser::ParseClassCharacterEscape() { // as an identity escape. return 'u'; } - default: { - uc32 result = current(); - // With /u, no identity escapes except for syntax characters and '-' are - // allowed. Otherwise, all identity escapes are allowed. - if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') { - Advance(); - return result; - } + default: + break; + } + + // CharacterEscape :: + // IdentityEscape[?UnicodeMode, ?N] + // + // * With /u, no identity escapes except for syntax characters are + // allowed. + // * Without /u: + // * '\c' is not an IdentityEscape. + // * '\k' is not an IdentityEscape when named captures exist. + // * Otherwise, all identity escapes are allowed. + if (unicode()) { + if (!IsSyntaxCharacterOrSlash(c)) { ReportError(RegExpError::kInvalidEscape); return 0; } + Advance(); + return c; } - UNREACHABLE(); + DCHECK(!unicode()); + if (c == 'c') { + ReportError(RegExpError::kInvalidEscape); + return 0; + } + Advance(); + // Note: It's important to Advance before the HasNamedCaptures call s.t. we + // don't start scanning in the middle of an escape. + if (c == 'k' && HasNamedCaptures(in_class_escape_state)) { + ReportError(RegExpError::kInvalidEscape); + return 0; + } + return c; } -void RegExpParser::ParseClassEscape(ZoneList* ranges, - Zone* zone, - bool add_unicode_case_equivalents, - uc32* char_out, bool* is_class_escape) { - uc32 current_char = current(); - if (current_char == '\\') { - switch (Next()) { - case 'w': - case 'W': - case 'd': - case 'D': - case 's': - case 'S': { - CharacterRange::AddClassEscape(static_cast(Next()), ranges, - add_unicode_case_equivalents, zone); +// https://tc39.es/ecma262/#prod-ClassEscape +template +void RegExpParserImpl::ParseClassEscape( + ZoneList* ranges, Zone* zone, + bool add_unicode_case_equivalents, base::uc32* char_out, + bool* is_class_escape) { + *is_class_escape = false; + + if (current() != '\\') { + // Not a ClassEscape. + *char_out = current(); + Advance(); + return; + } + + const base::uc32 next = Next(); + switch (next) { + case 'b': + *char_out = '\b'; + Advance(2); + return; + case '-': + if (unicode()) { + *char_out = next; Advance(2); - *is_class_escape = true; return; } - case kEndMarker: - ReportError(RegExpError::kEscapeAtEndOfPattern); - return; - case 'p': - case 'P': - if (unicode()) { - bool negate = Next() == 'P'; - Advance(2); - ZoneVector name_1(zone); - ZoneVector name_2(zone); - if (!ParsePropertyClassName(&name_1, &name_2) || - !AddPropertyClassRange(ranges, negate, name_1, name_2)) { - ReportError(RegExpError::kInvalidClassPropertyName); - } - *is_class_escape = true; - return; - } - break; - default: - break; + break; + case kEndMarker: + ReportError(RegExpError::kEscapeAtEndOfPattern); + return; + default: + break; + } + + static constexpr InClassEscapeState kInClassEscape = + InClassEscapeState::kInClass; + *is_class_escape = TryParseCharacterClassEscape( + next, kInClassEscape, ranges, zone, add_unicode_case_equivalents); + if (*is_class_escape) return; + + bool dummy = false; // Unused. + *char_out = ParseCharacterEscape(kInClassEscape, &dummy); +} + +// https://tc39.es/ecma262/#prod-CharacterClassEscape +template +bool RegExpParserImpl::TryParseCharacterClassEscape( + base::uc32 next, InClassEscapeState in_class_escape_state, + ZoneList* ranges, Zone* zone, + bool add_unicode_case_equivalents) { + DCHECK_EQ(current(), '\\'); + DCHECK_EQ(Next(), next); + + switch (next) { + case 'd': + case 'D': + case 's': + case 'S': + case 'w': + case 'W': + CharacterRange::AddClassEscape(static_cast(next), + ranges, add_unicode_case_equivalents, + zone); + Advance(2); + return true; + case 'p': + case 'P': { + if (!unicode()) return false; + bool negate = next == 'P'; + Advance(2); + ZoneVector name_1(zone); + ZoneVector name_2(zone); + if (!ParsePropertyClassName(&name_1, &name_2) || + !AddPropertyClassRange(ranges, negate, name_1, name_2)) { + ReportError(in_class_escape_state == InClassEscapeState::kInClass + ? RegExpError::kInvalidClassPropertyName + : RegExpError::kInvalidPropertyName); + } + return true; } - *char_out = ParseClassCharacterEscape(); - *is_class_escape = false; - } else { - Advance(); - *char_out = current_char; - *is_class_escape = false; + default: + return false; } } -RegExpTree* RegExpParser::ParseCharacterClass(const RegExpBuilder* builder) { +template +RegExpTree* RegExpParserImpl::ParseCharacterClass( + const RegExpBuilder* builder) { DCHECK_EQ(current(), '['); Advance(); bool is_negated = false; @@ -1728,10 +1964,10 @@ RegExpTree* RegExpParser::ParseCharacterClass(const RegExpBuilder* builder) { Advance(); } ZoneList* ranges = - zone()->New>(2, zone()); + zone()->template New>(2, zone()); bool add_unicode_case_equivalents = unicode() && builder->ignore_case(); while (has_more() && current() != ']') { - uc32 char_1, char_2; + base::uc32 char_1, char_2; bool is_class_1, is_class_2; ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_1, &is_class_1 CHECK_FAILED); @@ -1774,103 +2010,65 @@ RegExpTree* RegExpParser::ParseCharacterClass(const RegExpBuilder* builder) { Advance(); RegExpCharacterClass::CharacterClassFlags character_class_flags; if (is_negated) character_class_flags = RegExpCharacterClass::NEGATED; - return zone()->New(zone(), ranges, builder->flags(), - character_class_flags); + return zone()->template New(zone(), ranges, + character_class_flags); } - #undef CHECK_FAILED -bool RegExpParser::Parse(RegExpCompileData* result, - const DisallowGarbageCollection&) { - DCHECK(result != nullptr); +template +bool RegExpParserImpl::Parse(RegExpCompileData* result) { + DCHECK_NOT_NULL(result); RegExpTree* tree = ParsePattern(); + if (failed()) { - DCHECK(tree == nullptr); - DCHECK(error_ != RegExpError::kNone); + DCHECK_NULL(tree); + DCHECK_NE(error_, RegExpError::kNone); result->error = error_; result->error_pos = error_pos_; - } else { - DCHECK(tree != nullptr); - DCHECK(error_ == RegExpError::kNone); - if (FLAG_trace_regexp_parser) { - StdoutStream os; - tree->Print(os, zone()); - os << "\n"; - } - result->tree = tree; - int capture_count = captures_started(); - result->simple = tree->IsAtom() && simple() && capture_count == 0; - result->contains_anchor = contains_anchor(); - result->capture_count = capture_count; + return false; } - return !failed(); -} -bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone, - FlatStringReader* input, JSRegExp::Flags flags, - RegExpCompileData* result) { - RegExpParser parser(input, flags, isolate, zone); - bool success; - { - DisallowGarbageCollection no_gc; - success = parser.Parse(result, no_gc); + DCHECK_NOT_NULL(tree); + DCHECK_EQ(error_, RegExpError::kNone); + if (FLAG_trace_regexp_parser) { + StdoutStream os; + tree->Print(os, zone()); + os << "\n"; } - if (success) { - result->capture_name_map = parser.CreateCaptureNameMap(); - } - return success; + + result->tree = tree; + const int capture_count = captures_started(); + result->simple = tree->IsAtom() && simple() && capture_count == 0; + result->contains_anchor = contains_anchor(); + result->capture_count = capture_count; + result->named_captures = GetNamedCaptures(); + return true; } -bool RegExpParser::VerifyRegExpSyntax(Isolate* isolate, Zone* zone, - FlatStringReader* input, - JSRegExp::Flags flags, - RegExpCompileData* result, - const DisallowGarbageCollection& no_gc) { - RegExpParser parser(input, flags, isolate, zone); - return parser.Parse(result, no_gc); -} - -RegExpBuilder::RegExpBuilder(Zone* zone, JSRegExp::Flags flags) - : zone_(zone), - pending_empty_(false), - flags_(flags), - characters_(nullptr), - pending_surrogate_(kNoPendingSurrogate), - terms_(), - alternatives_() -#ifdef DEBUG - , - last_added_(ADD_NONE) -#endif -{ -} - - -void RegExpBuilder::AddLeadSurrogate(uc16 lead_surrogate) { +void RegExpBuilder::AddLeadSurrogate(base::uc16 lead_surrogate) { DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate)); FlushPendingSurrogate(); // Hold onto the lead surrogate, waiting for a trail surrogate to follow. pending_surrogate_ = lead_surrogate; } - -void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) { +void RegExpBuilder::AddTrailSurrogate(base::uc16 trail_surrogate) { DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate)); if (pending_surrogate_ != kNoPendingSurrogate) { - uc16 lead_surrogate = pending_surrogate_; + base::uc16 lead_surrogate = pending_surrogate_; pending_surrogate_ = kNoPendingSurrogate; DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate)); - uc32 combined = + base::uc32 combined = unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate); if (NeedsDesugaringForIgnoreCase(combined)) { AddCharacterClassForDesugaring(combined); } else { - ZoneList surrogate_pair(2, zone()); + ZoneList surrogate_pair(2, zone()); surrogate_pair.Add(lead_surrogate, zone()); surrogate_pair.Add(trail_surrogate, zone()); RegExpAtom* atom = - zone()->New(surrogate_pair.ToConstVector(), flags_); + zone()->New(surrogate_pair.ToConstVector()); AddAtom(atom); } } else { @@ -1879,63 +2077,59 @@ void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) { } } - void RegExpBuilder::FlushPendingSurrogate() { if (pending_surrogate_ != kNoPendingSurrogate) { DCHECK(unicode()); - uc32 c = pending_surrogate_; + base::uc32 c = pending_surrogate_; pending_surrogate_ = kNoPendingSurrogate; AddCharacterClassForDesugaring(c); } } - void RegExpBuilder::FlushCharacters() { FlushPendingSurrogate(); pending_empty_ = false; if (characters_ != nullptr) { - RegExpTree* atom = - zone()->New(characters_->ToConstVector(), flags_); + RegExpTree* atom = zone()->New(characters_->ToConstVector()); characters_ = nullptr; - text_.Add(atom, zone()); + text_.emplace_back(atom); LAST(ADD_ATOM); } } - void RegExpBuilder::FlushText() { FlushCharacters(); - int num_text = text_.length(); + size_t num_text = text_.size(); if (num_text == 0) { return; } else if (num_text == 1) { - terms_.Add(text_.last(), zone()); + terms_.emplace_back(text_.back()); } else { RegExpText* text = zone()->New(zone()); - for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone()); - terms_.Add(text, zone()); + for (size_t i = 0; i < num_text; i++) { + text_[i]->AppendToText(text, zone()); + } + terms_.emplace_back(text); } - text_.Clear(); + text_.clear(); } - -void RegExpBuilder::AddCharacter(uc16 c) { +void RegExpBuilder::AddCharacter(base::uc16 c) { FlushPendingSurrogate(); pending_empty_ = false; if (NeedsDesugaringForIgnoreCase(c)) { AddCharacterClassForDesugaring(c); } else { if (characters_ == nullptr) { - characters_ = zone()->New>(4, zone()); + characters_ = zone()->New>(4, zone()); } characters_->Add(c, zone()); LAST(ADD_CHAR); } } - -void RegExpBuilder::AddUnicodeCharacter(uc32 c) { - if (c > static_cast(unibrow::Utf16::kMaxNonSurrogateCharCode)) { +void RegExpBuilder::AddUnicodeCharacter(base::uc32 c) { + if (c > static_cast(unibrow::Utf16::kMaxNonSurrogateCharCode)) { DCHECK(unicode()); AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c)); AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c)); @@ -1944,11 +2138,11 @@ void RegExpBuilder::AddUnicodeCharacter(uc32 c) { } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) { AddTrailSurrogate(c); } else { - AddCharacter(static_cast(c)); + AddCharacter(static_cast(c)); } } -void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) { +void RegExpBuilder::AddEscapedUnicodeCharacter(base::uc32 character) { // A lead or trail surrogate parsed via escape sequence will not // pair up with any preceding lead or following trail surrogate. FlushPendingSurrogate(); @@ -1958,7 +2152,6 @@ void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) { void RegExpBuilder::AddEmpty() { pending_empty_ = true; } - void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) { if (NeedsDesugaringForUnicode(cc)) { // With /u, character class needs to be desugared, so it @@ -1969,13 +2162,11 @@ void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) { } } -void RegExpBuilder::AddCharacterClassForDesugaring(uc32 c) { +void RegExpBuilder::AddCharacterClassForDesugaring(base::uc32 c) { AddTerm(zone()->New( - zone(), CharacterRange::List(zone(), CharacterRange::Singleton(c)), - flags_)); + zone(), CharacterRange::List(zone(), CharacterRange::Singleton(c)))); } - void RegExpBuilder::AddAtom(RegExpTree* term) { if (term->IsEmpty()) { AddEmpty(); @@ -1983,49 +2174,46 @@ void RegExpBuilder::AddAtom(RegExpTree* term) { } if (term->IsTextElement()) { FlushCharacters(); - text_.Add(term, zone()); + text_.emplace_back(term); } else { FlushText(); - terms_.Add(term, zone()); + terms_.emplace_back(term); } LAST(ADD_ATOM); } - void RegExpBuilder::AddTerm(RegExpTree* term) { FlushText(); - terms_.Add(term, zone()); + terms_.emplace_back(term); LAST(ADD_ATOM); } - void RegExpBuilder::AddAssertion(RegExpTree* assert) { FlushText(); - terms_.Add(assert, zone()); + terms_.emplace_back(assert); LAST(ADD_ASSERT); } - void RegExpBuilder::NewAlternative() { FlushTerms(); } - void RegExpBuilder::FlushTerms() { FlushText(); - int num_terms = terms_.length(); + size_t num_terms = terms_.size(); RegExpTree* alternative; if (num_terms == 0) { alternative = zone()->New(); } else if (num_terms == 1) { - alternative = terms_.last(); + alternative = terms_.back(); } else { - alternative = zone()->New(terms_.GetList(zone())); + alternative = + zone()->New(zone()->New>( + base::VectorOf(terms_.begin(), terms_.size()), zone())); } - alternatives_.Add(alternative, zone()); - terms_.Clear(); + alternatives_.emplace_back(alternative); + terms_.clear(); LAST(ADD_NONE); } - bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) { if (!unicode()) return false; // TODO(yangguo): we could be smarter than this. Case-insensitivity does not @@ -2035,8 +2223,8 @@ bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) { ZoneList* ranges = cc->ranges(zone()); CharacterRange::Canonicalize(ranges); for (int i = ranges->length() - 1; i >= 0; i--) { - uc32 from = ranges->at(i).from(); - uc32 to = ranges->at(i).to(); + base::uc32 from = ranges->at(i).from(); + base::uc32 to = ranges->at(i).to(); // Check for non-BMP characters. if (to >= kNonBmpStart) return true; // Check for lone surrogates. @@ -2045,8 +2233,7 @@ bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) { return false; } - -bool RegExpBuilder::NeedsDesugaringForIgnoreCase(uc32 c) { +bool RegExpBuilder::NeedsDesugaringForIgnoreCase(base::uc32 c) { #ifdef V8_INTL_SUPPORT if (unicode() && ignore_case()) { icu::UnicodeSet set(c, c); @@ -2060,13 +2247,13 @@ bool RegExpBuilder::NeedsDesugaringForIgnoreCase(uc32 c) { return false; } - RegExpTree* RegExpBuilder::ToRegExp() { FlushTerms(); - int num_alternatives = alternatives_.length(); + size_t num_alternatives = alternatives_.size(); if (num_alternatives == 0) return zone()->New(); - if (num_alternatives == 1) return alternatives_.last(); - return zone()->New(alternatives_.GetList(zone())); + if (num_alternatives == 1) return alternatives_.back(); + return zone()->New(zone()->New>( + base::VectorOf(alternatives_.begin(), alternatives_.size()), zone())); } bool RegExpBuilder::AddQuantifierToAtom( @@ -2080,23 +2267,26 @@ bool RegExpBuilder::AddQuantifierToAtom( if (characters_ != nullptr) { DCHECK(last_added_ == ADD_CHAR); // Last atom was character. - Vector char_vector = characters_->ToConstVector(); + base::Vector char_vector = characters_->ToConstVector(); int num_chars = char_vector.length(); if (num_chars > 1) { - Vector prefix = char_vector.SubVector(0, num_chars - 1); - text_.Add(zone()->New(prefix, flags_), zone()); + base::Vector prefix = + char_vector.SubVector(0, num_chars - 1); + text_.emplace_back(zone()->New(prefix)); char_vector = char_vector.SubVector(num_chars - 1, num_chars); } characters_ = nullptr; - atom = zone()->New(char_vector, flags_); + atom = zone()->New(char_vector); FlushText(); - } else if (text_.length() > 0) { + } else if (text_.size() > 0) { DCHECK(last_added_ == ADD_ATOM); - atom = text_.RemoveLast(); + atom = text_.back(); + text_.pop_back(); FlushText(); - } else if (terms_.length() > 0) { + } else if (terms_.size() > 0) { DCHECK(last_added_ == ADD_ATOM); - atom = terms_.RemoveLast(); + atom = terms_.back(); + terms_.pop_back(); if (atom->IsLookaround()) { // With /u, lookarounds are not quantifiable. if (unicode()) return false; @@ -2111,18 +2301,65 @@ bool RegExpBuilder::AddQuantifierToAtom( if (min == 0) { return true; } - terms_.Add(atom, zone()); + terms_.emplace_back(atom); return true; } } else { // Only call immediately after adding an atom or character! UNREACHABLE(); } - terms_.Add(zone()->New(min, max, quantifier_type, atom), - zone()); + terms_.emplace_back( + zone()->New(min, max, quantifier_type, atom)); LAST(ADD_TERM); return true; } +template class RegExpParserImpl; +template class RegExpParserImpl; + +} // namespace + +// static +bool RegExpParser::ParseRegExpFromHeapString(Isolate* isolate, Zone* zone, + Handle input, + RegExpFlags flags, + RegExpCompileData* result) { + DisallowGarbageCollection no_gc; + uintptr_t stack_limit = isolate->stack_guard()->real_climit(); + String::FlatContent content = input->GetFlatContent(no_gc); + if (content.IsOneByte()) { + base::Vector v = content.ToOneByteVector(); + return RegExpParserImpl{v.begin(), v.length(), flags, + stack_limit, zone, no_gc} + .Parse(result); + } else { + base::Vector v = content.ToUC16Vector(); + return RegExpParserImpl{v.begin(), v.length(), flags, + stack_limit, zone, no_gc} + .Parse(result); + } +} + +// static +template +bool RegExpParser::VerifyRegExpSyntax(Zone* zone, uintptr_t stack_limit, + const CharT* input, int input_length, + RegExpFlags flags, + RegExpCompileData* result, + const DisallowGarbageCollection& no_gc) { + return RegExpParserImpl{input, input_length, flags, + stack_limit, zone, no_gc} + .Parse(result); +} + +template bool RegExpParser::VerifyRegExpSyntax( + Zone*, uintptr_t, const uint8_t*, int, RegExpFlags, RegExpCompileData*, + const DisallowGarbageCollection&); +template bool RegExpParser::VerifyRegExpSyntax( + Zone*, uintptr_t, const base::uc16*, int, RegExpFlags, RegExpCompileData*, + const DisallowGarbageCollection&); + +#undef LAST + } // namespace internal } // namespace v8 diff --git a/js/src/irregexp/imported/regexp-parser.h b/js/src/irregexp/imported/regexp-parser.h index 61ecb7b18c1d..1e45d9753230 100644 --- a/js/src/irregexp/imported/regexp-parser.h +++ b/js/src/irregexp/imported/regexp-parser.h @@ -5,363 +5,27 @@ #ifndef V8_REGEXP_REGEXP_PARSER_H_ #define V8_REGEXP_REGEXP_PARSER_H_ -#include "irregexp/imported/regexp-ast.h" -#include "irregexp/imported/regexp-error.h" +#include "irregexp/RegExpShim.h" namespace v8 { namespace internal { +class String; +class Zone; + struct RegExpCompileData; -// A BufferedZoneList is an automatically growing list, just like (and backed -// by) a ZoneList, that is optimized for the case of adding and removing -// a single element. The last element added is stored outside the backing list, -// and if no more than one element is ever added, the ZoneList isn't even -// allocated. -// Elements must not be nullptr pointers. -template -class BufferedZoneList { +class V8_EXPORT_PRIVATE RegExpParser : public AllStatic { public: - BufferedZoneList() : list_(nullptr), last_(nullptr) {} + static bool ParseRegExpFromHeapString(Isolate* isolate, Zone* zone, + Handle input, RegExpFlags flags, + RegExpCompileData* result); - // Adds element at end of list. This element is buffered and can - // be read using last() or removed using RemoveLast until a new Add or until - // RemoveLast or GetList has been called. - void Add(T* value, Zone* zone) { - if (last_ != nullptr) { - if (list_ == nullptr) { - list_ = zone->New>(initial_size, zone); - } - list_->Add(last_, zone); - } - last_ = value; - } - - T* last() { - DCHECK(last_ != nullptr); - return last_; - } - - T* RemoveLast() { - DCHECK(last_ != nullptr); - T* result = last_; - if ((list_ != nullptr) && (list_->length() > 0)) - last_ = list_->RemoveLast(); - else - last_ = nullptr; - return result; - } - - T* Get(int i) { - DCHECK((0 <= i) && (i < length())); - if (list_ == nullptr) { - DCHECK_EQ(0, i); - return last_; - } else { - if (i == list_->length()) { - DCHECK(last_ != nullptr); - return last_; - } else { - return list_->at(i); - } - } - } - - void Clear() { - list_ = nullptr; - last_ = nullptr; - } - - int length() { - int length = (list_ == nullptr) ? 0 : list_->length(); - return length + ((last_ == nullptr) ? 0 : 1); - } - - ZoneList* GetList(Zone* zone) { - if (list_ == nullptr) { - list_ = zone->New>(initial_size, zone); - } - if (last_ != nullptr) { - list_->Add(last_, zone); - last_ = nullptr; - } - return list_; - } - - private: - ZoneList* list_; - T* last_; -}; - - -// Accumulates RegExp atoms and assertions into lists of terms and alternatives. -class RegExpBuilder : public ZoneObject { - public: - RegExpBuilder(Zone* zone, JSRegExp::Flags flags); - void AddCharacter(uc16 character); - void AddUnicodeCharacter(uc32 character); - void AddEscapedUnicodeCharacter(uc32 character); - // "Adds" an empty expression. Does nothing except consume a - // following quantifier - void AddEmpty(); - void AddCharacterClass(RegExpCharacterClass* cc); - void AddCharacterClassForDesugaring(uc32 c); - void AddAtom(RegExpTree* tree); - void AddTerm(RegExpTree* tree); - void AddAssertion(RegExpTree* tree); - void NewAlternative(); // '|' - bool AddQuantifierToAtom(int min, int max, - RegExpQuantifier::QuantifierType type); - void FlushText(); - RegExpTree* ToRegExp(); - JSRegExp::Flags flags() const { return flags_; } - void set_flags(JSRegExp::Flags flags) { flags_ = flags; } - - bool ignore_case() const { return (flags_ & JSRegExp::kIgnoreCase) != 0; } - bool multiline() const { return (flags_ & JSRegExp::kMultiline) != 0; } - bool dotall() const { return (flags_ & JSRegExp::kDotAll) != 0; } - - private: - static const uc16 kNoPendingSurrogate = 0; - void AddLeadSurrogate(uc16 lead_surrogate); - void AddTrailSurrogate(uc16 trail_surrogate); - void FlushPendingSurrogate(); - void FlushCharacters(); - void FlushTerms(); - bool NeedsDesugaringForUnicode(RegExpCharacterClass* cc); - bool NeedsDesugaringForIgnoreCase(uc32 c); - Zone* zone() const { return zone_; } - bool unicode() const { return (flags_ & JSRegExp::kUnicode) != 0; } - - Zone* zone_; - bool pending_empty_; - JSRegExp::Flags flags_; - ZoneList* characters_; - uc16 pending_surrogate_; - BufferedZoneList terms_; - BufferedZoneList text_; - BufferedZoneList alternatives_; -#ifdef DEBUG - enum { ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM } last_added_; -#define LAST(x) last_added_ = x; -#else -#define LAST(x) -#endif -}; - -class V8_EXPORT_PRIVATE RegExpParser { - public: - RegExpParser(FlatStringReader* in, JSRegExp::Flags flags, Isolate* isolate, - Zone* zone); - - static bool ParseRegExp(Isolate* isolate, Zone* zone, FlatStringReader* input, - JSRegExp::Flags flags, RegExpCompileData* result); - - // Used by the SpiderMonkey embedding of irregexp. - static bool VerifyRegExpSyntax(Isolate* isolate, Zone* zone, - FlatStringReader* input, JSRegExp::Flags flags, - RegExpCompileData* result, - const DisallowGarbageCollection& nogc); - - private: - bool Parse(RegExpCompileData* result, const DisallowGarbageCollection&); - - RegExpTree* ParsePattern(); - RegExpTree* ParseDisjunction(); - RegExpTree* ParseGroup(); - - // Parses a {...,...} quantifier and stores the range in the given - // out parameters. - bool ParseIntervalQuantifier(int* min_out, int* max_out); - - // Parses and returns a single escaped character. The character - // must not be 'b' or 'B' since they are usually handle specially. - uc32 ParseClassCharacterEscape(); - - // Checks whether the following is a length-digit hexadecimal number, - // and sets the value if it is. - bool ParseHexEscape(int length, uc32* value); - bool ParseUnicodeEscape(uc32* value); - bool ParseUnlimitedLengthHexNumber(int max_value, uc32* value); - - bool ParsePropertyClassName(ZoneVector* name_1, - ZoneVector* name_2); - bool AddPropertyClassRange(ZoneList* add_to, bool negate, - const ZoneVector& name_1, - const ZoneVector& name_2); - - RegExpTree* GetPropertySequence(const ZoneVector& name_1); - RegExpTree* ParseCharacterClass(const RegExpBuilder* state); - - uc32 ParseOctalLiteral(); - - // Tries to parse the input as a back reference. If successful it - // stores the result in the output parameter and returns true. If - // it fails it will push back the characters read so the same characters - // can be reparsed. - bool ParseBackReferenceIndex(int* index_out); - - // Parse inside a class. Either add escaped class to the range, or return - // false and pass parsed single character through |char_out|. - void ParseClassEscape(ZoneList* ranges, Zone* zone, - bool add_unicode_case_equivalents, uc32* char_out, - bool* is_class_escape); - - char ParseClassEscape(); - - RegExpTree* ReportError(RegExpError error); - void Advance(); - void Advance(int dist); - void Reset(int pos); - - // Reports whether the pattern might be used as a literal search string. - // Only use if the result of the parse is a single atom node. - bool simple(); - bool contains_anchor() { return contains_anchor_; } - void set_contains_anchor() { contains_anchor_ = true; } - int captures_started() { return captures_started_; } - int position() { return next_pos_ - 1; } - bool failed() { return failed_; } - // The Unicode flag can't be changed using in-regexp syntax, so it's OK to - // just read the initial flag value here. - bool unicode() const { return (top_level_flags_ & JSRegExp::kUnicode) != 0; } - - static bool IsSyntaxCharacterOrSlash(uc32 c); - - static const uc32 kEndMarker = (1 << 21); - - private: - enum SubexpressionType { - INITIAL, - CAPTURE, // All positive values represent captures. - POSITIVE_LOOKAROUND, - NEGATIVE_LOOKAROUND, - GROUPING - }; - - class RegExpParserState : public ZoneObject { - public: - // Push a state on the stack. - RegExpParserState(RegExpParserState* previous_state, - SubexpressionType group_type, - RegExpLookaround::Type lookaround_type, - int disjunction_capture_index, - const ZoneVector* capture_name, - JSRegExp::Flags flags, Zone* zone) - : previous_state_(previous_state), - builder_(zone->New(zone, flags)), - group_type_(group_type), - lookaround_type_(lookaround_type), - disjunction_capture_index_(disjunction_capture_index), - capture_name_(capture_name) {} - // Parser state of containing expression, if any. - RegExpParserState* previous_state() const { return previous_state_; } - bool IsSubexpression() { return previous_state_ != nullptr; } - // RegExpBuilder building this regexp's AST. - RegExpBuilder* builder() const { return builder_; } - // Type of regexp being parsed (parenthesized group or entire regexp). - SubexpressionType group_type() const { return group_type_; } - // Lookahead or Lookbehind. - RegExpLookaround::Type lookaround_type() const { return lookaround_type_; } - // Index in captures array of first capture in this sub-expression, if any. - // Also the capture index of this sub-expression itself, if group_type - // is CAPTURE. - int capture_index() const { return disjunction_capture_index_; } - // The name of the current sub-expression, if group_type is CAPTURE. Only - // used for named captures. - const ZoneVector* capture_name() const { return capture_name_; } - - bool IsNamedCapture() const { return capture_name_ != nullptr; } - - // Check whether the parser is inside a capture group with the given index. - bool IsInsideCaptureGroup(int index); - // Check whether the parser is inside a capture group with the given name. - bool IsInsideCaptureGroup(const ZoneVector* name); - - private: - // Linked list implementation of stack of states. - RegExpParserState* const previous_state_; - // Builder for the stored disjunction. - RegExpBuilder* const builder_; - // Stored disjunction type (capture, look-ahead or grouping), if any. - const SubexpressionType group_type_; - // Stored read direction. - const RegExpLookaround::Type lookaround_type_; - // Stored disjunction's capture index (if any). - const int disjunction_capture_index_; - // Stored capture name (if any). - const ZoneVector* const capture_name_; - }; - - // Return the 1-indexed RegExpCapture object, allocate if necessary. - RegExpCapture* GetCapture(int index); - - // Creates a new named capture at the specified index. Must be called exactly - // once for each named capture. Fails if a capture with the same name is - // encountered. - bool CreateNamedCaptureAtIndex(const ZoneVector* name, int index); - - // Parses the name of a capture group (?pattern). The name must adhere - // to IdentifierName in the ECMAScript standard. - const ZoneVector* ParseCaptureGroupName(); - - bool ParseNamedBackReference(RegExpBuilder* builder, - RegExpParserState* state); - RegExpParserState* ParseOpenParenthesis(RegExpParserState* state); - - // After the initial parsing pass, patch corresponding RegExpCapture objects - // into all RegExpBackReferences. This is done after initial parsing in order - // to avoid complicating cases in which references comes before the capture. - void PatchNamedBackReferences(); - - Handle CreateCaptureNameMap(); - - // Returns true iff the pattern contains named captures. May call - // ScanForCaptures to look ahead at the remaining pattern. - bool HasNamedCaptures(); - - Isolate* isolate() { return isolate_; } - Zone* zone() const { return zone_; } - - uc32 current() { return current_; } - bool has_more() { return has_more_; } - bool has_next() { return next_pos_ < in()->length(); } - uc32 Next(); - template - uc32 ReadNext(); - FlatStringReader* in() { return in_; } - void ScanForCaptures(); - - struct RegExpCaptureNameLess { - bool operator()(const RegExpCapture* lhs, const RegExpCapture* rhs) const { - DCHECK_NOT_NULL(lhs); - DCHECK_NOT_NULL(rhs); - return *lhs->name() < *rhs->name(); - } - }; - - Isolate* isolate_; - Zone* zone_; - RegExpError error_ = RegExpError::kNone; - int error_pos_ = 0; - ZoneList* captures_; - ZoneSet* named_captures_; - ZoneList* named_back_references_; - FlatStringReader* in_; - uc32 current_; - // These are the flags specified outside the regexp syntax ie after the - // terminating '/' or in the second argument to the constructor. The current - // flags are stored on the RegExpBuilder. - JSRegExp::Flags top_level_flags_; - int next_pos_; - int captures_started_; - int capture_count_; // Only valid after we have scanned for captures. - bool has_more_; - bool simple_; - bool contains_anchor_; - bool is_scanned_for_captures_; - bool has_named_captures_; // Only valid after we have scanned for captures. - bool failed_; + template + static bool VerifyRegExpSyntax(Zone* zone, uintptr_t stack_limit, + const CharT* input, int input_length, + RegExpFlags flags, RegExpCompileData* result, + const DisallowGarbageCollection& no_gc); }; } // namespace internal diff --git a/js/src/irregexp/imported/regexp-stack.cc b/js/src/irregexp/imported/regexp-stack.cc index 8fb8d2396fa6..ad0aedc67a7d 100644 --- a/js/src/irregexp/imported/regexp-stack.cc +++ b/js/src/irregexp/imported/regexp-stack.cc @@ -9,23 +9,17 @@ namespace v8 { namespace internal { RegExpStackScope::RegExpStackScope(Isolate* isolate) - : regexp_stack_(isolate->regexp_stack()) { + : regexp_stack_(isolate->regexp_stack()), + old_sp_top_delta_(regexp_stack_->sp_top_delta()) { DCHECK(regexp_stack_->IsValid()); - // Irregexp is not reentrant in several ways; in particular, the - // RegExpStackScope is not reentrant since the destructor frees allocated - // memory. Protect against reentrancy here. - CHECK(!regexp_stack_->is_in_use()); - regexp_stack_->set_is_in_use(true); } - RegExpStackScope::~RegExpStackScope() { - // Reset the buffer if it has grown. - regexp_stack_->Reset(); - DCHECK(!regexp_stack_->is_in_use()); + CHECK_EQ(old_sp_top_delta_, regexp_stack_->sp_top_delta()); + regexp_stack_->ResetIfEmpty(); } -RegExpStack::RegExpStack() : thread_local_(this), isolate_(nullptr) {} +RegExpStack::RegExpStack() : thread_local_(this) {} RegExpStack::~RegExpStack() { thread_local_.FreeAndInvalidate(); } @@ -50,18 +44,16 @@ char* RegExpStack::RestoreStack(char* from) { return from + kThreadLocalSize; } -void RegExpStack::Reset() { thread_local_.ResetToStaticStack(this); } - void RegExpStack::ThreadLocal::ResetToStaticStack(RegExpStack* regexp_stack) { if (owns_memory_) DeleteArray(memory_); memory_ = regexp_stack->static_stack_; memory_top_ = regexp_stack->static_stack_ + kStaticStackSize; memory_size_ = kStaticStackSize; + stack_pointer_ = memory_top_; limit_ = reinterpret_cast
(regexp_stack->static_stack_) + kStackLimitSlack * kSystemPointerSize; owns_memory_ = false; - is_in_use_ = false; } void RegExpStack::ThreadLocal::FreeAndInvalidate() { @@ -72,6 +64,7 @@ void RegExpStack::ThreadLocal::FreeAndInvalidate() { memory_ = nullptr; memory_top_ = nullptr; memory_size_ = 0; + stack_pointer_ = nullptr; limit_ = kMemoryTop; } @@ -86,9 +79,11 @@ Address RegExpStack::EnsureCapacity(size_t size) { thread_local_.memory_, thread_local_.memory_size_); if (thread_local_.owns_memory_) DeleteArray(thread_local_.memory_); } + ptrdiff_t delta = sp_top_delta(); thread_local_.memory_ = new_memory; thread_local_.memory_top_ = new_memory + size; thread_local_.memory_size_ = size; + thread_local_.stack_pointer_ = thread_local_.memory_top_ + delta; thread_local_.limit_ = reinterpret_cast
(new_memory) + kStackLimitSlack * kSystemPointerSize; thread_local_.owns_memory_ = true; diff --git a/js/src/irregexp/imported/regexp-stack.h b/js/src/irregexp/imported/regexp-stack.h index e42ac16d8329..f03898bb0095 100644 --- a/js/src/irregexp/imported/regexp-stack.h +++ b/js/src/irregexp/imported/regexp-stack.h @@ -14,10 +14,7 @@ class RegExpStack; // Maintains a per-v8thread stack area that can be used by irregexp // implementation for its backtracking stack. -// Since there is only one stack area, the Irregexp implementation is not -// re-entrant. I.e., no regular expressions may be executed in the same thread -// during a preempted Irregexp execution. -class V8_NODISCARD RegExpStackScope { +class V8_NODISCARD RegExpStackScope final { public: // Create and delete an instance to control the life-time of a growing stack. @@ -30,46 +27,45 @@ class V8_NODISCARD RegExpStackScope { RegExpStack* stack() const { return regexp_stack_; } private: - RegExpStack* regexp_stack_; + RegExpStack* const regexp_stack_; + const ptrdiff_t old_sp_top_delta_; }; -class RegExpStack { +class RegExpStack final { public: RegExpStack(); ~RegExpStack(); RegExpStack(const RegExpStack&) = delete; RegExpStack& operator=(const RegExpStack&) = delete; - // Number of allocated locations on the stack below the limit. - // No sequence of pushes must be longer that this without doing a stack-limit - // check. + // Number of allocated locations on the stack below the limit. No sequence of + // pushes must be longer than this without doing a stack-limit check. static constexpr int kStackLimitSlack = 32; - // Gives the top of the memory used as stack. - Address stack_base() { + Address memory_top() const { DCHECK_NE(0, thread_local_.memory_size_); DCHECK_EQ(thread_local_.memory_top_, thread_local_.memory_ + thread_local_.memory_size_); return reinterpret_cast
(thread_local_.memory_top_); } - // The total size of the memory allocated for the stack. - size_t stack_capacity() { return thread_local_.memory_size_; } + Address stack_pointer() const { + return reinterpret_cast
(thread_local_.stack_pointer_); + } + + size_t memory_size() const { return thread_local_.memory_size_; } // If the stack pointer gets below the limit, we should react and // either grow the stack or report an out-of-stack exception. // There is only a limited number of locations below the stack limit, // so users of the stack should check the stack limit during any // sequence of pushes longer that this. - Address* limit_address_address() { return &(thread_local_.limit_); } + Address* limit_address_address() { return &thread_local_.limit_; } // Ensures that there is a memory area with at least the specified size. // If passing zero, the default/minimum size buffer is allocated. Address EnsureCapacity(size_t size); - bool is_in_use() const { return thread_local_.is_in_use_; } - void set_is_in_use(bool v) { thread_local_.is_in_use_ = v; } - // Thread local archiving. static constexpr int ArchiveSpacePerThread() { return static_cast(kThreadLocalSize); @@ -99,46 +95,61 @@ class RegExpStack { 2 * kStackLimitSlack * kSystemPointerSize; byte static_stack_[kStaticStackSize] = {0}; - STATIC_ASSERT(kStaticStackSize <= kMaximumStackSize); + static_assert(kStaticStackSize <= kMaximumStackSize); - // Structure holding the allocated memory, size and limit. + // Structure holding the allocated memory, size and limit. Thread switching + // archives and restores this struct. struct ThreadLocal { explicit ThreadLocal(RegExpStack* regexp_stack) { ResetToStaticStack(regexp_stack); } - // If memory_size_ > 0 then memory_ and memory_top_ must be non-nullptr - // and memory_top_ = memory_ + memory_size_ + // If memory_size_ > 0 then + // - memory_, memory_top_, stack_pointer_ must be non-nullptr + // - memory_top_ = memory_ + memory_size_ + // - memory_ <= stack_pointer_ <= memory_top_ byte* memory_ = nullptr; byte* memory_top_ = nullptr; size_t memory_size_ = 0; + byte* stack_pointer_ = nullptr; Address limit_ = kNullAddress; bool owns_memory_ = false; // Whether memory_ is owned and must be freed. - bool is_in_use_ = false; // To guard against reentrancy. void ResetToStaticStack(RegExpStack* regexp_stack); + void ResetToStaticStackIfEmpty(RegExpStack* regexp_stack) { + if (stack_pointer_ == memory_top_) ResetToStaticStack(regexp_stack); + } void FreeAndInvalidate(); }; static constexpr size_t kThreadLocalSize = sizeof(ThreadLocal); - // Address of top of memory used as stack. Address memory_top_address_address() { return reinterpret_cast
(&thread_local_.memory_top_); } - // Resets the buffer if it has grown beyond the default/minimum size. - // After this, the buffer is either the default size, or it is empty, so - // you have to call EnsureCapacity before using it again. - void Reset(); + Address stack_pointer_address() { + return reinterpret_cast
(&thread_local_.stack_pointer_); + } + + // A position-independent representation of the stack pointer. + ptrdiff_t sp_top_delta() const { + ptrdiff_t result = + reinterpret_cast(thread_local_.stack_pointer_) - + reinterpret_cast(thread_local_.memory_top_); + DCHECK_LE(result, 0); + return result; + } + + // Resets the buffer if it has grown beyond the default/minimum size and is + // empty. + void ResetIfEmpty() { thread_local_.ResetToStaticStackIfEmpty(this); } // Whether the ThreadLocal storage has been invalidated. bool IsValid() const { return thread_local_.memory_ != nullptr; } ThreadLocal thread_local_; - Isolate* isolate_; friend class ExternalReference; - friend class Isolate; friend class RegExpStackScope; }; diff --git a/js/src/irregexp/imported/regexp.h b/js/src/irregexp/imported/regexp.h index 1bb367fe3bea..3a29276e1fc6 100644 --- a/js/src/irregexp/imported/regexp.h +++ b/js/src/irregexp/imported/regexp.h @@ -11,6 +11,9 @@ namespace v8 { namespace internal { +class JSRegExp; +class RegExpCapture; +class RegExpMatchInfo; class RegExpNode; class RegExpTree; @@ -37,9 +40,9 @@ struct RegExpCompileData { // True, iff the pattern is anchored at the start of the string with '^'. bool contains_anchor = false; - // Only use if the pattern contains named captures. If so, this contains a - // mapping of capture names to capture indices. - Handle capture_name_map; + // Only set if the pattern contains named captures. + // Note: the lifetime equals that of the parse/compile zone. + ZoneVector* named_captures = nullptr; // The error message. Only used if an error occurred during parsing or // compilation. @@ -62,9 +65,15 @@ struct RegExpCompileData { class RegExp final : public AllStatic { public: // Whether the irregexp engine generates interpreter bytecode. - static bool CanGenerateBytecode() { - return FLAG_regexp_interpret_all || FLAG_regexp_tier_up; - } + static bool CanGenerateBytecode(); + + // Verify the given pattern, i.e. check that parsing succeeds. If + // verification fails, `regexp_error_out` is set. + template + static bool VerifySyntax(Zone* zone, uintptr_t stack_limit, + const CharT* input, int input_length, + RegExpFlags flags, RegExpError* regexp_error_out, + const DisallowGarbageCollection& no_gc); // Parses the RegExp pattern and prepares the JSRegExp object with // generic data and choice of implementation - as well as what @@ -72,7 +81,7 @@ class RegExp final : public AllStatic { // Returns false if compilation fails. V8_WARN_UNUSED_RESULT static MaybeHandle Compile( Isolate* isolate, Handle re, Handle pattern, - JSRegExp::Flags flags, uint32_t backtrack_limit); + RegExpFlags flags, uint32_t backtrack_limit); // Ensures that a regexp is fully compiled and ready to be executed on a // subject string. Returns true on success. Return false on failure, and @@ -131,12 +140,9 @@ class RegExp final : public AllStatic { Isolate* isolate, Handle last_match_info, Handle subject, int capture_count, int32_t* match); - V8_EXPORT_PRIVATE static bool CompileForTesting(Isolate* isolate, Zone* zone, - RegExpCompileData* input, - JSRegExp::Flags flags, - Handle pattern, - Handle sample_subject, - bool is_one_byte); + V8_EXPORT_PRIVATE static bool CompileForTesting( + Isolate* isolate, Zone* zone, RegExpCompileData* input, RegExpFlags flags, + Handle pattern, Handle sample_subject, bool is_one_byte); V8_EXPORT_PRIVATE static void DotPrintForTesting(const char* label, RegExpNode* node); @@ -152,6 +158,9 @@ class RegExp final : public AllStatic { RegExpError error_text); static bool IsUnmodifiedRegExp(Isolate* isolate, Handle regexp); + + static Handle CreateCaptureNameMap( + Isolate* isolate, ZoneVector* named_captures); }; // Uses a special global mode of irregexp-generated code to perform a global