зеркало из https://github.com/mozilla/gecko-dev.git
Bug 1779849: Re-import irregexp r=mgaudet
This patch was generated by running import-irregexp.py. Depends on D152901 Differential Revision: https://phabricator.services.mozilla.com/D152902
This commit is contained in:
Родитель
9e068493ac
Коммит
a1e55c9f5a
|
@ -1,2 +1,2 @@
|
|||
Imported using import-irregexp.py from:
|
||||
https://github.com/v8/v8/tree/8732b2ee52b567ad4e15ca91d141fd6e27499e99/src/regexp
|
||||
https://github.com/v8/v8/tree/b8fe2724fc25af2c165180b2cd2930b2119ad831/src/regexp
|
||||
|
|
|
@ -12,9 +12,9 @@
|
|||
namespace v8 {
|
||||
namespace internal {
|
||||
|
||||
static const uc32 kSurrogateStart = 0xd800;
|
||||
static const uc32 kSurrogateEnd = 0xdfff;
|
||||
static const uc32 kNonBmpStart = 0x10000;
|
||||
static const base::uc32 kSurrogateStart = 0xd800;
|
||||
static const base::uc32 kSurrogateEnd = 0xdfff;
|
||||
static const base::uc32 kNonBmpStart = 0x10000;
|
||||
|
||||
// The following code generates "src/regexp/special-case.cc".
|
||||
void PrintSet(std::ofstream& out, const char* name,
|
||||
|
|
|
@ -42,7 +42,7 @@ const generateData = (property) => {
|
|||
buffer.push(' ' + codePoints.join(', ') + ', 0,');
|
||||
}
|
||||
const output =
|
||||
`const uc32 UnicodePropertySequences::k${ id }[] = {\n` +
|
||||
`const base::uc32 UnicodePropertySequences::k${ id }[] = {\n` +
|
||||
`${ buffer.join('\n') }\n 0 // null-terminating the list\n};\n`;
|
||||
return output;
|
||||
};
|
||||
|
@ -60,7 +60,7 @@ for (const property of properties) {
|
|||
*/
|
||||
|
||||
// clang-format off
|
||||
const uc32 UnicodePropertySequences::kEmojiFlagSequences[] = {
|
||||
const base::uc32 UnicodePropertySequences::kEmojiFlagSequences[] = {
|
||||
0x01F1E6, 0x01F1E8, 0,
|
||||
0x01F1FF, 0x01F1FC, 0,
|
||||
0x01F1E6, 0x01F1EA, 0,
|
||||
|
@ -322,14 +322,14 @@ const uc32 UnicodePropertySequences::kEmojiFlagSequences[] = {
|
|||
0 // null-terminating the list
|
||||
};
|
||||
|
||||
const uc32 UnicodePropertySequences::kEmojiTagSequences[] = {
|
||||
const base::uc32 UnicodePropertySequences::kEmojiTagSequences[] = {
|
||||
0x01F3F4, 0x0E0067, 0x0E0062, 0x0E0065, 0x0E006E, 0x0E0067, 0x0E007F, 0,
|
||||
0x01F3F4, 0x0E0067, 0x0E0062, 0x0E0073, 0x0E0063, 0x0E0074, 0x0E007F, 0,
|
||||
0x01F3F4, 0x0E0067, 0x0E0062, 0x0E0077, 0x0E006C, 0x0E0073, 0x0E007F, 0,
|
||||
0 // null-terminating the list
|
||||
};
|
||||
|
||||
const uc32 UnicodePropertySequences::kEmojiZWJSequences[] = {
|
||||
const base::uc32 UnicodePropertySequences::kEmojiZWJSequences[] = {
|
||||
0x01F468, 0x00200D, 0x002764, 0x00FE0F, 0x00200D, 0x01F468, 0,
|
||||
0x01F441, 0x00FE0F, 0x00200D, 0x01F5E8, 0x00FE0F, 0,
|
||||
0x01F468, 0x00200D, 0x01F466, 0,
|
||||
|
|
|
@ -14,9 +14,9 @@ namespace internal {
|
|||
|
||||
class UnicodePropertySequences : public AllStatic {
|
||||
public:
|
||||
static const uc32 kEmojiFlagSequences[];
|
||||
static const uc32 kEmojiTagSequences[];
|
||||
static const uc32 kEmojiZWJSequences[];
|
||||
static const base::uc32 kEmojiFlagSequences[];
|
||||
static const base::uc32 kEmojiTagSequences[];
|
||||
static const base::uc32 kEmojiZWJSequences[];
|
||||
};
|
||||
|
||||
} // namespace internal
|
||||
|
|
|
@ -26,14 +26,16 @@ FOR_EACH_REG_EXP_TREE_TYPE(MAKE_TYPE_CASE)
|
|||
FOR_EACH_REG_EXP_TREE_TYPE(MAKE_TYPE_CASE)
|
||||
#undef MAKE_TYPE_CASE
|
||||
|
||||
namespace {
|
||||
|
||||
static Interval ListCaptureRegisters(ZoneList<RegExpTree*>* children) {
|
||||
Interval ListCaptureRegisters(ZoneList<RegExpTree*>* children) {
|
||||
Interval result = Interval::Empty();
|
||||
for (int i = 0; i < children->length(); i++)
|
||||
result = result.Union(children->at(i)->CaptureRegisters());
|
||||
return result;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
Interval RegExpAlternative::CaptureRegisters() {
|
||||
return ListCaptureRegisters(nodes());
|
||||
|
@ -62,12 +64,12 @@ Interval RegExpQuantifier::CaptureRegisters() {
|
|||
|
||||
|
||||
bool RegExpAssertion::IsAnchoredAtStart() {
|
||||
return assertion_type() == RegExpAssertion::START_OF_INPUT;
|
||||
return assertion_type() == RegExpAssertion::Type::START_OF_INPUT;
|
||||
}
|
||||
|
||||
|
||||
bool RegExpAssertion::IsAnchoredAtEnd() {
|
||||
return assertion_type() == RegExpAssertion::END_OF_INPUT;
|
||||
return assertion_type() == RegExpAssertion::Type::END_OF_INPUT;
|
||||
}
|
||||
|
||||
|
||||
|
@ -129,6 +131,7 @@ bool RegExpCapture::IsAnchoredAtStart() { return body()->IsAnchoredAtStart(); }
|
|||
|
||||
bool RegExpCapture::IsAnchoredAtEnd() { return body()->IsAnchoredAtEnd(); }
|
||||
|
||||
namespace {
|
||||
|
||||
// Convert regular expression trees to a simple sexp representation.
|
||||
// This representation should be different from the input grammar
|
||||
|
@ -147,6 +150,7 @@ class RegExpUnparser final : public RegExpVisitor {
|
|||
Zone* zone_;
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
void* RegExpUnparser::VisitDisjunction(RegExpDisjunction* that, void* data) {
|
||||
os_ << "(|";
|
||||
|
@ -193,22 +197,22 @@ void* RegExpUnparser::VisitCharacterClass(RegExpCharacterClass* that,
|
|||
|
||||
void* RegExpUnparser::VisitAssertion(RegExpAssertion* that, void* data) {
|
||||
switch (that->assertion_type()) {
|
||||
case RegExpAssertion::START_OF_INPUT:
|
||||
case RegExpAssertion::Type::START_OF_INPUT:
|
||||
os_ << "@^i";
|
||||
break;
|
||||
case RegExpAssertion::END_OF_INPUT:
|
||||
case RegExpAssertion::Type::END_OF_INPUT:
|
||||
os_ << "@$i";
|
||||
break;
|
||||
case RegExpAssertion::START_OF_LINE:
|
||||
case RegExpAssertion::Type::START_OF_LINE:
|
||||
os_ << "@^l";
|
||||
break;
|
||||
case RegExpAssertion::END_OF_LINE:
|
||||
case RegExpAssertion::Type::END_OF_LINE:
|
||||
os_ << "@$l";
|
||||
break;
|
||||
case RegExpAssertion::BOUNDARY:
|
||||
case RegExpAssertion::Type::BOUNDARY:
|
||||
os_ << "@b";
|
||||
break;
|
||||
case RegExpAssertion::NON_BOUNDARY:
|
||||
case RegExpAssertion::Type::NON_BOUNDARY:
|
||||
os_ << "@B";
|
||||
break;
|
||||
}
|
||||
|
@ -218,7 +222,7 @@ void* RegExpUnparser::VisitAssertion(RegExpAssertion* that, void* data) {
|
|||
|
||||
void* RegExpUnparser::VisitAtom(RegExpAtom* that, void* data) {
|
||||
os_ << "'";
|
||||
Vector<const uc16> chardata = that->data();
|
||||
base::Vector<const base::uc16> chardata = that->data();
|
||||
for (int i = 0; i < chardata.length(); i++) {
|
||||
os_ << AsUC16(chardata[i]);
|
||||
}
|
||||
|
@ -311,8 +315,9 @@ RegExpDisjunction::RegExpDisjunction(ZoneList<RegExpTree*>* alternatives)
|
|||
}
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
static int IncreaseBy(int previous, int increase) {
|
||||
int IncreaseBy(int previous, int increase) {
|
||||
if (RegExpTree::kInfinity - previous < increase) {
|
||||
return RegExpTree::kInfinity;
|
||||
} else {
|
||||
|
@ -320,6 +325,7 @@ static int IncreaseBy(int previous, int increase) {
|
|||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
RegExpAlternative::RegExpAlternative(ZoneList<RegExpTree*>* nodes)
|
||||
: nodes_(nodes) {
|
||||
|
|
|
@ -41,29 +41,25 @@ class RegExpVisitor {
|
|||
#undef MAKE_CASE
|
||||
};
|
||||
|
||||
|
||||
// A simple closed interval.
|
||||
class Interval {
|
||||
public:
|
||||
Interval() : from_(kNone), to_(kNone - 1) {} // '- 1' for branchless size().
|
||||
Interval(int from, int to) : from_(from), to_(to) {}
|
||||
Interval Union(Interval that) {
|
||||
if (that.from_ == kNone)
|
||||
return *this;
|
||||
else if (from_ == kNone)
|
||||
return that;
|
||||
else
|
||||
return Interval(std::min(from_, that.from_), std::max(to_, that.to_));
|
||||
if (that.from_ == kNone) return *this;
|
||||
if (from_ == kNone) return that;
|
||||
return Interval(std::min(from_, that.from_), std::max(to_, that.to_));
|
||||
}
|
||||
|
||||
bool Contains(int value) { return (from_ <= value) && (value <= to_); }
|
||||
bool is_empty() { return from_ == kNone; }
|
||||
static Interval Empty() { return Interval(); }
|
||||
|
||||
bool Contains(int value) const { return (from_ <= value) && (value <= to_); }
|
||||
bool is_empty() const { return from_ == kNone; }
|
||||
int from() const { return from_; }
|
||||
int to() const { return to_; }
|
||||
int size() const { return to_ - from_ + 1; }
|
||||
|
||||
static Interval Empty() { return Interval(); }
|
||||
|
||||
static constexpr int kNone = -1;
|
||||
|
||||
private:
|
||||
|
@ -71,32 +67,39 @@ class Interval {
|
|||
int to_;
|
||||
};
|
||||
|
||||
// Named standard character sets.
|
||||
enum class StandardCharacterSet : char {
|
||||
kWhitespace = 's', // Like /\s/.
|
||||
kNotWhitespace = 'S', // Like /\S/.
|
||||
kWord = 'w', // Like /\w/.
|
||||
kNotWord = 'W', // Like /\W/.
|
||||
kDigit = 'd', // Like /\d/.
|
||||
kNotDigit = 'D', // Like /\D/.
|
||||
kLineTerminator = 'n', // The inverse of /./.
|
||||
kNotLineTerminator = '.', // Like /./.
|
||||
kEverything = '*', // Matches every character, like /./s.
|
||||
};
|
||||
|
||||
// Represents code points (with values up to 0x10FFFF) in the range from from_
|
||||
// to to_, both ends are inclusive.
|
||||
class CharacterRange {
|
||||
public:
|
||||
CharacterRange() : from_(0), to_(0) {}
|
||||
// For compatibility with the CHECK_OK macro
|
||||
CharacterRange() = default;
|
||||
// For compatibility with the CHECK_OK macro.
|
||||
CharacterRange(void* null) { DCHECK_NULL(null); } // NOLINT
|
||||
V8_EXPORT_PRIVATE static void AddClassEscape(char type,
|
||||
ZoneList<CharacterRange>* ranges,
|
||||
Zone* zone);
|
||||
// Add class escapes. Add case equivalent closure for \w and \W if necessary.
|
||||
V8_EXPORT_PRIVATE static void AddClassEscape(
|
||||
char type, ZoneList<CharacterRange>* ranges,
|
||||
bool add_unicode_case_equivalents, Zone* zone);
|
||||
static Vector<const int> GetWordBounds();
|
||||
static inline CharacterRange Singleton(uc32 value) {
|
||||
|
||||
static inline CharacterRange Singleton(base::uc32 value) {
|
||||
return CharacterRange(value, value);
|
||||
}
|
||||
static inline CharacterRange Range(uc32 from, uc32 to) {
|
||||
DCHECK(0 <= from && to <= String::kMaxCodePoint);
|
||||
static inline CharacterRange Range(base::uc32 from, base::uc32 to) {
|
||||
DCHECK(0 <= from && to <= kMaxCodePoint);
|
||||
DCHECK(static_cast<uint32_t>(from) <= static_cast<uint32_t>(to));
|
||||
return CharacterRange(from, to);
|
||||
}
|
||||
static inline CharacterRange Everything() {
|
||||
return CharacterRange(0, String::kMaxCodePoint);
|
||||
return CharacterRange(0, kMaxCodePoint);
|
||||
}
|
||||
|
||||
static inline ZoneList<CharacterRange>* List(Zone* zone,
|
||||
CharacterRange range) {
|
||||
ZoneList<CharacterRange>* list =
|
||||
|
@ -104,17 +107,21 @@ class CharacterRange {
|
|||
list->Add(range, zone);
|
||||
return list;
|
||||
}
|
||||
bool Contains(uc32 i) { return from_ <= i && i <= to_; }
|
||||
uc32 from() const { return from_; }
|
||||
void set_from(uc32 value) { from_ = value; }
|
||||
uc32 to() const { return to_; }
|
||||
void set_to(uc32 value) { to_ = value; }
|
||||
bool is_valid() { return from_ <= to_; }
|
||||
bool IsEverything(uc32 max) { return from_ == 0 && to_ >= max; }
|
||||
bool IsSingleton() { return (from_ == to_); }
|
||||
|
||||
// Add class escapes. Add case equivalent closure for \w and \W if necessary.
|
||||
V8_EXPORT_PRIVATE static void AddClassEscape(
|
||||
StandardCharacterSet standard_character_set,
|
||||
ZoneList<CharacterRange>* ranges, bool add_unicode_case_equivalents,
|
||||
Zone* zone);
|
||||
V8_EXPORT_PRIVATE static void AddCaseEquivalents(
|
||||
Isolate* isolate, Zone* zone, ZoneList<CharacterRange>* ranges,
|
||||
bool is_one_byte);
|
||||
|
||||
bool Contains(base::uc32 i) const { return from_ <= i && i <= to_; }
|
||||
base::uc32 from() const { return from_; }
|
||||
base::uc32 to() const { return to_; }
|
||||
bool IsEverything(base::uc32 max) const { return from_ == 0 && to_ >= max; }
|
||||
bool IsSingleton() const { return from_ == to_; }
|
||||
// Whether a range list is in canonical form: Ranges ordered by from value,
|
||||
// and ranges non-overlapping and non-adjacent.
|
||||
V8_EXPORT_PRIVATE static bool IsCanonical(ZoneList<CharacterRange>* ranges);
|
||||
|
@ -126,35 +133,214 @@ class CharacterRange {
|
|||
// Negate the contents of a character range in canonical form.
|
||||
static void Negate(ZoneList<CharacterRange>* src,
|
||||
ZoneList<CharacterRange>* dst, Zone* zone);
|
||||
static const int kStartMarker = (1 << 24);
|
||||
static const int kPayloadMask = (1 << 24) - 1;
|
||||
|
||||
// Remove all ranges outside the one-byte range.
|
||||
static void ClampToOneByte(ZoneList<CharacterRange>* ranges);
|
||||
|
||||
private:
|
||||
CharacterRange(uc32 from, uc32 to) : from_(from), to_(to) {}
|
||||
CharacterRange(base::uc32 from, base::uc32 to) : from_(from), to_(to) {}
|
||||
|
||||
uc32 from_;
|
||||
uc32 to_;
|
||||
static constexpr int kMaxCodePoint = 0x10ffff;
|
||||
|
||||
base::uc32 from_ = 0;
|
||||
base::uc32 to_ = 0;
|
||||
};
|
||||
|
||||
#define DECL_BOILERPLATE(Name) \
|
||||
void* Accept(RegExpVisitor* visitor, void* data) override; \
|
||||
RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) \
|
||||
override; \
|
||||
RegExp##Name* As##Name() override; \
|
||||
bool Is##Name() override
|
||||
|
||||
class RegExpTree : public ZoneObject {
|
||||
public:
|
||||
static const int kInfinity = kMaxInt;
|
||||
virtual ~RegExpTree() = default;
|
||||
virtual void* Accept(RegExpVisitor* visitor, void* data) = 0;
|
||||
virtual RegExpNode* ToNode(RegExpCompiler* compiler,
|
||||
RegExpNode* on_success) = 0;
|
||||
virtual bool IsTextElement() { return false; }
|
||||
virtual bool IsAnchoredAtStart() { return false; }
|
||||
virtual bool IsAnchoredAtEnd() { return false; }
|
||||
virtual int min_match() = 0;
|
||||
virtual int max_match() = 0;
|
||||
// Returns the interval of registers used for captures within this
|
||||
// expression.
|
||||
virtual Interval CaptureRegisters() { return Interval::Empty(); }
|
||||
virtual void AppendToText(RegExpText* text, Zone* zone);
|
||||
V8_EXPORT_PRIVATE std::ostream& Print(std::ostream& os, Zone* zone);
|
||||
#define MAKE_ASTYPE(Name) \
|
||||
virtual RegExp##Name* As##Name(); \
|
||||
virtual bool Is##Name();
|
||||
FOR_EACH_REG_EXP_TREE_TYPE(MAKE_ASTYPE)
|
||||
#undef MAKE_ASTYPE
|
||||
};
|
||||
|
||||
|
||||
class RegExpDisjunction final : public RegExpTree {
|
||||
public:
|
||||
explicit RegExpDisjunction(ZoneList<RegExpTree*>* alternatives);
|
||||
|
||||
DECL_BOILERPLATE(Disjunction);
|
||||
|
||||
Interval CaptureRegisters() override;
|
||||
bool IsAnchoredAtStart() override;
|
||||
bool IsAnchoredAtEnd() override;
|
||||
int min_match() override { return min_match_; }
|
||||
int max_match() override { return max_match_; }
|
||||
ZoneList<RegExpTree*>* alternatives() const { return alternatives_; }
|
||||
|
||||
private:
|
||||
bool SortConsecutiveAtoms(RegExpCompiler* compiler);
|
||||
void RationalizeConsecutiveAtoms(RegExpCompiler* compiler);
|
||||
void FixSingleCharacterDisjunctions(RegExpCompiler* compiler);
|
||||
ZoneList<RegExpTree*>* alternatives_;
|
||||
int min_match_;
|
||||
int max_match_;
|
||||
};
|
||||
|
||||
|
||||
class RegExpAlternative final : public RegExpTree {
|
||||
public:
|
||||
explicit RegExpAlternative(ZoneList<RegExpTree*>* nodes);
|
||||
|
||||
DECL_BOILERPLATE(Alternative);
|
||||
|
||||
Interval CaptureRegisters() override;
|
||||
bool IsAnchoredAtStart() override;
|
||||
bool IsAnchoredAtEnd() override;
|
||||
int min_match() override { return min_match_; }
|
||||
int max_match() override { return max_match_; }
|
||||
ZoneList<RegExpTree*>* nodes() const { return nodes_; }
|
||||
|
||||
private:
|
||||
ZoneList<RegExpTree*>* nodes_;
|
||||
int min_match_;
|
||||
int max_match_;
|
||||
};
|
||||
|
||||
|
||||
class RegExpAssertion final : public RegExpTree {
|
||||
public:
|
||||
enum class Type {
|
||||
START_OF_LINE = 0,
|
||||
START_OF_INPUT = 1,
|
||||
END_OF_LINE = 2,
|
||||
END_OF_INPUT = 3,
|
||||
BOUNDARY = 4,
|
||||
NON_BOUNDARY = 5,
|
||||
LAST_ASSERTION_TYPE = NON_BOUNDARY,
|
||||
};
|
||||
explicit RegExpAssertion(Type type) : assertion_type_(type) {}
|
||||
|
||||
DECL_BOILERPLATE(Assertion);
|
||||
|
||||
bool IsAnchoredAtStart() override;
|
||||
bool IsAnchoredAtEnd() override;
|
||||
int min_match() override { return 0; }
|
||||
int max_match() override { return 0; }
|
||||
Type assertion_type() const { return assertion_type_; }
|
||||
|
||||
private:
|
||||
const Type assertion_type_;
|
||||
};
|
||||
|
||||
class CharacterSet final {
|
||||
public:
|
||||
explicit CharacterSet(uc16 standard_set_type)
|
||||
: ranges_(nullptr), standard_set_type_(standard_set_type) {}
|
||||
explicit CharacterSet(ZoneList<CharacterRange>* ranges)
|
||||
: ranges_(ranges), standard_set_type_(0) {}
|
||||
explicit CharacterSet(StandardCharacterSet standard_set_type)
|
||||
: standard_set_type_(standard_set_type) {}
|
||||
explicit CharacterSet(ZoneList<CharacterRange>* ranges) : ranges_(ranges) {}
|
||||
|
||||
ZoneList<CharacterRange>* ranges(Zone* zone);
|
||||
uc16 standard_set_type() const { return standard_set_type_; }
|
||||
void set_standard_set_type(uc16 special_set_type) {
|
||||
standard_set_type_ = special_set_type;
|
||||
StandardCharacterSet standard_set_type() const {
|
||||
return standard_set_type_.value();
|
||||
}
|
||||
bool is_standard() { return standard_set_type_ != 0; }
|
||||
void set_standard_set_type(StandardCharacterSet standard_set_type) {
|
||||
standard_set_type_ = standard_set_type;
|
||||
}
|
||||
bool is_standard() const { return standard_set_type_.has_value(); }
|
||||
V8_EXPORT_PRIVATE void Canonicalize();
|
||||
|
||||
private:
|
||||
ZoneList<CharacterRange>* ranges_;
|
||||
// If non-zero, the value represents a standard set (e.g., all whitespace
|
||||
// characters) without having to expand the ranges.
|
||||
uc16 standard_set_type_;
|
||||
ZoneList<CharacterRange>* ranges_ = nullptr;
|
||||
base::Optional<StandardCharacterSet> standard_set_type_;
|
||||
};
|
||||
|
||||
class RegExpCharacterClass final : public RegExpTree {
|
||||
public:
|
||||
// NEGATED: The character class is negated and should match everything but
|
||||
// the specified ranges.
|
||||
// CONTAINS_SPLIT_SURROGATE: The character class contains part of a split
|
||||
// surrogate and should not be unicode-desugared (crbug.com/641091).
|
||||
enum Flag {
|
||||
NEGATED = 1 << 0,
|
||||
CONTAINS_SPLIT_SURROGATE = 1 << 1,
|
||||
};
|
||||
using CharacterClassFlags = base::Flags<Flag>;
|
||||
|
||||
RegExpCharacterClass(
|
||||
Zone* zone, ZoneList<CharacterRange>* ranges,
|
||||
CharacterClassFlags character_class_flags = CharacterClassFlags())
|
||||
: set_(ranges), character_class_flags_(character_class_flags) {
|
||||
// Convert the empty set of ranges to the negated Everything() range.
|
||||
if (ranges->is_empty()) {
|
||||
ranges->Add(CharacterRange::Everything(), zone);
|
||||
character_class_flags_ ^= NEGATED;
|
||||
}
|
||||
}
|
||||
explicit RegExpCharacterClass(StandardCharacterSet standard_set_type)
|
||||
: set_(standard_set_type), character_class_flags_() {}
|
||||
|
||||
DECL_BOILERPLATE(CharacterClass);
|
||||
|
||||
bool IsTextElement() override { return true; }
|
||||
int min_match() override { return 1; }
|
||||
// The character class may match two code units for unicode regexps.
|
||||
// TODO(yangguo): we should split this class for usage in TextElement, and
|
||||
// make max_match() dependent on the character class content.
|
||||
int max_match() override { return 2; }
|
||||
|
||||
void AppendToText(RegExpText* text, Zone* zone) override;
|
||||
|
||||
// TODO(lrn): Remove need for complex version if is_standard that
|
||||
// recognizes a mangled standard set and just do { return set_.is_special(); }
|
||||
bool is_standard(Zone* zone);
|
||||
// Returns a value representing the standard character set if is_standard()
|
||||
// returns true.
|
||||
StandardCharacterSet standard_type() const {
|
||||
return set_.standard_set_type();
|
||||
}
|
||||
|
||||
CharacterSet character_set() const { return set_; }
|
||||
ZoneList<CharacterRange>* ranges(Zone* zone) { return set_.ranges(zone); }
|
||||
|
||||
bool is_negated() const { return (character_class_flags_ & NEGATED) != 0; }
|
||||
bool contains_split_surrogate() const {
|
||||
return (character_class_flags_ & CONTAINS_SPLIT_SURROGATE) != 0;
|
||||
}
|
||||
|
||||
private:
|
||||
CharacterSet set_;
|
||||
CharacterClassFlags character_class_flags_;
|
||||
};
|
||||
|
||||
class RegExpAtom final : public RegExpTree {
|
||||
public:
|
||||
explicit RegExpAtom(base::Vector<const base::uc16> data) : data_(data) {}
|
||||
|
||||
DECL_BOILERPLATE(Atom);
|
||||
|
||||
bool IsTextElement() override { return true; }
|
||||
int min_match() override { return data_.length(); }
|
||||
int max_match() override { return data_.length(); }
|
||||
void AppendToText(RegExpText* text, Zone* zone) override;
|
||||
|
||||
base::Vector<const base::uc16> data() const { return data_; }
|
||||
int length() const { return data_.length(); }
|
||||
|
||||
private:
|
||||
base::Vector<const base::uc16> data_;
|
||||
};
|
||||
|
||||
class TextElement final {
|
||||
|
@ -191,206 +377,12 @@ class TextElement final {
|
|||
RegExpTree* tree_;
|
||||
};
|
||||
|
||||
|
||||
class RegExpTree : public ZoneObject {
|
||||
public:
|
||||
static const int kInfinity = kMaxInt;
|
||||
virtual ~RegExpTree() = default;
|
||||
virtual void* Accept(RegExpVisitor* visitor, void* data) = 0;
|
||||
virtual RegExpNode* ToNode(RegExpCompiler* compiler,
|
||||
RegExpNode* on_success) = 0;
|
||||
virtual bool IsTextElement() { return false; }
|
||||
virtual bool IsAnchoredAtStart() { return false; }
|
||||
virtual bool IsAnchoredAtEnd() { return false; }
|
||||
virtual int min_match() = 0;
|
||||
virtual int max_match() = 0;
|
||||
// Returns the interval of registers used for captures within this
|
||||
// expression.
|
||||
virtual Interval CaptureRegisters() { return Interval::Empty(); }
|
||||
virtual void AppendToText(RegExpText* text, Zone* zone);
|
||||
V8_EXPORT_PRIVATE std::ostream& Print(std::ostream& os, Zone* zone);
|
||||
#define MAKE_ASTYPE(Name) \
|
||||
virtual RegExp##Name* As##Name(); \
|
||||
virtual bool Is##Name();
|
||||
FOR_EACH_REG_EXP_TREE_TYPE(MAKE_ASTYPE)
|
||||
#undef MAKE_ASTYPE
|
||||
};
|
||||
|
||||
|
||||
class RegExpDisjunction final : public RegExpTree {
|
||||
public:
|
||||
explicit RegExpDisjunction(ZoneList<RegExpTree*>* alternatives);
|
||||
void* Accept(RegExpVisitor* visitor, void* data) override;
|
||||
RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
|
||||
RegExpDisjunction* AsDisjunction() override;
|
||||
Interval CaptureRegisters() override;
|
||||
bool IsDisjunction() override;
|
||||
bool IsAnchoredAtStart() override;
|
||||
bool IsAnchoredAtEnd() override;
|
||||
int min_match() override { return min_match_; }
|
||||
int max_match() override { return max_match_; }
|
||||
ZoneList<RegExpTree*>* alternatives() { return alternatives_; }
|
||||
|
||||
private:
|
||||
bool SortConsecutiveAtoms(RegExpCompiler* compiler);
|
||||
void RationalizeConsecutiveAtoms(RegExpCompiler* compiler);
|
||||
void FixSingleCharacterDisjunctions(RegExpCompiler* compiler);
|
||||
ZoneList<RegExpTree*>* alternatives_;
|
||||
int min_match_;
|
||||
int max_match_;
|
||||
};
|
||||
|
||||
|
||||
class RegExpAlternative final : public RegExpTree {
|
||||
public:
|
||||
explicit RegExpAlternative(ZoneList<RegExpTree*>* nodes);
|
||||
void* Accept(RegExpVisitor* visitor, void* data) override;
|
||||
RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
|
||||
RegExpAlternative* AsAlternative() override;
|
||||
Interval CaptureRegisters() override;
|
||||
bool IsAlternative() override;
|
||||
bool IsAnchoredAtStart() override;
|
||||
bool IsAnchoredAtEnd() override;
|
||||
int min_match() override { return min_match_; }
|
||||
int max_match() override { return max_match_; }
|
||||
ZoneList<RegExpTree*>* nodes() { return nodes_; }
|
||||
|
||||
private:
|
||||
ZoneList<RegExpTree*>* nodes_;
|
||||
int min_match_;
|
||||
int max_match_;
|
||||
};
|
||||
|
||||
|
||||
class RegExpAssertion final : public RegExpTree {
|
||||
public:
|
||||
enum AssertionType {
|
||||
START_OF_LINE = 0,
|
||||
START_OF_INPUT = 1,
|
||||
END_OF_LINE = 2,
|
||||
END_OF_INPUT = 3,
|
||||
BOUNDARY = 4,
|
||||
NON_BOUNDARY = 5,
|
||||
LAST_TYPE = NON_BOUNDARY,
|
||||
};
|
||||
RegExpAssertion(AssertionType type, JSRegExp::Flags flags)
|
||||
: assertion_type_(type), flags_(flags) {}
|
||||
void* Accept(RegExpVisitor* visitor, void* data) override;
|
||||
RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
|
||||
RegExpAssertion* AsAssertion() override;
|
||||
bool IsAssertion() override;
|
||||
bool IsAnchoredAtStart() override;
|
||||
bool IsAnchoredAtEnd() override;
|
||||
int min_match() override { return 0; }
|
||||
int max_match() override { return 0; }
|
||||
AssertionType assertion_type() const { return assertion_type_; }
|
||||
JSRegExp::Flags flags() const { return flags_; }
|
||||
|
||||
private:
|
||||
const AssertionType assertion_type_;
|
||||
const JSRegExp::Flags flags_;
|
||||
};
|
||||
|
||||
|
||||
class RegExpCharacterClass final : public RegExpTree {
|
||||
public:
|
||||
// NEGATED: The character class is negated and should match everything but
|
||||
// the specified ranges.
|
||||
// CONTAINS_SPLIT_SURROGATE: The character class contains part of a split
|
||||
// surrogate and should not be unicode-desugared (crbug.com/641091).
|
||||
enum Flag {
|
||||
NEGATED = 1 << 0,
|
||||
CONTAINS_SPLIT_SURROGATE = 1 << 1,
|
||||
};
|
||||
using CharacterClassFlags = base::Flags<Flag>;
|
||||
|
||||
RegExpCharacterClass(
|
||||
Zone* zone, ZoneList<CharacterRange>* ranges, JSRegExp::Flags flags,
|
||||
CharacterClassFlags character_class_flags = CharacterClassFlags())
|
||||
: set_(ranges),
|
||||
flags_(flags),
|
||||
character_class_flags_(character_class_flags) {
|
||||
// Convert the empty set of ranges to the negated Everything() range.
|
||||
if (ranges->is_empty()) {
|
||||
ranges->Add(CharacterRange::Everything(), zone);
|
||||
character_class_flags_ ^= NEGATED;
|
||||
}
|
||||
}
|
||||
RegExpCharacterClass(uc16 type, JSRegExp::Flags flags)
|
||||
: set_(type),
|
||||
flags_(flags),
|
||||
character_class_flags_(CharacterClassFlags()) {}
|
||||
void* Accept(RegExpVisitor* visitor, void* data) override;
|
||||
RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
|
||||
RegExpCharacterClass* AsCharacterClass() override;
|
||||
bool IsCharacterClass() override;
|
||||
bool IsTextElement() override { return true; }
|
||||
int min_match() override { return 1; }
|
||||
// The character class may match two code units for unicode regexps.
|
||||
// TODO(yangguo): we should split this class for usage in TextElement, and
|
||||
// make max_match() dependent on the character class content.
|
||||
int max_match() override { return 2; }
|
||||
void AppendToText(RegExpText* text, Zone* zone) override;
|
||||
CharacterSet character_set() { return set_; }
|
||||
// TODO(lrn): Remove need for complex version if is_standard that
|
||||
// recognizes a mangled standard set and just do { return set_.is_special(); }
|
||||
bool is_standard(Zone* zone);
|
||||
// Returns a value representing the standard character set if is_standard()
|
||||
// returns true.
|
||||
// Currently used values are:
|
||||
// s : unicode whitespace
|
||||
// S : unicode non-whitespace
|
||||
// w : ASCII word character (digit, letter, underscore)
|
||||
// W : non-ASCII word character
|
||||
// d : ASCII digit
|
||||
// D : non-ASCII digit
|
||||
// . : non-newline
|
||||
// * : All characters, for advancing unanchored regexp
|
||||
uc16 standard_type() const { return set_.standard_set_type(); }
|
||||
ZoneList<CharacterRange>* ranges(Zone* zone) { return set_.ranges(zone); }
|
||||
bool is_negated() const { return (character_class_flags_ & NEGATED) != 0; }
|
||||
JSRegExp::Flags flags() const { return flags_; }
|
||||
bool contains_split_surrogate() const {
|
||||
return (character_class_flags_ & CONTAINS_SPLIT_SURROGATE) != 0;
|
||||
}
|
||||
|
||||
private:
|
||||
CharacterSet set_;
|
||||
const JSRegExp::Flags flags_;
|
||||
CharacterClassFlags character_class_flags_;
|
||||
};
|
||||
|
||||
|
||||
class RegExpAtom final : public RegExpTree {
|
||||
public:
|
||||
explicit RegExpAtom(Vector<const uc16> data, JSRegExp::Flags flags)
|
||||
: data_(data), flags_(flags) {}
|
||||
void* Accept(RegExpVisitor* visitor, void* data) override;
|
||||
RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
|
||||
RegExpAtom* AsAtom() override;
|
||||
bool IsAtom() override;
|
||||
bool IsTextElement() override { return true; }
|
||||
int min_match() override { return data_.length(); }
|
||||
int max_match() override { return data_.length(); }
|
||||
void AppendToText(RegExpText* text, Zone* zone) override;
|
||||
Vector<const uc16> data() { return data_; }
|
||||
int length() { return data_.length(); }
|
||||
JSRegExp::Flags flags() const { return flags_; }
|
||||
bool ignore_case() const { return (flags_ & JSRegExp::kIgnoreCase) != 0; }
|
||||
|
||||
private:
|
||||
Vector<const uc16> data_;
|
||||
const JSRegExp::Flags flags_;
|
||||
};
|
||||
|
||||
|
||||
class RegExpText final : public RegExpTree {
|
||||
public:
|
||||
explicit RegExpText(Zone* zone) : elements_(2, zone), length_(0) {}
|
||||
void* Accept(RegExpVisitor* visitor, void* data) override;
|
||||
RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
|
||||
RegExpText* AsText() override;
|
||||
bool IsText() override;
|
||||
explicit RegExpText(Zone* zone) : elements_(2, zone) {}
|
||||
|
||||
DECL_BOILERPLATE(Text);
|
||||
|
||||
bool IsTextElement() override { return true; }
|
||||
int min_match() override { return length_; }
|
||||
int max_match() override { return length_; }
|
||||
|
@ -403,7 +395,7 @@ class RegExpText final : public RegExpTree {
|
|||
|
||||
private:
|
||||
ZoneList<TextElement> elements_;
|
||||
int length_;
|
||||
int length_ = 0;
|
||||
};
|
||||
|
||||
|
||||
|
@ -426,23 +418,22 @@ class RegExpQuantifier final : public RegExpTree {
|
|||
max_match_ = max * body->max_match();
|
||||
}
|
||||
}
|
||||
void* Accept(RegExpVisitor* visitor, void* data) override;
|
||||
RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
|
||||
|
||||
DECL_BOILERPLATE(Quantifier);
|
||||
|
||||
static RegExpNode* ToNode(int min, int max, bool is_greedy, RegExpTree* body,
|
||||
RegExpCompiler* compiler, RegExpNode* on_success,
|
||||
bool not_at_start = false);
|
||||
RegExpQuantifier* AsQuantifier() override;
|
||||
Interval CaptureRegisters() override;
|
||||
bool IsQuantifier() override;
|
||||
int min_match() override { return min_match_; }
|
||||
int max_match() override { return max_match_; }
|
||||
int min() const { return min_; }
|
||||
int max() const { return max_; }
|
||||
QuantifierType quantifier_type() const { return quantifier_type_; }
|
||||
bool is_possessive() const { return quantifier_type_ == POSSESSIVE; }
|
||||
bool is_non_greedy() { return quantifier_type_ == NON_GREEDY; }
|
||||
bool is_non_greedy() const { return quantifier_type_ == NON_GREEDY; }
|
||||
bool is_greedy() const { return quantifier_type_ == GREEDY; }
|
||||
RegExpTree* body() { return body_; }
|
||||
RegExpTree* body() const { return body_; }
|
||||
|
||||
private:
|
||||
RegExpTree* body_;
|
||||
|
@ -462,15 +453,14 @@ class RegExpCapture final : public RegExpTree {
|
|||
min_match_(0),
|
||||
max_match_(0),
|
||||
name_(nullptr) {}
|
||||
void* Accept(RegExpVisitor* visitor, void* data) override;
|
||||
RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
|
||||
|
||||
DECL_BOILERPLATE(Capture);
|
||||
|
||||
static RegExpNode* ToNode(RegExpTree* body, int index,
|
||||
RegExpCompiler* compiler, RegExpNode* on_success);
|
||||
RegExpCapture* AsCapture() override;
|
||||
bool IsAnchoredAtStart() override;
|
||||
bool IsAnchoredAtEnd() override;
|
||||
Interval CaptureRegisters() override;
|
||||
bool IsCapture() override;
|
||||
int min_match() override { return min_match_; }
|
||||
int max_match() override { return max_match_; }
|
||||
RegExpTree* body() { return body_; }
|
||||
|
@ -480,17 +470,17 @@ class RegExpCapture final : public RegExpTree {
|
|||
max_match_ = body->max_match();
|
||||
}
|
||||
int index() const { return index_; }
|
||||
const ZoneVector<uc16>* name() const { return name_; }
|
||||
void set_name(const ZoneVector<uc16>* name) { name_ = name; }
|
||||
const ZoneVector<base::uc16>* name() const { return name_; }
|
||||
void set_name(const ZoneVector<base::uc16>* name) { name_ = name; }
|
||||
static int StartRegister(int index) { return index * 2; }
|
||||
static int EndRegister(int index) { return index * 2 + 1; }
|
||||
|
||||
private:
|
||||
RegExpTree* body_;
|
||||
RegExpTree* body_ = nullptr;
|
||||
int index_;
|
||||
int min_match_;
|
||||
int max_match_;
|
||||
const ZoneVector<uc16>* name_;
|
||||
int min_match_ = 0;
|
||||
int max_match_ = 0;
|
||||
const ZoneVector<base::uc16>* name_ = nullptr;
|
||||
};
|
||||
|
||||
class RegExpGroup final : public RegExpTree {
|
||||
|
@ -499,19 +489,15 @@ class RegExpGroup final : public RegExpTree {
|
|||
: body_(body),
|
||||
min_match_(body->min_match()),
|
||||
max_match_(body->max_match()) {}
|
||||
void* Accept(RegExpVisitor* visitor, void* data) override;
|
||||
RegExpNode* ToNode(RegExpCompiler* compiler,
|
||||
RegExpNode* on_success) override {
|
||||
return body_->ToNode(compiler, on_success);
|
||||
}
|
||||
RegExpGroup* AsGroup() override;
|
||||
|
||||
DECL_BOILERPLATE(Group);
|
||||
|
||||
bool IsAnchoredAtStart() override { return body_->IsAnchoredAtStart(); }
|
||||
bool IsAnchoredAtEnd() override { return body_->IsAnchoredAtEnd(); }
|
||||
bool IsGroup() override;
|
||||
int min_match() override { return min_match_; }
|
||||
int max_match() override { return max_match_; }
|
||||
Interval CaptureRegisters() override { return body_->CaptureRegisters(); }
|
||||
RegExpTree* body() { return body_; }
|
||||
RegExpTree* body() const { return body_; }
|
||||
|
||||
private:
|
||||
RegExpTree* body_;
|
||||
|
@ -531,26 +517,24 @@ class RegExpLookaround final : public RegExpTree {
|
|||
capture_from_(capture_from),
|
||||
type_(type) {}
|
||||
|
||||
void* Accept(RegExpVisitor* visitor, void* data) override;
|
||||
RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
|
||||
RegExpLookaround* AsLookaround() override;
|
||||
DECL_BOILERPLATE(Lookaround);
|
||||
|
||||
Interval CaptureRegisters() override;
|
||||
bool IsLookaround() override;
|
||||
bool IsAnchoredAtStart() override;
|
||||
int min_match() override { return 0; }
|
||||
int max_match() override { return 0; }
|
||||
RegExpTree* body() { return body_; }
|
||||
bool is_positive() { return is_positive_; }
|
||||
int capture_count() { return capture_count_; }
|
||||
int capture_from() { return capture_from_; }
|
||||
Type type() { return type_; }
|
||||
RegExpTree* body() const { return body_; }
|
||||
bool is_positive() const { return is_positive_; }
|
||||
int capture_count() const { return capture_count_; }
|
||||
int capture_from() const { return capture_from_; }
|
||||
Type type() const { return type_; }
|
||||
|
||||
class Builder {
|
||||
public:
|
||||
Builder(bool is_positive, RegExpNode* on_success,
|
||||
int stack_pointer_register, int position_register,
|
||||
int capture_register_count = 0, int capture_register_start = 0);
|
||||
RegExpNode* on_match_success() { return on_match_success_; }
|
||||
RegExpNode* on_match_success() const { return on_match_success_; }
|
||||
RegExpNode* ForMatch(RegExpNode* match);
|
||||
|
||||
private:
|
||||
|
@ -572,38 +556,32 @@ class RegExpLookaround final : public RegExpTree {
|
|||
|
||||
class RegExpBackReference final : public RegExpTree {
|
||||
public:
|
||||
explicit RegExpBackReference(JSRegExp::Flags flags)
|
||||
: capture_(nullptr), name_(nullptr), flags_(flags) {}
|
||||
RegExpBackReference(RegExpCapture* capture, JSRegExp::Flags flags)
|
||||
: capture_(capture), name_(nullptr), flags_(flags) {}
|
||||
void* Accept(RegExpVisitor* visitor, void* data) override;
|
||||
RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
|
||||
RegExpBackReference* AsBackReference() override;
|
||||
bool IsBackReference() override;
|
||||
explicit RegExpBackReference(RegExpFlags flags) : flags_(flags) {}
|
||||
RegExpBackReference(RegExpCapture* capture, RegExpFlags flags)
|
||||
: capture_(capture), flags_(flags) {}
|
||||
|
||||
DECL_BOILERPLATE(BackReference);
|
||||
|
||||
int min_match() override { return 0; }
|
||||
// The back reference may be recursive, e.g. /(\2)(\1)/. To avoid infinite
|
||||
// recursion, we give up. Ignorance is bliss.
|
||||
int max_match() override { return kInfinity; }
|
||||
int index() { return capture_->index(); }
|
||||
RegExpCapture* capture() { return capture_; }
|
||||
int index() const { return capture_->index(); }
|
||||
RegExpCapture* capture() const { return capture_; }
|
||||
void set_capture(RegExpCapture* capture) { capture_ = capture; }
|
||||
const ZoneVector<uc16>* name() const { return name_; }
|
||||
void set_name(const ZoneVector<uc16>* name) { name_ = name; }
|
||||
const ZoneVector<base::uc16>* name() const { return name_; }
|
||||
void set_name(const ZoneVector<base::uc16>* name) { name_ = name; }
|
||||
|
||||
private:
|
||||
RegExpCapture* capture_;
|
||||
const ZoneVector<uc16>* name_;
|
||||
const JSRegExp::Flags flags_;
|
||||
RegExpCapture* capture_ = nullptr;
|
||||
const ZoneVector<base::uc16>* name_ = nullptr;
|
||||
const RegExpFlags flags_;
|
||||
};
|
||||
|
||||
|
||||
class RegExpEmpty final : public RegExpTree {
|
||||
public:
|
||||
RegExpEmpty() = default;
|
||||
void* Accept(RegExpVisitor* visitor, void* data) override;
|
||||
RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
|
||||
RegExpEmpty* AsEmpty() override;
|
||||
bool IsEmpty() override;
|
||||
DECL_BOILERPLATE(Empty);
|
||||
int min_match() override { return 0; }
|
||||
int max_match() override { return 0; }
|
||||
};
|
||||
|
@ -611,4 +589,6 @@ class RegExpEmpty final : public RegExpTree {
|
|||
} // namespace internal
|
||||
} // namespace v8
|
||||
|
||||
#undef DECL_BOILERPLATE
|
||||
|
||||
#endif // V8_REGEXP_REGEXP_AST_H_
|
||||
|
|
|
@ -23,29 +23,29 @@ void RegExpBytecodeGenerator::Emit(uint32_t byte, int32_t twenty_four_bits) {
|
|||
}
|
||||
|
||||
void RegExpBytecodeGenerator::Emit16(uint32_t word) {
|
||||
DCHECK(pc_ <= buffer_.length());
|
||||
if (pc_ + 1 >= buffer_.length()) {
|
||||
Expand();
|
||||
DCHECK(pc_ <= static_cast<int>(buffer_.size()));
|
||||
if (pc_ + 1 >= static_cast<int>(buffer_.size())) {
|
||||
ExpandBuffer();
|
||||
}
|
||||
*reinterpret_cast<uint16_t*>(buffer_.begin() + pc_) = word;
|
||||
*reinterpret_cast<uint16_t*>(buffer_.data() + pc_) = word;
|
||||
pc_ += 2;
|
||||
}
|
||||
|
||||
void RegExpBytecodeGenerator::Emit8(uint32_t word) {
|
||||
DCHECK(pc_ <= buffer_.length());
|
||||
if (pc_ == buffer_.length()) {
|
||||
Expand();
|
||||
DCHECK(pc_ <= static_cast<int>(buffer_.size()));
|
||||
if (pc_ == static_cast<int>(buffer_.size())) {
|
||||
ExpandBuffer();
|
||||
}
|
||||
*reinterpret_cast<unsigned char*>(buffer_.begin() + pc_) = word;
|
||||
*reinterpret_cast<unsigned char*>(buffer_.data() + pc_) = word;
|
||||
pc_ += 1;
|
||||
}
|
||||
|
||||
void RegExpBytecodeGenerator::Emit32(uint32_t word) {
|
||||
DCHECK(pc_ <= buffer_.length());
|
||||
if (pc_ + 3 >= buffer_.length()) {
|
||||
Expand();
|
||||
DCHECK(pc_ <= static_cast<int>(buffer_.size()));
|
||||
if (pc_ + 3 >= static_cast<int>(buffer_.size())) {
|
||||
ExpandBuffer();
|
||||
}
|
||||
*reinterpret_cast<uint32_t*>(buffer_.begin() + pc_) = word;
|
||||
*reinterpret_cast<uint32_t*>(buffer_.data() + pc_) = word;
|
||||
pc_ += 4;
|
||||
}
|
||||
|
||||
|
|
|
@ -14,7 +14,7 @@ namespace internal {
|
|||
|
||||
RegExpBytecodeGenerator::RegExpBytecodeGenerator(Isolate* isolate, Zone* zone)
|
||||
: RegExpMacroAssembler(isolate, zone),
|
||||
buffer_(Vector<byte>::New(1024)),
|
||||
buffer_(kInitialBufferSize, zone),
|
||||
pc_(0),
|
||||
advance_current_end_(kInvalidPC),
|
||||
jump_edges_(zone),
|
||||
|
@ -22,7 +22,6 @@ RegExpBytecodeGenerator::RegExpBytecodeGenerator(Isolate* isolate, Zone* zone)
|
|||
|
||||
RegExpBytecodeGenerator::~RegExpBytecodeGenerator() {
|
||||
if (backtrack_.is_linked()) backtrack_.Unuse();
|
||||
buffer_.Dispose();
|
||||
}
|
||||
|
||||
RegExpBytecodeGenerator::IrregexpImplementation
|
||||
|
@ -37,8 +36,8 @@ void RegExpBytecodeGenerator::Bind(Label* l) {
|
|||
int pos = l->pos();
|
||||
while (pos != 0) {
|
||||
int fixup = pos;
|
||||
pos = *reinterpret_cast<int32_t*>(buffer_.begin() + fixup);
|
||||
*reinterpret_cast<uint32_t*>(buffer_.begin() + fixup) = pc_;
|
||||
pos = *reinterpret_cast<int32_t*>(buffer_.data() + fixup);
|
||||
*reinterpret_cast<uint32_t*>(buffer_.data() + fixup) = pc_;
|
||||
jump_edges_.emplace(fixup, pc_);
|
||||
}
|
||||
}
|
||||
|
@ -218,12 +217,14 @@ void RegExpBytecodeGenerator::LoadCurrentCharacterImpl(int cp_offset,
|
|||
if (check_bounds) EmitOrLink(on_failure);
|
||||
}
|
||||
|
||||
void RegExpBytecodeGenerator::CheckCharacterLT(uc16 limit, Label* on_less) {
|
||||
void RegExpBytecodeGenerator::CheckCharacterLT(base::uc16 limit,
|
||||
Label* on_less) {
|
||||
Emit(BC_CHECK_LT, limit);
|
||||
EmitOrLink(on_less);
|
||||
}
|
||||
|
||||
void RegExpBytecodeGenerator::CheckCharacterGT(uc16 limit, Label* on_greater) {
|
||||
void RegExpBytecodeGenerator::CheckCharacterGT(base::uc16 limit,
|
||||
Label* on_greater) {
|
||||
Emit(BC_CHECK_GT, limit);
|
||||
EmitOrLink(on_greater);
|
||||
}
|
||||
|
@ -286,14 +287,15 @@ void RegExpBytecodeGenerator::CheckNotCharacterAfterAnd(uint32_t c,
|
|||
}
|
||||
|
||||
void RegExpBytecodeGenerator::CheckNotCharacterAfterMinusAnd(
|
||||
uc16 c, uc16 minus, uc16 mask, Label* on_not_equal) {
|
||||
base::uc16 c, base::uc16 minus, base::uc16 mask, Label* on_not_equal) {
|
||||
Emit(BC_MINUS_AND_CHECK_NOT_CHAR, c);
|
||||
Emit16(minus);
|
||||
Emit16(mask);
|
||||
EmitOrLink(on_not_equal);
|
||||
}
|
||||
|
||||
void RegExpBytecodeGenerator::CheckCharacterInRange(uc16 from, uc16 to,
|
||||
void RegExpBytecodeGenerator::CheckCharacterInRange(base::uc16 from,
|
||||
base::uc16 to,
|
||||
Label* on_in_range) {
|
||||
Emit(BC_CHECK_CHAR_IN_RANGE, 0);
|
||||
Emit16(from);
|
||||
|
@ -301,7 +303,8 @@ void RegExpBytecodeGenerator::CheckCharacterInRange(uc16 from, uc16 to,
|
|||
EmitOrLink(on_in_range);
|
||||
}
|
||||
|
||||
void RegExpBytecodeGenerator::CheckCharacterNotInRange(uc16 from, uc16 to,
|
||||
void RegExpBytecodeGenerator::CheckCharacterNotInRange(base::uc16 from,
|
||||
base::uc16 to,
|
||||
Label* on_not_in_range) {
|
||||
Emit(BC_CHECK_CHAR_NOT_IN_RANGE, 0);
|
||||
Emit16(from);
|
||||
|
@ -377,7 +380,7 @@ Handle<HeapObject> RegExpBytecodeGenerator::GetCode(Handle<String> source) {
|
|||
Handle<ByteArray> array;
|
||||
if (FLAG_regexp_peephole_optimization) {
|
||||
array = RegExpBytecodePeepholeOptimization::OptimizeBytecode(
|
||||
isolate_, zone(), source, buffer_.begin(), length(), jump_edges_);
|
||||
isolate_, zone(), source, buffer_.data(), length(), jump_edges_);
|
||||
} else {
|
||||
array = isolate_->factory()->NewByteArray(length());
|
||||
Copy(array->GetDataStartAddress());
|
||||
|
@ -389,14 +392,13 @@ Handle<HeapObject> RegExpBytecodeGenerator::GetCode(Handle<String> source) {
|
|||
int RegExpBytecodeGenerator::length() { return pc_; }
|
||||
|
||||
void RegExpBytecodeGenerator::Copy(byte* a) {
|
||||
MemCopy(a, buffer_.begin(), length());
|
||||
MemCopy(a, buffer_.data(), length());
|
||||
}
|
||||
|
||||
void RegExpBytecodeGenerator::Expand() {
|
||||
Vector<byte> old_buffer = buffer_;
|
||||
buffer_ = Vector<byte>::New(old_buffer.length() * 2);
|
||||
MemCopy(buffer_.begin(), old_buffer.begin(), old_buffer.length());
|
||||
old_buffer.Dispose();
|
||||
void RegExpBytecodeGenerator::ExpandBuffer() {
|
||||
// TODO(jgruber): The growth strategy could be smarter for large sizes.
|
||||
// TODO(jgruber): It's not necessary to default-initialize new elements.
|
||||
buffer_.resize(buffer_.size() * 2);
|
||||
}
|
||||
|
||||
} // namespace internal
|
||||
|
|
|
@ -25,7 +25,7 @@ class V8_EXPORT_PRIVATE RegExpBytecodeGenerator : public RegExpMacroAssembler {
|
|||
~RegExpBytecodeGenerator() override;
|
||||
// The byte-code interpreter checks on each push anyway.
|
||||
int stack_limit_slack() override { return 1; }
|
||||
bool CanReadUnaligned() override { return false; }
|
||||
bool CanReadUnaligned() const override { return false; }
|
||||
void Bind(Label* label) override;
|
||||
void AdvanceCurrentPosition(int by) override; // Signed cp change.
|
||||
void PopCurrentPosition() override;
|
||||
|
@ -52,19 +52,36 @@ class V8_EXPORT_PRIVATE RegExpBytecodeGenerator : public RegExpMacroAssembler {
|
|||
void CheckCharacter(unsigned c, Label* on_equal) override;
|
||||
void CheckCharacterAfterAnd(unsigned c, unsigned mask,
|
||||
Label* on_equal) override;
|
||||
void CheckCharacterGT(uc16 limit, Label* on_greater) override;
|
||||
void CheckCharacterLT(uc16 limit, Label* on_less) override;
|
||||
void CheckCharacterGT(base::uc16 limit, Label* on_greater) override;
|
||||
void CheckCharacterLT(base::uc16 limit, Label* on_less) override;
|
||||
void CheckGreedyLoop(Label* on_tos_equals_current_position) override;
|
||||
void CheckAtStart(int cp_offset, Label* on_at_start) override;
|
||||
void CheckNotAtStart(int cp_offset, Label* on_not_at_start) override;
|
||||
void CheckNotCharacter(unsigned c, Label* on_not_equal) override;
|
||||
void CheckNotCharacterAfterAnd(unsigned c, unsigned mask,
|
||||
Label* on_not_equal) override;
|
||||
void CheckNotCharacterAfterMinusAnd(uc16 c, uc16 minus, uc16 mask,
|
||||
void CheckNotCharacterAfterMinusAnd(base::uc16 c, base::uc16 minus,
|
||||
base::uc16 mask,
|
||||
Label* on_not_equal) override;
|
||||
void CheckCharacterInRange(uc16 from, uc16 to, Label* on_in_range) override;
|
||||
void CheckCharacterNotInRange(uc16 from, uc16 to,
|
||||
void CheckCharacterInRange(base::uc16 from, base::uc16 to,
|
||||
Label* on_in_range) override;
|
||||
void CheckCharacterNotInRange(base::uc16 from, base::uc16 to,
|
||||
Label* on_not_in_range) override;
|
||||
bool CheckCharacterInRangeArray(const ZoneList<CharacterRange>* ranges,
|
||||
Label* on_in_range) override {
|
||||
// Disabled in the interpreter, because 1) there is no constant pool that
|
||||
// could store the ByteArray pointer, 2) bytecode size limits are not as
|
||||
// restrictive as code (e.g. branch distances on arm), 3) bytecode for
|
||||
// large character classes is already quite compact.
|
||||
// TODO(jgruber): Consider using BytecodeArrays (with a constant pool)
|
||||
// instead of plain ByteArrays; then we could implement
|
||||
// CheckCharacterInRangeArray in the interpreter.
|
||||
return false;
|
||||
}
|
||||
bool CheckCharacterNotInRangeArray(const ZoneList<CharacterRange>* ranges,
|
||||
Label* on_not_in_range) override {
|
||||
return false;
|
||||
}
|
||||
void CheckBitInTable(Handle<ByteArray> table, Label* on_bit_set) override;
|
||||
void CheckNotBackReference(int start_reg, bool read_backward,
|
||||
Label* on_no_match) override;
|
||||
|
@ -79,7 +96,8 @@ class V8_EXPORT_PRIVATE RegExpBytecodeGenerator : public RegExpMacroAssembler {
|
|||
Handle<HeapObject> GetCode(Handle<String> source) override;
|
||||
|
||||
private:
|
||||
void Expand();
|
||||
void ExpandBuffer();
|
||||
|
||||
// Code and bitmap emission.
|
||||
inline void EmitOrLink(Label* label);
|
||||
inline void Emit32(uint32_t x);
|
||||
|
@ -92,7 +110,9 @@ class V8_EXPORT_PRIVATE RegExpBytecodeGenerator : public RegExpMacroAssembler {
|
|||
void Copy(byte* a);
|
||||
|
||||
// The buffer into which code and relocation info are generated.
|
||||
Vector<byte> buffer_;
|
||||
static constexpr int kInitialBufferSize = 1024;
|
||||
ZoneVector<byte> buffer_;
|
||||
|
||||
// The program counter.
|
||||
int pc_;
|
||||
Label backtrack_;
|
||||
|
|
|
@ -258,13 +258,10 @@ int32_t GetArgumentValue(const byte* bytecode, int offset, int length) {
|
|||
switch (length) {
|
||||
case 1:
|
||||
return GetValue<byte>(bytecode, offset);
|
||||
break;
|
||||
case 2:
|
||||
return GetValue<int16_t>(bytecode, offset);
|
||||
break;
|
||||
case 4:
|
||||
return GetValue<int32_t>(bytecode, offset);
|
||||
break;
|
||||
default:
|
||||
UNREACHABLE();
|
||||
}
|
||||
|
|
|
@ -22,8 +22,9 @@ constexpr int BYTECODE_MASK = kRegExpPaddedBytecodeCount - 1;
|
|||
// positive values.
|
||||
const unsigned int MAX_FIRST_ARG = 0x7fffffu;
|
||||
const int BYTECODE_SHIFT = 8;
|
||||
STATIC_ASSERT(1 << BYTECODE_SHIFT > BYTECODE_MASK);
|
||||
static_assert(1 << BYTECODE_SHIFT > BYTECODE_MASK);
|
||||
|
||||
// The list of bytecodes, in format: V(Name, Code, ByteLength).
|
||||
// TODO(pthier): Argument offsets of bytecodes should be easily accessible by
|
||||
// name or at least by position.
|
||||
// TODO(jgruber): More precise types (e.g. int32/uint32 instead of value32).
|
||||
|
@ -85,12 +86,14 @@ STATIC_ASSERT(1 << BYTECODE_SHIFT > BYTECODE_MASK);
|
|||
/* 0x10 - 0x1F: Character to match against (after mask applied) */ \
|
||||
/* 0x20 - 0x3F: Bitmask bitwise and combined with current character */ \
|
||||
/* 0x40 - 0x5F: Address of bytecode when matched */ \
|
||||
V(AND_CHECK_CHAR, 28, 12) /* bc8 pad8 uint16 uint32 addr32 */ \
|
||||
V(AND_CHECK_NOT_4_CHARS, 29, 16) /* bc8 pad24 uint32 uint32 addr32 */ \
|
||||
V(AND_CHECK_NOT_CHAR, 30, 12) /* bc8 pad8 uint16 uint32 addr32 */ \
|
||||
V(MINUS_AND_CHECK_NOT_CHAR, 31, 12) /* bc8 pad8 uc16 uc16 uc16 addr32 */ \
|
||||
V(CHECK_CHAR_IN_RANGE, 32, 12) /* bc8 pad24 uc16 uc16 addr32 */ \
|
||||
V(CHECK_CHAR_NOT_IN_RANGE, 33, 12) /* bc8 pad24 uc16 uc16 addr32 */ \
|
||||
V(AND_CHECK_CHAR, 28, 12) /* bc8 pad8 uint16 uint32 addr32 */ \
|
||||
V(AND_CHECK_NOT_4_CHARS, 29, 16) /* bc8 pad24 uint32 uint32 addr32 */ \
|
||||
V(AND_CHECK_NOT_CHAR, 30, 12) /* bc8 pad8 uint16 uint32 addr32 */ \
|
||||
V(MINUS_AND_CHECK_NOT_CHAR, 31, \
|
||||
12) /* bc8 pad8 base::uc16 base::uc16 base::uc16 addr32 */ \
|
||||
V(CHECK_CHAR_IN_RANGE, 32, 12) /* bc8 pad24 base::uc16 base::uc16 addr32 */ \
|
||||
V(CHECK_CHAR_NOT_IN_RANGE, 33, \
|
||||
12) /* bc8 pad24 base::uc16 base::uc16 addr32 */ \
|
||||
/* Checks if the current character matches any of the characters encoded */ \
|
||||
/* in a bit table. Similar to/inspired by boyer moore string search */ \
|
||||
/* Bit Layout: */ \
|
||||
|
@ -99,8 +102,8 @@ STATIC_ASSERT(1 << BYTECODE_SHIFT > BYTECODE_MASK);
|
|||
/* 0x20 - 0x3F: Address of bytecode when bit is set */ \
|
||||
/* 0x40 - 0xBF: Bit table */ \
|
||||
V(CHECK_BIT_IN_TABLE, 34, 24) /* bc8 pad24 addr32 bits128 */ \
|
||||
V(CHECK_LT, 35, 8) /* bc8 pad8 uc16 addr32 */ \
|
||||
V(CHECK_GT, 36, 8) /* bc8 pad8 uc16 addr32 */ \
|
||||
V(CHECK_LT, 35, 8) /* bc8 pad8 base::uc16 addr32 */ \
|
||||
V(CHECK_GT, 36, 8) /* bc8 pad8 base::uc16 addr32 */ \
|
||||
V(CHECK_NOT_BACK_REF, 37, 8) /* bc8 reg_idx24 addr32 */ \
|
||||
V(CHECK_NOT_BACK_REF_NO_CASE, 38, 8) /* bc8 reg_idx24 addr32 */ \
|
||||
V(CHECK_NOT_BACK_REF_NO_CASE_UNICODE, 39, 8) \
|
||||
|
@ -215,7 +218,7 @@ static constexpr int kRegExpBytecodeCount = BYTECODE_ITERATOR(COUNT);
|
|||
// contiguous, strictly increasing, and start at 0.
|
||||
// TODO(jgruber): Do not explicitly assign values, instead generate them
|
||||
// implicitly from the list order.
|
||||
STATIC_ASSERT(kRegExpBytecodeCount == 59);
|
||||
static_assert(kRegExpBytecodeCount == 59);
|
||||
|
||||
#define DECLARE_BYTECODES(name, code, length) \
|
||||
static constexpr int BC_##name = code;
|
||||
|
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -5,11 +5,9 @@
|
|||
#include "irregexp/imported/regexp-compiler.h"
|
||||
|
||||
#include "irregexp/imported/regexp-macro-assembler-arch.h"
|
||||
#ifdef V8_INTL_SUPPORT
|
||||
#include "irregexp/imported/special-case.h"
|
||||
#endif // V8_INTL_SUPPORT
|
||||
|
||||
#ifdef V8_INTL_SUPPORT
|
||||
#include "irregexp/imported/special-case.h"
|
||||
#include "unicode/locid.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/utypes.h"
|
||||
|
@ -171,17 +169,17 @@ using namespace regexp_compiler_constants; // NOLINT(build/namespaces)
|
|||
|
||||
namespace {
|
||||
|
||||
constexpr uc32 MaxCodeUnit(const bool one_byte) {
|
||||
STATIC_ASSERT(String::kMaxOneByteCharCodeU <=
|
||||
constexpr base::uc32 MaxCodeUnit(const bool one_byte) {
|
||||
static_assert(String::kMaxOneByteCharCodeU <=
|
||||
std::numeric_limits<uint16_t>::max());
|
||||
STATIC_ASSERT(String::kMaxUtf16CodeUnitU <=
|
||||
static_assert(String::kMaxUtf16CodeUnitU <=
|
||||
std::numeric_limits<uint16_t>::max());
|
||||
return one_byte ? String::kMaxOneByteCharCodeU : String::kMaxUtf16CodeUnitU;
|
||||
}
|
||||
|
||||
constexpr uint32_t CharMask(const bool one_byte) {
|
||||
STATIC_ASSERT(base::bits::IsPowerOfTwo(String::kMaxOneByteCharCodeU + 1));
|
||||
STATIC_ASSERT(base::bits::IsPowerOfTwo(String::kMaxUtf16CodeUnitU + 1));
|
||||
static_assert(base::bits::IsPowerOfTwo(String::kMaxOneByteCharCodeU + 1));
|
||||
static_assert(base::bits::IsPowerOfTwo(String::kMaxUtf16CodeUnitU + 1));
|
||||
return MaxCodeUnit(one_byte);
|
||||
}
|
||||
|
||||
|
@ -235,12 +233,13 @@ class RecursionCheck {
|
|||
// Attempts to compile the regexp using an Irregexp code generator. Returns
|
||||
// a fixed array or a null handle depending on whether it succeeded.
|
||||
RegExpCompiler::RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
|
||||
bool one_byte)
|
||||
RegExpFlags flags, bool one_byte)
|
||||
: next_register_(JSRegExp::RegistersForCaptureCount(capture_count)),
|
||||
unicode_lookaround_stack_register_(kNoRegister),
|
||||
unicode_lookaround_position_register_(kNoRegister),
|
||||
work_list_(nullptr),
|
||||
recursion_depth_(0),
|
||||
flags_(flags),
|
||||
one_byte_(one_byte),
|
||||
reg_exp_too_big_(false),
|
||||
limiting_recursion_(false),
|
||||
|
@ -274,6 +273,9 @@ RegExpCompiler::CompilationResult RegExpCompiler::Assemble(
|
|||
if (!node->label()->is_bound()) node->Emit(this, &new_trace);
|
||||
}
|
||||
if (reg_exp_too_big_) {
|
||||
if (FLAG_correctness_fuzzer_suppressions) {
|
||||
FATAL("Aborting on excess zone allocation");
|
||||
}
|
||||
macro_assembler_->AbortedCodeGeneration();
|
||||
return CompilationResult::RegExpTooBig();
|
||||
}
|
||||
|
@ -480,7 +482,6 @@ void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
|
|||
}
|
||||
default:
|
||||
UNREACHABLE();
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -734,7 +735,7 @@ namespace {
|
|||
|
||||
#ifdef DEBUG
|
||||
bool ContainsOnlyUtf16CodeUnits(unibrow::uchar* chars, int length) {
|
||||
STATIC_ASSERT(sizeof(unibrow::uchar) == 4);
|
||||
static_assert(sizeof(unibrow::uchar) == 4);
|
||||
for (int i = 0; i < length; i++) {
|
||||
if (chars[i] > String::kMaxUtf16CodeUnit) return false;
|
||||
}
|
||||
|
@ -742,14 +743,11 @@ bool ContainsOnlyUtf16CodeUnits(unibrow::uchar* chars, int length) {
|
|||
}
|
||||
#endif // DEBUG
|
||||
|
||||
} // namespace
|
||||
|
||||
// Returns the number of characters in the equivalence class, omitting those
|
||||
// that cannot occur in the source string because it is Latin1.
|
||||
static int GetCaseIndependentLetters(Isolate* isolate, uc16 character,
|
||||
bool one_byte_subject,
|
||||
unibrow::uchar* letters,
|
||||
int letter_length) {
|
||||
int GetCaseIndependentLetters(Isolate* isolate, base::uc16 character,
|
||||
bool one_byte_subject, unibrow::uchar* letters,
|
||||
int letter_length) {
|
||||
#ifdef V8_INTL_SUPPORT
|
||||
if (RegExpCaseFolding::IgnoreSet().contains(character)) {
|
||||
letters[0] = character;
|
||||
|
@ -809,10 +807,9 @@ static int GetCaseIndependentLetters(Isolate* isolate, uc16 character,
|
|||
#endif // V8_INTL_SUPPORT
|
||||
}
|
||||
|
||||
static inline bool EmitSimpleCharacter(Isolate* isolate,
|
||||
RegExpCompiler* compiler, uc16 c,
|
||||
Label* on_failure, int cp_offset,
|
||||
bool check, bool preloaded) {
|
||||
inline bool EmitSimpleCharacter(Isolate* isolate, RegExpCompiler* compiler,
|
||||
base::uc16 c, Label* on_failure, int cp_offset,
|
||||
bool check, bool preloaded) {
|
||||
RegExpMacroAssembler* assembler = compiler->macro_assembler();
|
||||
bool bound_checked = false;
|
||||
if (!preloaded) {
|
||||
|
@ -825,9 +822,9 @@ static inline bool EmitSimpleCharacter(Isolate* isolate,
|
|||
|
||||
// Only emits non-letters (things that don't have case). Only used for case
|
||||
// independent matches.
|
||||
static inline bool EmitAtomNonLetter(Isolate* isolate, RegExpCompiler* compiler,
|
||||
uc16 c, Label* on_failure, int cp_offset,
|
||||
bool check, bool preloaded) {
|
||||
inline bool EmitAtomNonLetter(Isolate* isolate, RegExpCompiler* compiler,
|
||||
base::uc16 c, Label* on_failure, int cp_offset,
|
||||
bool check, bool preloaded) {
|
||||
RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
|
||||
bool one_byte = compiler->one_byte();
|
||||
unibrow::uchar chars[4];
|
||||
|
@ -854,28 +851,28 @@ static inline bool EmitAtomNonLetter(Isolate* isolate, RegExpCompiler* compiler,
|
|||
return checked;
|
||||
}
|
||||
|
||||
static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
|
||||
bool one_byte, uc16 c1, uc16 c2,
|
||||
Label* on_failure) {
|
||||
bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
|
||||
bool one_byte, base::uc16 c1, base::uc16 c2,
|
||||
Label* on_failure) {
|
||||
const uint32_t char_mask = CharMask(one_byte);
|
||||
uc16 exor = c1 ^ c2;
|
||||
base::uc16 exor = c1 ^ c2;
|
||||
// Check whether exor has only one bit set.
|
||||
if (((exor - 1) & exor) == 0) {
|
||||
// If c1 and c2 differ only by one bit.
|
||||
// Ecma262UnCanonicalize always gives the highest number last.
|
||||
DCHECK(c2 > c1);
|
||||
uc16 mask = char_mask ^ exor;
|
||||
base::uc16 mask = char_mask ^ exor;
|
||||
macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
|
||||
return true;
|
||||
}
|
||||
DCHECK(c2 > c1);
|
||||
uc16 diff = c2 - c1;
|
||||
base::uc16 diff = c2 - c1;
|
||||
if (((diff - 1) & diff) == 0 && c1 >= diff) {
|
||||
// If the characters differ by 2^n but don't differ by one bit then
|
||||
// subtract the difference from the found character, then do the or
|
||||
// trick. We avoid the theoretical case where negative numbers are
|
||||
// involved in order to simplify code generation.
|
||||
uc16 mask = char_mask ^ diff;
|
||||
base::uc16 mask = char_mask ^ diff;
|
||||
macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff, diff, mask,
|
||||
on_failure);
|
||||
return true;
|
||||
|
@ -885,9 +882,9 @@ static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
|
|||
|
||||
// Only emits letters (things that have case). Only used for case independent
|
||||
// matches.
|
||||
static inline bool EmitAtomLetter(Isolate* isolate, RegExpCompiler* compiler,
|
||||
uc16 c, Label* on_failure, int cp_offset,
|
||||
bool check, bool preloaded) {
|
||||
inline bool EmitAtomLetter(Isolate* isolate, RegExpCompiler* compiler,
|
||||
base::uc16 c, Label* on_failure, int cp_offset,
|
||||
bool check, bool preloaded) {
|
||||
RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
|
||||
bool one_byte = compiler->one_byte();
|
||||
unibrow::uchar chars[4];
|
||||
|
@ -925,9 +922,9 @@ static inline bool EmitAtomLetter(Isolate* isolate, RegExpCompiler* compiler,
|
|||
return true;
|
||||
}
|
||||
|
||||
static void EmitBoundaryTest(RegExpMacroAssembler* masm, int border,
|
||||
Label* fall_through, Label* above_or_equal,
|
||||
Label* below) {
|
||||
void EmitBoundaryTest(RegExpMacroAssembler* masm, int border,
|
||||
Label* fall_through, Label* above_or_equal,
|
||||
Label* below) {
|
||||
if (below != fall_through) {
|
||||
masm->CheckCharacterLT(border, below);
|
||||
if (above_or_equal != fall_through) masm->GoTo(above_or_equal);
|
||||
|
@ -936,9 +933,9 @@ static void EmitBoundaryTest(RegExpMacroAssembler* masm, int border,
|
|||
}
|
||||
}
|
||||
|
||||
static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm, int first,
|
||||
int last, Label* fall_through,
|
||||
Label* in_range, Label* out_of_range) {
|
||||
void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm, int first, int last,
|
||||
Label* fall_through, Label* in_range,
|
||||
Label* out_of_range) {
|
||||
if (in_range == fall_through) {
|
||||
if (first == last) {
|
||||
masm->CheckNotCharacter(first, out_of_range);
|
||||
|
@ -957,15 +954,15 @@ static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm, int first,
|
|||
|
||||
// even_label is for ranges[i] to ranges[i + 1] where i - start_index is even.
|
||||
// odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd.
|
||||
static void EmitUseLookupTable(RegExpMacroAssembler* masm,
|
||||
ZoneList<uc32>* ranges, uint32_t start_index,
|
||||
uint32_t end_index, uc32 min_char,
|
||||
Label* fall_through, Label* even_label,
|
||||
Label* odd_label) {
|
||||
void EmitUseLookupTable(RegExpMacroAssembler* masm,
|
||||
ZoneList<base::uc32>* ranges, uint32_t start_index,
|
||||
uint32_t end_index, base::uc32 min_char,
|
||||
Label* fall_through, Label* even_label,
|
||||
Label* odd_label) {
|
||||
static const uint32_t kSize = RegExpMacroAssembler::kTableSize;
|
||||
static const uint32_t kMask = RegExpMacroAssembler::kTableMask;
|
||||
|
||||
uc32 base = (min_char & ~kMask);
|
||||
base::uc32 base = (min_char & ~kMask);
|
||||
USE(base);
|
||||
|
||||
// Assert that everything is on one kTableSize page.
|
||||
|
@ -1012,10 +1009,9 @@ static void EmitUseLookupTable(RegExpMacroAssembler* masm,
|
|||
if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear);
|
||||
}
|
||||
|
||||
static void CutOutRange(RegExpMacroAssembler* masm, ZoneList<uc32>* ranges,
|
||||
uint32_t start_index, uint32_t end_index,
|
||||
uint32_t cut_index, Label* even_label,
|
||||
Label* odd_label) {
|
||||
void CutOutRange(RegExpMacroAssembler* masm, ZoneList<base::uc32>* ranges,
|
||||
uint32_t start_index, uint32_t end_index, uint32_t cut_index,
|
||||
Label* even_label, Label* odd_label) {
|
||||
bool odd = (((cut_index - start_index) & 1) == 1);
|
||||
Label* in_range_label = odd ? odd_label : even_label;
|
||||
Label dummy;
|
||||
|
@ -1036,14 +1032,14 @@ static void CutOutRange(RegExpMacroAssembler* masm, ZoneList<uc32>* ranges,
|
|||
|
||||
// Unicode case. Split the search space into kSize spaces that are handled
|
||||
// with recursion.
|
||||
static void SplitSearchSpace(ZoneList<uc32>* ranges, uint32_t start_index,
|
||||
uint32_t end_index, uint32_t* new_start_index,
|
||||
uint32_t* new_end_index, uc32* border) {
|
||||
void SplitSearchSpace(ZoneList<base::uc32>* ranges, uint32_t start_index,
|
||||
uint32_t end_index, uint32_t* new_start_index,
|
||||
uint32_t* new_end_index, base::uc32* border) {
|
||||
static const uint32_t kSize = RegExpMacroAssembler::kTableSize;
|
||||
static const uint32_t kMask = RegExpMacroAssembler::kTableMask;
|
||||
|
||||
uc32 first = ranges->at(start_index);
|
||||
uc32 last = ranges->at(end_index) - 1;
|
||||
base::uc32 first = ranges->at(start_index);
|
||||
base::uc32 last = ranges->at(end_index) - 1;
|
||||
|
||||
*new_start_index = start_index;
|
||||
*border = (ranges->at(start_index) & ~kMask) + kSize;
|
||||
|
@ -1102,15 +1098,16 @@ static void SplitSearchSpace(ZoneList<uc32>* ranges, uint32_t start_index,
|
|||
// know that the character is in the range of min_char to max_char inclusive.
|
||||
// Either label can be nullptr indicating backtracking. Either label can also
|
||||
// be equal to the fall_through label.
|
||||
static void GenerateBranches(RegExpMacroAssembler* masm, ZoneList<uc32>* ranges,
|
||||
uint32_t start_index, uint32_t end_index,
|
||||
uc32 min_char, uc32 max_char, Label* fall_through,
|
||||
Label* even_label, Label* odd_label) {
|
||||
void GenerateBranches(RegExpMacroAssembler* masm, ZoneList<base::uc32>* ranges,
|
||||
uint32_t start_index, uint32_t end_index,
|
||||
base::uc32 min_char, base::uc32 max_char,
|
||||
Label* fall_through, Label* even_label,
|
||||
Label* odd_label) {
|
||||
DCHECK_LE(min_char, String::kMaxUtf16CodeUnit);
|
||||
DCHECK_LE(max_char, String::kMaxUtf16CodeUnit);
|
||||
|
||||
uc32 first = ranges->at(start_index);
|
||||
uc32 last = ranges->at(end_index) - 1;
|
||||
base::uc32 first = ranges->at(start_index);
|
||||
base::uc32 last = ranges->at(end_index) - 1;
|
||||
|
||||
DCHECK_LT(min_char, first);
|
||||
|
||||
|
@ -1170,7 +1167,7 @@ static void GenerateBranches(RegExpMacroAssembler* masm, ZoneList<uc32>* ranges,
|
|||
|
||||
uint32_t new_start_index = 0;
|
||||
uint32_t new_end_index = 0;
|
||||
uc32 border = 0;
|
||||
base::uc32 border = 0;
|
||||
|
||||
SplitSearchSpace(ranges, start_index, end_index, &new_start_index,
|
||||
&new_end_index, &border);
|
||||
|
@ -1213,24 +1210,19 @@ static void GenerateBranches(RegExpMacroAssembler* masm, ZoneList<uc32>* ranges,
|
|||
}
|
||||
}
|
||||
|
||||
static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
|
||||
RegExpCharacterClass* cc, bool one_byte,
|
||||
Label* on_failure, int cp_offset, bool check_offset,
|
||||
bool preloaded, Zone* zone) {
|
||||
void EmitCharClass(RegExpMacroAssembler* macro_assembler,
|
||||
RegExpCharacterClass* cc, bool one_byte, Label* on_failure,
|
||||
int cp_offset, bool check_offset, bool preloaded,
|
||||
Zone* zone) {
|
||||
ZoneList<CharacterRange>* ranges = cc->ranges(zone);
|
||||
CharacterRange::Canonicalize(ranges);
|
||||
|
||||
const uc32 max_char = MaxCodeUnit(one_byte);
|
||||
int range_count = ranges->length();
|
||||
// Now that all processing (like case-insensitivity) is done, clamp the
|
||||
// ranges to the set of ranges that may actually occur in the subject string.
|
||||
if (one_byte) CharacterRange::ClampToOneByte(ranges);
|
||||
|
||||
int last_valid_range = range_count - 1;
|
||||
while (last_valid_range >= 0) {
|
||||
CharacterRange& range = ranges->at(last_valid_range);
|
||||
if (range.from() <= max_char) break;
|
||||
last_valid_range--;
|
||||
}
|
||||
|
||||
if (last_valid_range < 0) {
|
||||
const int ranges_length = ranges->length();
|
||||
if (ranges_length == 0) {
|
||||
if (!cc->is_negated()) {
|
||||
macro_assembler->GoTo(on_failure);
|
||||
}
|
||||
|
@ -1240,7 +1232,8 @@ static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
|
|||
return;
|
||||
}
|
||||
|
||||
if (last_valid_range == 0 && ranges->at(0).IsEverything(max_char)) {
|
||||
const base::uc32 max_char = MaxCodeUnit(one_byte);
|
||||
if (ranges_length == 1 && ranges->at(0).IsEverything(max_char)) {
|
||||
if (cc->is_negated()) {
|
||||
macro_assembler->GoTo(on_failure);
|
||||
} else {
|
||||
|
@ -1261,18 +1254,33 @@ static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
|
|||
return;
|
||||
}
|
||||
|
||||
// A new list with ascending entries. Each entry is a code unit
|
||||
// where there is a boundary between code units that are part of
|
||||
// the class and code units that are not. Normally we insert an
|
||||
// entry at zero which goes to the failure label, but if there
|
||||
// was already one there we fall through for success on that entry.
|
||||
// Subsequent entries have alternating meaning (success/failure).
|
||||
ZoneList<uc32>* range_boundaries =
|
||||
zone->New<ZoneList<uc32>>(last_valid_range, zone);
|
||||
static constexpr int kMaxRangesForInlineBranchGeneration = 16;
|
||||
if (ranges_length > kMaxRangesForInlineBranchGeneration) {
|
||||
// For large range sets, emit a more compact instruction sequence to avoid
|
||||
// a potentially problematic increase in code size.
|
||||
// Note the flipped logic below (we check InRange if negated, NotInRange if
|
||||
// not negated); this is necessary since the method falls through on
|
||||
// failure whereas we want to fall through on success.
|
||||
if (cc->is_negated()) {
|
||||
if (macro_assembler->CheckCharacterInRangeArray(ranges, on_failure)) {
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
if (macro_assembler->CheckCharacterNotInRangeArray(ranges, on_failure)) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Generate a flat list of range boundaries for consumption by
|
||||
// GenerateBranches. See the comment on that function for how the list should
|
||||
// be structured
|
||||
ZoneList<base::uc32>* range_boundaries =
|
||||
zone->New<ZoneList<base::uc32>>(ranges_length * 2, zone);
|
||||
|
||||
bool zeroth_entry_is_failure = !cc->is_negated();
|
||||
|
||||
for (int i = 0; i <= last_valid_range; i++) {
|
||||
for (int i = 0; i < ranges_length; i++) {
|
||||
CharacterRange& range = ranges->at(i);
|
||||
if (range.from() == 0) {
|
||||
DCHECK_EQ(i, 0);
|
||||
|
@ -1280,6 +1288,8 @@ static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
|
|||
} else {
|
||||
range_boundaries->Add(range.from(), zone);
|
||||
}
|
||||
// `+ 1` to convert from inclusive to exclusive `to`.
|
||||
// [from, to] == [from, to+1[.
|
||||
range_boundaries->Add(range.to() + 1, zone);
|
||||
}
|
||||
int end_index = range_boundaries->length() - 1;
|
||||
|
@ -1298,6 +1308,8 @@ static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
|
|||
macro_assembler->Bind(&fall_through);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
RegExpNode::~RegExpNode() = default;
|
||||
|
||||
RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
|
||||
|
@ -1385,8 +1397,10 @@ void NegativeLookaroundChoiceNode::GetQuickCheckDetails(
|
|||
return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
// Takes the left-most 1-bit and smears it out, setting all bits to its right.
|
||||
static inline uint32_t SmearBitsRight(uint32_t v) {
|
||||
inline uint32_t SmearBitsRight(uint32_t v) {
|
||||
v |= v >> 1;
|
||||
v |= v >> 2;
|
||||
v |= v >> 4;
|
||||
|
@ -1395,6 +1409,8 @@ static inline uint32_t SmearBitsRight(uint32_t v) {
|
|||
return v;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
bool QuickCheckDetails::Rationalize(bool asc) {
|
||||
bool found_useful_op = false;
|
||||
const uint32_t char_mask = CharMask(asc);
|
||||
|
@ -1574,12 +1590,12 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
|
|||
for (int k = 0; k < elements()->length(); k++) {
|
||||
TextElement elm = elements()->at(k);
|
||||
if (elm.text_type() == TextElement::ATOM) {
|
||||
Vector<const uc16> quarks = elm.atom()->data();
|
||||
base::Vector<const base::uc16> quarks = elm.atom()->data();
|
||||
for (int i = 0; i < characters && i < quarks.length(); i++) {
|
||||
QuickCheckDetails::Position* pos =
|
||||
details->positions(characters_filled_in);
|
||||
uc16 c = quarks[i];
|
||||
if (elm.atom()->ignore_case()) {
|
||||
base::uc16 c = quarks[i];
|
||||
if (IsIgnoreCase(compiler->flags())) {
|
||||
unibrow::uchar chars[4];
|
||||
int length = GetCaseIndependentLetters(
|
||||
isolate, c, compiler->one_byte(), chars, 4);
|
||||
|
@ -1640,12 +1656,14 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
|
|||
details->positions(characters_filled_in);
|
||||
RegExpCharacterClass* tree = elm.char_class();
|
||||
ZoneList<CharacterRange>* ranges = tree->ranges(zone());
|
||||
DCHECK(!ranges->is_empty());
|
||||
if (tree->is_negated()) {
|
||||
if (tree->is_negated() || ranges->is_empty()) {
|
||||
// A quick check uses multi-character mask and compare. There is no
|
||||
// useful way to incorporate a negative char class into this scheme
|
||||
// so we just conservatively create a mask and value that will always
|
||||
// succeed.
|
||||
// Likewise for empty ranges (empty ranges can occur e.g. when
|
||||
// compiling for one-byte subjects and impossible (non-one-byte) ranges
|
||||
// have been removed).
|
||||
pos->mask = 0;
|
||||
pos->value = 0;
|
||||
} else {
|
||||
|
@ -1659,8 +1677,9 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
|
|||
}
|
||||
}
|
||||
CharacterRange range = ranges->at(first_range);
|
||||
const uc32 first_from = range.from();
|
||||
const uc32 first_to = (range.to() > char_mask) ? char_mask : range.to();
|
||||
const base::uc32 first_from = range.from();
|
||||
const base::uc32 first_to =
|
||||
(range.to() > char_mask) ? char_mask : range.to();
|
||||
const uint32_t differing_bits = (first_from ^ first_to);
|
||||
// A mask and compare is only perfect if the differing bits form a
|
||||
// number like 00011111 with one single block of trailing 1s.
|
||||
|
@ -1671,10 +1690,11 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
|
|||
uint32_t common_bits = ~SmearBitsRight(differing_bits);
|
||||
uint32_t bits = (first_from & common_bits);
|
||||
for (int i = first_range + 1; i < ranges->length(); i++) {
|
||||
CharacterRange range = ranges->at(i);
|
||||
const uc32 from = range.from();
|
||||
range = ranges->at(i);
|
||||
const base::uc32 from = range.from();
|
||||
if (from > char_mask) continue;
|
||||
const uc32 to = (range.to() > char_mask) ? char_mask : range.to();
|
||||
const base::uc32 to =
|
||||
(range.to() > char_mask) ? char_mask : range.to();
|
||||
// Here we are combining more ranges into the mask and compare
|
||||
// value. With each new range the mask becomes more sparse and
|
||||
// so the chances of a false positive rise. A character class
|
||||
|
@ -1685,8 +1705,8 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
|
|||
new_common_bits = ~SmearBitsRight(new_common_bits);
|
||||
common_bits &= new_common_bits;
|
||||
bits &= new_common_bits;
|
||||
uint32_t differing_bits = (from & common_bits) ^ bits;
|
||||
common_bits ^= differing_bits;
|
||||
uint32_t new_differing_bits = (from & common_bits) ^ bits;
|
||||
common_bits ^= new_differing_bits;
|
||||
bits &= common_bits;
|
||||
}
|
||||
pos->mask = common_bits;
|
||||
|
@ -1807,16 +1827,16 @@ class IterationDecrementer {
|
|||
LoopChoiceNode* node_;
|
||||
};
|
||||
|
||||
RegExpNode* SeqRegExpNode::FilterOneByte(int depth) {
|
||||
RegExpNode* SeqRegExpNode::FilterOneByte(int depth, RegExpFlags flags) {
|
||||
if (info()->replacement_calculated) return replacement();
|
||||
if (depth < 0) return this;
|
||||
DCHECK(!info()->visited);
|
||||
VisitMarker marker(info());
|
||||
return FilterSuccessor(depth - 1);
|
||||
return FilterSuccessor(depth - 1, flags);
|
||||
}
|
||||
|
||||
RegExpNode* SeqRegExpNode::FilterSuccessor(int depth) {
|
||||
RegExpNode* next = on_success_->FilterOneByte(depth - 1);
|
||||
RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, RegExpFlags flags) {
|
||||
RegExpNode* next = on_success_->FilterOneByte(depth - 1, flags);
|
||||
if (next == nullptr) return set_replacement(nullptr);
|
||||
on_success_ = next;
|
||||
return set_replacement(this);
|
||||
|
@ -1829,7 +1849,9 @@ bool RangeContainsLatin1Equivalents(CharacterRange range) {
|
|||
range.Contains(0x0178);
|
||||
}
|
||||
|
||||
static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {
|
||||
namespace {
|
||||
|
||||
bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {
|
||||
for (int i = 0; i < ranges->length(); i++) {
|
||||
// TODO(dcarney): this could be a lot more efficient.
|
||||
if (RangeContainsLatin1Equivalents(ranges->at(i))) return true;
|
||||
|
@ -1837,7 +1859,9 @@ static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {
|
|||
return false;
|
||||
}
|
||||
|
||||
RegExpNode* TextNode::FilterOneByte(int depth) {
|
||||
} // namespace
|
||||
|
||||
RegExpNode* TextNode::FilterOneByte(int depth, RegExpFlags flags) {
|
||||
if (info()->replacement_calculated) return replacement();
|
||||
if (depth < 0) return this;
|
||||
DCHECK(!info()->visited);
|
||||
|
@ -1846,15 +1870,15 @@ RegExpNode* TextNode::FilterOneByte(int depth) {
|
|||
for (int i = 0; i < element_count; i++) {
|
||||
TextElement elm = elements()->at(i);
|
||||
if (elm.text_type() == TextElement::ATOM) {
|
||||
Vector<const uc16> quarks = elm.atom()->data();
|
||||
base::Vector<const base::uc16> quarks = elm.atom()->data();
|
||||
for (int j = 0; j < quarks.length(); j++) {
|
||||
uc16 c = quarks[j];
|
||||
if (elm.atom()->ignore_case()) {
|
||||
base::uc16 c = quarks[j];
|
||||
if (IsIgnoreCase(flags)) {
|
||||
c = unibrow::Latin1::TryConvertToLatin1(c);
|
||||
}
|
||||
if (c > unibrow::Latin1::kMaxChar) return set_replacement(nullptr);
|
||||
// Replace quark in case we converted to Latin-1.
|
||||
uc16* writable_quarks = const_cast<uc16*>(quarks.begin());
|
||||
base::uc16* writable_quarks = const_cast<base::uc16*>(quarks.begin());
|
||||
writable_quarks[j] = c;
|
||||
}
|
||||
} else {
|
||||
|
@ -1868,8 +1892,7 @@ RegExpNode* TextNode::FilterOneByte(int depth) {
|
|||
if (range_count != 0 && ranges->at(0).from() == 0 &&
|
||||
ranges->at(0).to() >= String::kMaxOneByteCharCode) {
|
||||
// This will be handled in a later filter.
|
||||
if (IgnoreCase(cc->flags()) &&
|
||||
RangesContainLatin1Equivalents(ranges)) {
|
||||
if (IsIgnoreCase(flags) && RangesContainLatin1Equivalents(ranges)) {
|
||||
continue;
|
||||
}
|
||||
return set_replacement(nullptr);
|
||||
|
@ -1878,8 +1901,7 @@ RegExpNode* TextNode::FilterOneByte(int depth) {
|
|||
if (range_count == 0 ||
|
||||
ranges->at(0).from() > String::kMaxOneByteCharCode) {
|
||||
// This will be handled in a later filter.
|
||||
if (IgnoreCase(cc->flags()) &&
|
||||
RangesContainLatin1Equivalents(ranges)) {
|
||||
if (IsIgnoreCase(flags) && RangesContainLatin1Equivalents(ranges)) {
|
||||
continue;
|
||||
}
|
||||
return set_replacement(nullptr);
|
||||
|
@ -1887,26 +1909,27 @@ RegExpNode* TextNode::FilterOneByte(int depth) {
|
|||
}
|
||||
}
|
||||
}
|
||||
return FilterSuccessor(depth - 1);
|
||||
return FilterSuccessor(depth - 1, flags);
|
||||
}
|
||||
|
||||
RegExpNode* LoopChoiceNode::FilterOneByte(int depth) {
|
||||
RegExpNode* LoopChoiceNode::FilterOneByte(int depth, RegExpFlags flags) {
|
||||
if (info()->replacement_calculated) return replacement();
|
||||
if (depth < 0) return this;
|
||||
if (info()->visited) return this;
|
||||
{
|
||||
VisitMarker marker(info());
|
||||
|
||||
RegExpNode* continue_replacement = continue_node_->FilterOneByte(depth - 1);
|
||||
RegExpNode* continue_replacement =
|
||||
continue_node_->FilterOneByte(depth - 1, flags);
|
||||
// If we can't continue after the loop then there is no sense in doing the
|
||||
// loop.
|
||||
if (continue_replacement == nullptr) return set_replacement(nullptr);
|
||||
}
|
||||
|
||||
return ChoiceNode::FilterOneByte(depth - 1);
|
||||
return ChoiceNode::FilterOneByte(depth - 1, flags);
|
||||
}
|
||||
|
||||
RegExpNode* ChoiceNode::FilterOneByte(int depth) {
|
||||
RegExpNode* ChoiceNode::FilterOneByte(int depth, RegExpFlags flags) {
|
||||
if (info()->replacement_calculated) return replacement();
|
||||
if (depth < 0) return this;
|
||||
if (info()->visited) return this;
|
||||
|
@ -1926,7 +1949,8 @@ RegExpNode* ChoiceNode::FilterOneByte(int depth) {
|
|||
RegExpNode* survivor = nullptr;
|
||||
for (int i = 0; i < choice_count; i++) {
|
||||
GuardedAlternative alternative = alternatives_->at(i);
|
||||
RegExpNode* replacement = alternative.node()->FilterOneByte(depth - 1);
|
||||
RegExpNode* replacement =
|
||||
alternative.node()->FilterOneByte(depth - 1, flags);
|
||||
DCHECK(replacement != this); // No missing EMPTY_MATCH_CHECK.
|
||||
if (replacement != nullptr) {
|
||||
alternatives_->at(i).set_node(replacement);
|
||||
|
@ -1946,7 +1970,7 @@ RegExpNode* ChoiceNode::FilterOneByte(int depth) {
|
|||
zone()->New<ZoneList<GuardedAlternative>>(surviving, zone());
|
||||
for (int i = 0; i < choice_count; i++) {
|
||||
RegExpNode* replacement =
|
||||
alternatives_->at(i).node()->FilterOneByte(depth - 1);
|
||||
alternatives_->at(i).node()->FilterOneByte(depth - 1, flags);
|
||||
if (replacement != nullptr) {
|
||||
alternatives_->at(i).set_node(replacement);
|
||||
new_alternatives->Add(alternatives_->at(i), zone());
|
||||
|
@ -1956,7 +1980,8 @@ RegExpNode* ChoiceNode::FilterOneByte(int depth) {
|
|||
return this;
|
||||
}
|
||||
|
||||
RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth) {
|
||||
RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth,
|
||||
RegExpFlags flags) {
|
||||
if (info()->replacement_calculated) return replacement();
|
||||
if (depth < 0) return this;
|
||||
if (info()->visited) return this;
|
||||
|
@ -1964,12 +1989,12 @@ RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth) {
|
|||
// Alternative 0 is the negative lookahead, alternative 1 is what comes
|
||||
// afterwards.
|
||||
RegExpNode* node = continue_node();
|
||||
RegExpNode* replacement = node->FilterOneByte(depth - 1);
|
||||
RegExpNode* replacement = node->FilterOneByte(depth - 1, flags);
|
||||
if (replacement == nullptr) return set_replacement(nullptr);
|
||||
alternatives_->at(kContinueIndex).set_node(replacement);
|
||||
|
||||
RegExpNode* neg_node = lookaround_node();
|
||||
RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1);
|
||||
RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1, flags);
|
||||
// If the negative lookahead is always going to fail then
|
||||
// we don't need to check it.
|
||||
if (neg_replacement == nullptr) return set_replacement(replacement);
|
||||
|
@ -2062,7 +2087,8 @@ namespace {
|
|||
void EmitWordCheck(RegExpMacroAssembler* assembler, Label* word,
|
||||
Label* non_word, bool fall_through_on_word) {
|
||||
if (assembler->CheckSpecialCharacterClass(
|
||||
fall_through_on_word ? 'w' : 'W',
|
||||
fall_through_on_word ? StandardCharacterSet::kWord
|
||||
: StandardCharacterSet::kNotWord,
|
||||
fall_through_on_word ? non_word : word)) {
|
||||
// Optimized implementation available.
|
||||
return;
|
||||
|
@ -2108,7 +2134,8 @@ void EmitHat(RegExpCompiler* compiler, RegExpNode* on_success, Trace* trace) {
|
|||
const bool can_skip_bounds_check = !may_be_at_or_before_subject_string_start;
|
||||
assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,
|
||||
new_trace.backtrack(), can_skip_bounds_check);
|
||||
if (!assembler->CheckSpecialCharacterClass('n', new_trace.backtrack())) {
|
||||
if (!assembler->CheckSpecialCharacterClass(
|
||||
StandardCharacterSet::kLineTerminator, new_trace.backtrack())) {
|
||||
// Newline means \n, \r, 0x2028 or 0x2029.
|
||||
if (!compiler->one_byte()) {
|
||||
assembler->CheckCharacterAfterAnd(0x2028, 0xFFFE, &ok);
|
||||
|
@ -2253,18 +2280,22 @@ void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
|
|||
on_success()->Emit(compiler, trace);
|
||||
}
|
||||
|
||||
static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
|
||||
namespace {
|
||||
|
||||
bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
|
||||
if (quick_check == nullptr) return false;
|
||||
if (offset >= quick_check->characters()) return false;
|
||||
return quick_check->positions(offset)->determines_perfectly;
|
||||
}
|
||||
|
||||
static void UpdateBoundsCheck(int index, int* checked_up_to) {
|
||||
void UpdateBoundsCheck(int index, int* checked_up_to) {
|
||||
if (index > *checked_up_to) {
|
||||
*checked_up_to = index;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
// We call this repeatedly to generate code for each pass over the text node.
|
||||
// The passes are in increasing order of difficulty because we hope one
|
||||
// of the first passes will fail in which case we are saved the work of the
|
||||
|
@ -2308,13 +2339,13 @@ void TextNode::TextEmitPass(RegExpCompiler* compiler, TextEmitPassType pass,
|
|||
TextElement elm = elements()->at(i);
|
||||
int cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset;
|
||||
if (elm.text_type() == TextElement::ATOM) {
|
||||
if (SkipPass(pass, elm.atom()->ignore_case())) continue;
|
||||
Vector<const uc16> quarks = elm.atom()->data();
|
||||
if (SkipPass(pass, IsIgnoreCase(compiler->flags()))) continue;
|
||||
base::Vector<const base::uc16> quarks = elm.atom()->data();
|
||||
for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
|
||||
if (first_element_checked && i == 0 && j == 0) continue;
|
||||
if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;
|
||||
uc16 quark = quarks[j];
|
||||
if (elm.atom()->ignore_case()) {
|
||||
base::uc16 quark = quarks[j];
|
||||
if (IsIgnoreCase(compiler->flags())) {
|
||||
// Everywhere else we assume that a non-Latin-1 character cannot match
|
||||
// a Latin-1 character. Avoid the cases where this assumption is
|
||||
// invalid by using the Latin1 equivalent instead.
|
||||
|
@ -2383,29 +2414,38 @@ bool TextNode::SkipPass(TextEmitPassType pass, bool ignore_case) {
|
|||
TextNode* TextNode::CreateForCharacterRanges(Zone* zone,
|
||||
ZoneList<CharacterRange>* ranges,
|
||||
bool read_backward,
|
||||
RegExpNode* on_success,
|
||||
JSRegExp::Flags flags) {
|
||||
RegExpNode* on_success) {
|
||||
DCHECK_NOT_NULL(ranges);
|
||||
ZoneList<TextElement>* elms = zone->New<ZoneList<TextElement>>(1, zone);
|
||||
// TODO(jgruber): There's no fundamental need to create this
|
||||
// RegExpCharacterClass; we could refactor to avoid the allocation.
|
||||
return zone->New<TextNode>(zone->New<RegExpCharacterClass>(zone, ranges),
|
||||
read_backward, on_success);
|
||||
}
|
||||
|
||||
TextNode* TextNode::CreateForSurrogatePair(
|
||||
Zone* zone, CharacterRange lead, ZoneList<CharacterRange>* trail_ranges,
|
||||
bool read_backward, RegExpNode* on_success) {
|
||||
ZoneList<CharacterRange>* lead_ranges = CharacterRange::List(zone, lead);
|
||||
ZoneList<TextElement>* elms = zone->New<ZoneList<TextElement>>(2, zone);
|
||||
elms->Add(TextElement::CharClass(
|
||||
zone->New<RegExpCharacterClass>(zone, ranges, flags)),
|
||||
zone->New<RegExpCharacterClass>(zone, lead_ranges)),
|
||||
zone);
|
||||
elms->Add(TextElement::CharClass(
|
||||
zone->New<RegExpCharacterClass>(zone, trail_ranges)),
|
||||
zone);
|
||||
return zone->New<TextNode>(elms, read_backward, on_success);
|
||||
}
|
||||
|
||||
TextNode* TextNode::CreateForSurrogatePair(Zone* zone, CharacterRange lead,
|
||||
CharacterRange trail,
|
||||
bool read_backward,
|
||||
RegExpNode* on_success,
|
||||
JSRegExp::Flags flags) {
|
||||
ZoneList<CharacterRange>* lead_ranges = CharacterRange::List(zone, lead);
|
||||
TextNode* TextNode::CreateForSurrogatePair(
|
||||
Zone* zone, ZoneList<CharacterRange>* lead_ranges, CharacterRange trail,
|
||||
bool read_backward, RegExpNode* on_success) {
|
||||
ZoneList<CharacterRange>* trail_ranges = CharacterRange::List(zone, trail);
|
||||
ZoneList<TextElement>* elms = zone->New<ZoneList<TextElement>>(2, zone);
|
||||
elms->Add(TextElement::CharClass(
|
||||
zone->New<RegExpCharacterClass>(zone, lead_ranges, flags)),
|
||||
zone->New<RegExpCharacterClass>(zone, lead_ranges)),
|
||||
zone);
|
||||
elms->Add(TextElement::CharClass(
|
||||
zone->New<RegExpCharacterClass>(zone, trail_ranges, flags)),
|
||||
zone->New<RegExpCharacterClass>(zone, trail_ranges)),
|
||||
zone);
|
||||
return zone->New<TextNode>(elms, read_backward, on_success);
|
||||
}
|
||||
|
@ -2479,26 +2519,23 @@ void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
|
|||
bound_checked_up_to_ = std::max(0, bound_checked_up_to_ - by);
|
||||
}
|
||||
|
||||
void TextNode::MakeCaseIndependent(Isolate* isolate, bool is_one_byte) {
|
||||
void TextNode::MakeCaseIndependent(Isolate* isolate, bool is_one_byte,
|
||||
RegExpFlags flags) {
|
||||
if (!IsIgnoreCase(flags)) return;
|
||||
#ifdef V8_INTL_SUPPORT
|
||||
if (NeedsUnicodeCaseEquivalents(flags)) return;
|
||||
#endif
|
||||
|
||||
int element_count = elements()->length();
|
||||
for (int i = 0; i < element_count; i++) {
|
||||
TextElement elm = elements()->at(i);
|
||||
if (elm.text_type() == TextElement::CHAR_CLASS) {
|
||||
RegExpCharacterClass* cc = elm.char_class();
|
||||
#ifdef V8_INTL_SUPPORT
|
||||
bool case_equivalents_already_added =
|
||||
NeedsUnicodeCaseEquivalents(cc->flags());
|
||||
#else
|
||||
bool case_equivalents_already_added = false;
|
||||
#endif
|
||||
if (IgnoreCase(cc->flags()) && !case_equivalents_already_added) {
|
||||
// None of the standard character classes is different in the case
|
||||
// independent case and it slows us down if we don't know that.
|
||||
if (cc->is_standard(zone())) continue;
|
||||
ZoneList<CharacterRange>* ranges = cc->ranges(zone());
|
||||
CharacterRange::AddCaseEquivalents(isolate, zone(), ranges,
|
||||
is_one_byte);
|
||||
}
|
||||
// None of the standard character classes is different in the case
|
||||
// independent case and it slows us down if we don't know that.
|
||||
if (cc->is_standard(zone())) continue;
|
||||
ZoneList<CharacterRange>* ranges = cc->ranges(zone());
|
||||
CharacterRange::AddCaseEquivalents(isolate, zone(), ranges, is_one_byte);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -2518,7 +2555,7 @@ RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(
|
|||
return ranges->length() == 0 ? on_success() : nullptr;
|
||||
}
|
||||
if (ranges->length() != 1) return nullptr;
|
||||
const uc32 max_char = MaxCodeUnit(compiler->one_byte());
|
||||
const base::uc32 max_char = MaxCodeUnit(compiler->one_byte());
|
||||
return ranges->at(0).IsEverything(max_char) ? on_success() : nullptr;
|
||||
}
|
||||
|
||||
|
@ -2681,7 +2718,7 @@ ContainedInLattice AddRange(ContainedInLattice containment, const int* ranges,
|
|||
}
|
||||
|
||||
int BitsetFirstSetBit(BoyerMoorePositionInfo::Bitset bitset) {
|
||||
STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize ==
|
||||
static_assert(BoyerMoorePositionInfo::kMapSize ==
|
||||
2 * kInt64Size * kBitsPerByte);
|
||||
|
||||
// Slight fiddling is needed here, since the bitset is of length 128 while
|
||||
|
@ -2692,7 +2729,7 @@ int BitsetFirstSetBit(BoyerMoorePositionInfo::Bitset bitset) {
|
|||
{
|
||||
static constexpr BoyerMoorePositionInfo::Bitset mask(~uint64_t{0});
|
||||
BoyerMoorePositionInfo::Bitset masked_bitset = bitset & mask;
|
||||
STATIC_ASSERT(kInt64Size >= sizeof(decltype(masked_bitset.to_ullong())));
|
||||
static_assert(kInt64Size >= sizeof(decltype(masked_bitset.to_ullong())));
|
||||
uint64_t lsb = masked_bitset.to_ullong();
|
||||
if (lsb != 0) return base::bits::CountTrailingZeros(lsb);
|
||||
}
|
||||
|
@ -3436,7 +3473,7 @@ void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
|
|||
RecursionCheck rc(compiler);
|
||||
|
||||
DCHECK_EQ(start_reg_ + 1, end_reg_);
|
||||
if (IgnoreCase(flags_)) {
|
||||
if (IsIgnoreCase(flags_)) {
|
||||
bool unicode = IsUnicode(flags_);
|
||||
assembler->CheckNotBackReferenceIgnoreCase(start_reg_, read_backward(),
|
||||
unicode, trace->backtrack());
|
||||
|
@ -3626,9 +3663,10 @@ class EatsAtLeastPropagator : public AllStatic {
|
|||
template <typename... Propagators>
|
||||
class Analysis : public NodeVisitor {
|
||||
public:
|
||||
Analysis(Isolate* isolate, bool is_one_byte)
|
||||
Analysis(Isolate* isolate, bool is_one_byte, RegExpFlags flags)
|
||||
: isolate_(isolate),
|
||||
is_one_byte_(is_one_byte),
|
||||
flags_(flags),
|
||||
error_(RegExpError::kNone) {}
|
||||
|
||||
void EnsureAnalyzed(RegExpNode* that) {
|
||||
|
@ -3669,7 +3707,7 @@ class Analysis : public NodeVisitor {
|
|||
} while (false)
|
||||
|
||||
void VisitText(TextNode* that) override {
|
||||
that->MakeCaseIndependent(isolate(), is_one_byte_);
|
||||
that->MakeCaseIndependent(isolate(), is_one_byte_, flags_);
|
||||
EnsureAnalyzed(that->on_success());
|
||||
if (has_failed()) return;
|
||||
that->CalculateOffsets();
|
||||
|
@ -3736,16 +3774,17 @@ class Analysis : public NodeVisitor {
|
|||
|
||||
private:
|
||||
Isolate* isolate_;
|
||||
bool is_one_byte_;
|
||||
const bool is_one_byte_;
|
||||
const RegExpFlags flags_;
|
||||
RegExpError error_;
|
||||
|
||||
DISALLOW_IMPLICIT_CONSTRUCTORS(Analysis);
|
||||
};
|
||||
|
||||
RegExpError AnalyzeRegExp(Isolate* isolate, bool is_one_byte,
|
||||
RegExpError AnalyzeRegExp(Isolate* isolate, bool is_one_byte, RegExpFlags flags,
|
||||
RegExpNode* node) {
|
||||
Analysis<AssertionPropagator, EatsAtLeastPropagator> analysis(isolate,
|
||||
is_one_byte);
|
||||
Analysis<AssertionPropagator, EatsAtLeastPropagator> analysis(
|
||||
isolate, is_one_byte, flags);
|
||||
DCHECK_EQ(node->info()->been_analyzed, false);
|
||||
analysis.EnsureAnalyzed(node);
|
||||
DCHECK_IMPLIES(analysis.has_failed(), analysis.error() != RegExpError::kNone);
|
||||
|
@ -3761,7 +3800,7 @@ void BackReferenceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
|
|||
SaveBMInfo(bm, not_at_start, offset);
|
||||
}
|
||||
|
||||
STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize ==
|
||||
static_assert(BoyerMoorePositionInfo::kMapSize ==
|
||||
RegExpMacroAssembler::kTableSize);
|
||||
|
||||
void ChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
|
||||
|
@ -3798,14 +3837,14 @@ void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget,
|
|||
if (initial_offset == 0) set_bm_info(not_at_start, bm);
|
||||
return;
|
||||
}
|
||||
uc16 character = atom->data()[j];
|
||||
if (IgnoreCase(atom->flags())) {
|
||||
base::uc16 character = atom->data()[j];
|
||||
if (IsIgnoreCase(bm->compiler()->flags())) {
|
||||
unibrow::uchar chars[4];
|
||||
int length = GetCaseIndependentLetters(
|
||||
isolate, character, bm->max_char() == String::kMaxOneByteCharCode,
|
||||
chars, 4);
|
||||
for (int j = 0; j < length; j++) {
|
||||
bm->Set(offset, chars[j]);
|
||||
for (int k = 0; k < length; k++) {
|
||||
bm->Set(offset, chars[k]);
|
||||
}
|
||||
} else {
|
||||
if (character <= max_char) bm->Set(offset, character);
|
||||
|
@ -3838,7 +3877,7 @@ void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget,
|
|||
}
|
||||
|
||||
RegExpNode* RegExpCompiler::OptionallyStepBackToLeadSurrogate(
|
||||
RegExpNode* on_success, JSRegExp::Flags flags) {
|
||||
RegExpNode* on_success) {
|
||||
DCHECK(!read_backward());
|
||||
ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
|
||||
zone(), CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
|
||||
|
@ -3850,11 +3889,11 @@ RegExpNode* RegExpCompiler::OptionallyStepBackToLeadSurrogate(
|
|||
int stack_register = UnicodeLookaroundStackRegister();
|
||||
int position_register = UnicodeLookaroundPositionRegister();
|
||||
RegExpNode* step_back = TextNode::CreateForCharacterRanges(
|
||||
zone(), lead_surrogates, true, on_success, flags);
|
||||
zone(), lead_surrogates, true, on_success);
|
||||
RegExpLookaround::Builder builder(true, step_back, stack_register,
|
||||
position_register);
|
||||
RegExpNode* match_trail = TextNode::CreateForCharacterRanges(
|
||||
zone(), trail_surrogates, false, builder.on_match_success(), flags);
|
||||
zone(), trail_surrogates, false, builder.on_match_success());
|
||||
|
||||
optional_step_back->AddAlternative(
|
||||
GuardedAlternative(builder.ForMatch(match_trail)));
|
||||
|
@ -3864,7 +3903,7 @@ RegExpNode* RegExpCompiler::OptionallyStepBackToLeadSurrogate(
|
|||
}
|
||||
|
||||
RegExpNode* RegExpCompiler::PreprocessRegExp(RegExpCompileData* data,
|
||||
JSRegExp::Flags flags,
|
||||
RegExpFlags flags,
|
||||
bool is_one_byte) {
|
||||
// Wrap the body of the regexp in capture #0.
|
||||
RegExpNode* captured_body =
|
||||
|
@ -3873,11 +3912,10 @@ RegExpNode* RegExpCompiler::PreprocessRegExp(RegExpCompileData* data,
|
|||
if (!data->tree->IsAnchoredAtStart() && !IsSticky(flags)) {
|
||||
// Add a .*? at the beginning, outside the body capture, unless
|
||||
// this expression is anchored at the beginning or sticky.
|
||||
JSRegExp::Flags default_flags = JSRegExp::Flags();
|
||||
RegExpNode* loop_node = RegExpQuantifier::ToNode(
|
||||
0, RegExpTree::kInfinity, false,
|
||||
zone()->New<RegExpCharacterClass>('*', default_flags), this,
|
||||
captured_body, data->contains_anchor);
|
||||
zone()->New<RegExpCharacterClass>(StandardCharacterSet::kEverything),
|
||||
this, captured_body, data->contains_anchor);
|
||||
|
||||
if (data->contains_anchor) {
|
||||
// Unroll loop once, to take care of the case that might start
|
||||
|
@ -3885,27 +3923,33 @@ RegExpNode* RegExpCompiler::PreprocessRegExp(RegExpCompileData* data,
|
|||
ChoiceNode* first_step_node = zone()->New<ChoiceNode>(2, zone());
|
||||
first_step_node->AddAlternative(GuardedAlternative(captured_body));
|
||||
first_step_node->AddAlternative(GuardedAlternative(zone()->New<TextNode>(
|
||||
zone()->New<RegExpCharacterClass>('*', default_flags), false,
|
||||
loop_node)));
|
||||
zone()->New<RegExpCharacterClass>(StandardCharacterSet::kEverything),
|
||||
false, loop_node)));
|
||||
node = first_step_node;
|
||||
} else {
|
||||
node = loop_node;
|
||||
}
|
||||
}
|
||||
if (is_one_byte) {
|
||||
node = node->FilterOneByte(RegExpCompiler::kMaxRecursion);
|
||||
node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, flags);
|
||||
// Do it again to propagate the new nodes to places where they were not
|
||||
// put because they had not been calculated yet.
|
||||
if (node != nullptr) {
|
||||
node = node->FilterOneByte(RegExpCompiler::kMaxRecursion);
|
||||
node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, flags);
|
||||
}
|
||||
} else if (IsUnicode(flags) && (IsGlobal(flags) || IsSticky(flags))) {
|
||||
node = OptionallyStepBackToLeadSurrogate(node, flags);
|
||||
node = OptionallyStepBackToLeadSurrogate(node);
|
||||
}
|
||||
|
||||
if (node == nullptr) node = zone()->New<EndNode>(EndNode::BACKTRACK, zone());
|
||||
return node;
|
||||
}
|
||||
|
||||
void RegExpCompiler::ToNodeCheckForStackOverflow() {
|
||||
if (StackLimitCheck{isolate()}.HasOverflowed()) {
|
||||
V8::FatalProcessOutOfMemory(isolate(), "RegExpCompiler");
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace internal
|
||||
} // namespace v8
|
||||
|
|
|
@ -20,7 +20,7 @@ namespace regexp_compiler_constants {
|
|||
// The '2' variant has inclusive from and exclusive to.
|
||||
// This covers \s as defined in ECMA-262 5.1, 15.10.2.12,
|
||||
// which include WhiteSpace (7.2) or LineTerminator (7.3) values.
|
||||
constexpr uc32 kRangeEndMarker = 0x110000;
|
||||
constexpr base::uc32 kRangeEndMarker = 0x110000;
|
||||
constexpr int kSpaceRanges[] = {
|
||||
'\t', '\r' + 1, ' ', ' ' + 1, 0x00A0, 0x00A1, 0x1680,
|
||||
0x1681, 0x2000, 0x200B, 0x2028, 0x202A, 0x202F, 0x2030,
|
||||
|
@ -47,34 +47,10 @@ constexpr int kPatternTooShortForBoyerMoore = 2;
|
|||
|
||||
} // namespace regexp_compiler_constants
|
||||
|
||||
inline bool IgnoreCase(JSRegExp::Flags flags) {
|
||||
return (flags & JSRegExp::kIgnoreCase) != 0;
|
||||
}
|
||||
|
||||
inline bool IsUnicode(JSRegExp::Flags flags) {
|
||||
return (flags & JSRegExp::kUnicode) != 0;
|
||||
}
|
||||
|
||||
inline bool IsSticky(JSRegExp::Flags flags) {
|
||||
return (flags & JSRegExp::kSticky) != 0;
|
||||
}
|
||||
|
||||
inline bool IsGlobal(JSRegExp::Flags flags) {
|
||||
return (flags & JSRegExp::kGlobal) != 0;
|
||||
}
|
||||
|
||||
inline bool DotAll(JSRegExp::Flags flags) {
|
||||
return (flags & JSRegExp::kDotAll) != 0;
|
||||
}
|
||||
|
||||
inline bool Multiline(JSRegExp::Flags flags) {
|
||||
return (flags & JSRegExp::kMultiline) != 0;
|
||||
}
|
||||
|
||||
inline bool NeedsUnicodeCaseEquivalents(JSRegExp::Flags flags) {
|
||||
inline bool NeedsUnicodeCaseEquivalents(RegExpFlags flags) {
|
||||
// Both unicode and ignore_case flags are set. We need to use ICU to find
|
||||
// the closure over case equivalents.
|
||||
return IsUnicode(flags) && IgnoreCase(flags);
|
||||
return IsUnicode(flags) && IsIgnoreCase(flags);
|
||||
}
|
||||
|
||||
// Details of a quick mask-compare check that can look ahead in the
|
||||
|
@ -95,8 +71,8 @@ class QuickCheckDetails {
|
|||
void set_cannot_match() { cannot_match_ = true; }
|
||||
struct Position {
|
||||
Position() : mask(0), value(0), determines_perfectly(false) {}
|
||||
uc32 mask;
|
||||
uc32 value;
|
||||
base::uc32 mask;
|
||||
base::uc32 value;
|
||||
bool determines_perfectly;
|
||||
};
|
||||
int characters() { return characters_; }
|
||||
|
@ -422,7 +398,8 @@ struct PreloadState {
|
|||
// Analysis performs assertion propagation and computes eats_at_least_ values.
|
||||
// See the comments on AssertionPropagator and EatsAtLeastPropagator for more
|
||||
// details.
|
||||
RegExpError AnalyzeRegExp(Isolate* isolate, bool is_one_byte, RegExpNode* node);
|
||||
RegExpError AnalyzeRegExp(Isolate* isolate, bool is_one_byte, RegExpFlags flags,
|
||||
RegExpNode* node);
|
||||
|
||||
class FrequencyCollator {
|
||||
public:
|
||||
|
@ -472,7 +449,7 @@ class FrequencyCollator {
|
|||
class RegExpCompiler {
|
||||
public:
|
||||
RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
|
||||
bool is_one_byte);
|
||||
RegExpFlags flags, bool is_one_byte);
|
||||
|
||||
int AllocateRegister() {
|
||||
if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
|
||||
|
@ -524,13 +501,12 @@ class RegExpCompiler {
|
|||
// - Inserting the implicit .* before/after the regexp if necessary.
|
||||
// - If the input is a one-byte string, filtering out nodes that can't match.
|
||||
// - Fixing up regexp matches that start within a surrogate pair.
|
||||
RegExpNode* PreprocessRegExp(RegExpCompileData* data, JSRegExp::Flags flags,
|
||||
RegExpNode* PreprocessRegExp(RegExpCompileData* data, RegExpFlags flags,
|
||||
bool is_one_byte);
|
||||
|
||||
// If the regexp matching starts within a surrogate pair, step back to the
|
||||
// lead surrogate and start matching from there.
|
||||
RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpNode* on_success,
|
||||
JSRegExp::Flags flags);
|
||||
RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpNode* on_success);
|
||||
|
||||
inline void AddWork(RegExpNode* node) {
|
||||
if (!node->on_work_list() && !node->label()->is_bound()) {
|
||||
|
@ -551,6 +527,8 @@ class RegExpCompiler {
|
|||
inline void IncrementRecursionDepth() { recursion_depth_++; }
|
||||
inline void DecrementRecursionDepth() { recursion_depth_--; }
|
||||
|
||||
RegExpFlags flags() const { return flags_; }
|
||||
|
||||
void SetRegExpTooBig() { reg_exp_too_big_ = true; }
|
||||
|
||||
inline bool one_byte() { return one_byte_; }
|
||||
|
@ -569,6 +547,18 @@ class RegExpCompiler {
|
|||
current_expansion_factor_ = value;
|
||||
}
|
||||
|
||||
// The recursive nature of ToNode node generation means we may run into stack
|
||||
// overflow issues. We introduce periodic checks to detect these, and the
|
||||
// tick counter helps limit overhead of these checks.
|
||||
// TODO(jgruber): This is super hacky and should be replaced by an abort
|
||||
// mechanism or iterative node generation.
|
||||
void ToNodeMaybeCheckForStackOverflow() {
|
||||
if ((to_node_overflow_check_ticks_++ % 16 == 0)) {
|
||||
ToNodeCheckForStackOverflow();
|
||||
}
|
||||
}
|
||||
void ToNodeCheckForStackOverflow();
|
||||
|
||||
Isolate* isolate() const { return isolate_; }
|
||||
Zone* zone() const { return zone_; }
|
||||
|
||||
|
@ -581,10 +571,12 @@ class RegExpCompiler {
|
|||
int unicode_lookaround_position_register_;
|
||||
ZoneVector<RegExpNode*>* work_list_;
|
||||
int recursion_depth_;
|
||||
const RegExpFlags flags_;
|
||||
RegExpMacroAssembler* macro_assembler_;
|
||||
bool one_byte_;
|
||||
bool reg_exp_too_big_;
|
||||
bool limiting_recursion_;
|
||||
int to_node_overflow_check_ticks_ = 0;
|
||||
bool optimize_;
|
||||
bool read_backward_;
|
||||
int current_expansion_factor_;
|
||||
|
|
|
@ -127,9 +127,9 @@ void DotPrinterImpl::VisitText(TextNode* that) {
|
|||
TextElement elm = that->elements()->at(i);
|
||||
switch (elm.text_type()) {
|
||||
case TextElement::ATOM: {
|
||||
Vector<const uc16> data = elm.atom()->data();
|
||||
for (int i = 0; i < data.length(); i++) {
|
||||
os_ << static_cast<char>(data[i]);
|
||||
base::Vector<const base::uc16> data = elm.atom()->data();
|
||||
for (int j = 0; j < data.length(); j++) {
|
||||
os_ << static_cast<char>(data[j]);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
|
|
@ -52,6 +52,11 @@ enum class RegExpError : uint32_t {
|
|||
|
||||
V8_EXPORT_PRIVATE const char* RegExpErrorString(RegExpError error);
|
||||
|
||||
// Returns true iff |error| is one of the two stack overflow error kinds
// (kStackOverflow or kAnalysisStackOverflow).
inline constexpr bool RegExpErrorIsStackOverflow(RegExpError error) {
  switch (error) {
    case RegExpError::kStackOverflow:
    case RegExpError::kAnalysisStackOverflow:
      return true;
    default:
      return false;
  }
}
|
||||
|
||||
} // namespace internal
|
||||
} // namespace v8
|
||||
|
||||
|
|
|
@ -28,12 +28,13 @@ namespace internal {
|
|||
namespace {
|
||||
|
||||
bool BackRefMatchesNoCase(Isolate* isolate, int from, int current, int len,
|
||||
Vector<const uc16> subject, bool unicode) {
|
||||
base::Vector<const base::uc16> subject,
|
||||
bool unicode) {
|
||||
Address offset_a =
|
||||
reinterpret_cast<Address>(const_cast<uc16*>(&subject.at(from)));
|
||||
reinterpret_cast<Address>(const_cast<base::uc16*>(&subject.at(from)));
|
||||
Address offset_b =
|
||||
reinterpret_cast<Address>(const_cast<uc16*>(&subject.at(current)));
|
||||
size_t length = len * kUC16Size;
|
||||
reinterpret_cast<Address>(const_cast<base::uc16*>(&subject.at(current)));
|
||||
size_t length = len * base::kUC16Size;
|
||||
|
||||
bool result = unicode
|
||||
? RegExpMacroAssembler::CaseInsensitiveCompareUnicode(
|
||||
|
@ -44,7 +45,7 @@ bool BackRefMatchesNoCase(Isolate* isolate, int from, int current, int len,
|
|||
}
|
||||
|
||||
bool BackRefMatchesNoCase(Isolate* isolate, int from, int current, int len,
|
||||
Vector<const uint8_t> subject, bool unicode) {
|
||||
base::Vector<const uint8_t> subject, bool unicode) {
|
||||
// For Latin1 characters the unicode flag makes no difference.
|
||||
for (int i = 0; i < len; i++) {
|
||||
unsigned int old_char = subject[from++];
|
||||
|
@ -170,7 +171,7 @@ class InterpreterRegisters {
|
|||
output_register_count_(output_register_count) {
|
||||
// TODO(jgruber): Use int32_t consistently for registers. Currently, CSA
|
||||
// uses int32_t while runtime uses int.
|
||||
STATIC_ASSERT(sizeof(int) == sizeof(int32_t));
|
||||
static_assert(sizeof(int) == sizeof(int32_t));
|
||||
DCHECK_GE(output_register_count, 2); // At least 2 for the match itself.
|
||||
DCHECK_GE(total_register_count, output_register_count);
|
||||
DCHECK_LE(total_register_count, RegExpMacroAssembler::kMaxRegisterCount);
|
||||
|
@ -222,7 +223,7 @@ void UpdateCodeAndSubjectReferences(
|
|||
Isolate* isolate, Handle<ByteArray> code_array,
|
||||
Handle<String> subject_string, ByteArray* code_array_out,
|
||||
const byte** code_base_out, const byte** pc_out, String* subject_string_out,
|
||||
Vector<const Char>* subject_string_vector_out) {
|
||||
base::Vector<const Char>* subject_string_vector_out) {
|
||||
DisallowGarbageCollection no_gc;
|
||||
|
||||
if (*code_base_out != code_array->GetDataStartAddress()) {
|
||||
|
@ -244,7 +245,7 @@ template <typename Char>
|
|||
IrregexpInterpreter::Result HandleInterrupts(
|
||||
Isolate* isolate, RegExp::CallOrigin call_origin, ByteArray* code_array_out,
|
||||
String* subject_string_out, const byte** code_base_out,
|
||||
Vector<const Char>* subject_string_vector_out, const byte** pc_out) {
|
||||
base::Vector<const Char>* subject_string_vector_out, const byte** pc_out) {
|
||||
DisallowGarbageCollection no_gc;
|
||||
|
||||
StackLimitCheck check(isolate);
|
||||
|
@ -282,8 +283,8 @@ IrregexpInterpreter::Result HandleInterrupts(
|
|||
return IrregexpInterpreter::EXCEPTION;
|
||||
}
|
||||
|
||||
// If we changed between a LATIN1 and a UC16 string, we need to restart
|
||||
// regexp matching with the appropriate template instantiation of
|
||||
// If we changed between a LATIN1 and a UC16 string, we need to
|
||||
// restart regexp matching with the appropriate template instantiation of
|
||||
// RawMatch.
|
||||
if (String::IsOneByteRepresentationUnderneath(*subject_handle) !=
|
||||
was_one_byte) {
|
||||
|
@ -373,7 +374,7 @@ bool IndexIsInBounds(int index, int length) {
|
|||
template <typename Char>
|
||||
IrregexpInterpreter::Result RawMatch(
|
||||
Isolate* isolate, ByteArray code_array, String subject_string,
|
||||
Vector<const Char> subject, int* output_registers,
|
||||
base::Vector<const Char> subject, int* output_registers,
|
||||
int output_register_count, int total_register_count, int current,
|
||||
uint32_t current_char, RegExp::CallOrigin call_origin,
|
||||
const uint32_t backtrack_limit) {
|
||||
|
@ -414,8 +415,8 @@ IrregexpInterpreter::Result RawMatch(
|
|||
base::bits::RoundUpToPowerOfTwo32(kRegExpBytecodeCount));
|
||||
|
||||
// Make sure every bytecode we get by using BYTECODE_MASK is well defined.
|
||||
STATIC_ASSERT(kRegExpBytecodeCount <= kRegExpPaddedBytecodeCount);
|
||||
STATIC_ASSERT(kRegExpBytecodeCount + kRegExpBytecodeFillerCount ==
|
||||
static_assert(kRegExpBytecodeCount <= kRegExpPaddedBytecodeCount);
|
||||
static_assert(kRegExpBytecodeCount + kRegExpBytecodeFillerCount ==
|
||||
kRegExpPaddedBytecodeCount);
|
||||
|
||||
#define DECLARE_DISPATCH_TABLE_ENTRY(name, ...) &&BC_##name,
|
||||
|
@ -512,7 +513,7 @@ IrregexpInterpreter::Result RawMatch(
|
|||
DISPATCH();
|
||||
}
|
||||
BYTECODE(POP_BT) {
|
||||
STATIC_ASSERT(JSRegExp::kNoBacktrackLimit == 0);
|
||||
static_assert(JSRegExp::kNoBacktrackLimit == 0);
|
||||
if (++backtrack_count == backtrack_limit) {
|
||||
int return_code = LoadPacked24Signed(insn);
|
||||
return static_cast<IrregexpInterpreter::Result>(return_code);
|
||||
|
@ -1050,12 +1051,12 @@ IrregexpInterpreter::Result IrregexpInterpreter::Match(
|
|||
if (FLAG_regexp_tier_up) regexp.TierUpTick();
|
||||
|
||||
bool is_one_byte = String::IsOneByteRepresentationUnderneath(subject_string);
|
||||
ByteArray code_array = ByteArray::cast(regexp.Bytecode(is_one_byte));
|
||||
int total_register_count = regexp.MaxRegisterCount();
|
||||
ByteArray code_array = ByteArray::cast(regexp.bytecode(is_one_byte));
|
||||
int total_register_count = regexp.max_register_count();
|
||||
|
||||
return MatchInternal(isolate, code_array, subject_string, output_registers,
|
||||
output_register_count, total_register_count,
|
||||
start_position, call_origin, regexp.BacktrackLimit());
|
||||
start_position, call_origin, regexp.backtrack_limit());
|
||||
}
|
||||
|
||||
IrregexpInterpreter::Result IrregexpInterpreter::MatchInternal(
|
||||
|
@ -1065,6 +1066,9 @@ IrregexpInterpreter::Result IrregexpInterpreter::MatchInternal(
|
|||
uint32_t backtrack_limit) {
|
||||
DCHECK(subject_string.IsFlat());
|
||||
|
||||
// TODO(chromium:1262676): Remove this CHECK once fixed.
|
||||
CHECK(code_array.IsByteArray());
|
||||
|
||||
// Note: Heap allocation *is* allowed in two situations if calling from
|
||||
// Runtime:
|
||||
// 1. When creating & throwing a stack overflow exception. The interpreter
|
||||
|
@ -1073,10 +1077,15 @@ IrregexpInterpreter::Result IrregexpInterpreter::MatchInternal(
|
|||
// after interrupts have run.
|
||||
DisallowGarbageCollection no_gc;
|
||||
|
||||
uc16 previous_char = '\n';
|
||||
base::uc16 previous_char = '\n';
|
||||
String::FlatContent subject_content = subject_string.GetFlatContent(no_gc);
|
||||
// Because interrupts can result in GC and string content relocation, the
|
||||
// checksum verification in FlatContent may fail even though this code is
|
||||
// safe. See (2) above.
|
||||
subject_content.UnsafeDisableChecksumVerification();
|
||||
if (subject_content.IsOneByte()) {
|
||||
Vector<const uint8_t> subject_vector = subject_content.ToOneByteVector();
|
||||
base::Vector<const uint8_t> subject_vector =
|
||||
subject_content.ToOneByteVector();
|
||||
if (start_position != 0) previous_char = subject_vector[start_position - 1];
|
||||
return RawMatch(isolate, code_array, subject_string, subject_vector,
|
||||
output_registers, output_register_count,
|
||||
|
@ -1084,7 +1093,8 @@ IrregexpInterpreter::Result IrregexpInterpreter::MatchInternal(
|
|||
call_origin, backtrack_limit);
|
||||
} else {
|
||||
DCHECK(subject_content.IsTwoByte());
|
||||
Vector<const uc16> subject_vector = subject_content.ToUC16Vector();
|
||||
base::Vector<const base::uc16> subject_vector =
|
||||
subject_content.ToUC16Vector();
|
||||
if (start_position != 0) previous_char = subject_vector[start_position - 1];
|
||||
return RawMatch(isolate, code_array, subject_string, subject_vector,
|
||||
output_registers, output_register_count,
|
||||
|
@ -1099,7 +1109,7 @@ IrregexpInterpreter::Result IrregexpInterpreter::MatchInternal(
|
|||
// builtin.
|
||||
IrregexpInterpreter::Result IrregexpInterpreter::MatchForCallFromJs(
|
||||
Address subject, int32_t start_position, Address, Address,
|
||||
int* output_registers, int32_t output_register_count, Address,
|
||||
int* output_registers, int32_t output_register_count,
|
||||
RegExp::CallOrigin call_origin, Isolate* isolate, Address regexp) {
|
||||
DCHECK_NOT_NULL(isolate);
|
||||
DCHECK_NOT_NULL(output_registers);
|
||||
|
|
|
@ -12,6 +12,8 @@
|
|||
namespace v8 {
|
||||
namespace internal {
|
||||
|
||||
class ByteArray;
|
||||
|
||||
class V8_EXPORT_PRIVATE IrregexpInterpreter : public AllStatic {
|
||||
public:
|
||||
enum Result {
|
||||
|
@ -34,9 +36,8 @@ class V8_EXPORT_PRIVATE IrregexpInterpreter : public AllStatic {
|
|||
// RETRY is returned if a retry through the runtime is needed (e.g. when
|
||||
// interrupts have been scheduled or the regexp is marked for tier-up).
|
||||
//
|
||||
// Arguments input_start, input_end and backtrack_stack are
|
||||
// unused. They are only passed to match the signature of the native irregex
|
||||
// code.
|
||||
// Arguments input_start and input_end are unused. They are only passed to
|
||||
// match the signature of the native irregexp code.
|
||||
//
|
||||
// Arguments output_registers and output_register_count describe the results
|
||||
// array, which will contain register values of all captures if SUCCESS is
|
||||
|
@ -45,7 +46,6 @@ class V8_EXPORT_PRIVATE IrregexpInterpreter : public AllStatic {
|
|||
Address input_start, Address input_end,
|
||||
int* output_registers,
|
||||
int32_t output_register_count,
|
||||
Address backtrack_stack,
|
||||
RegExp::CallOrigin call_origin,
|
||||
Isolate* isolate, Address regexp);
|
||||
|
||||
|
|
|
@ -170,9 +170,11 @@ void RegExpMacroAssemblerTracer::LoadCurrentCharacterImpl(
|
|||
characters, eats_at_least);
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
class PrintablePrinter {
|
||||
public:
|
||||
explicit PrintablePrinter(uc16 character) : character_(character) { }
|
||||
explicit PrintablePrinter(base::uc16 character) : character_(character) {}
|
||||
|
||||
const char* operator*() {
|
||||
if (character_ >= ' ' && character_ <= '~') {
|
||||
|
@ -187,12 +189,14 @@ class PrintablePrinter {
|
|||
}
|
||||
|
||||
private:
|
||||
uc16 character_;
|
||||
base::uc16 character_;
|
||||
char buffer_[4];
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
void RegExpMacroAssemblerTracer::CheckCharacterLT(uc16 limit, Label* on_less) {
|
||||
void RegExpMacroAssemblerTracer::CheckCharacterLT(base::uc16 limit,
|
||||
Label* on_less) {
|
||||
PrintablePrinter printable(limit);
|
||||
PrintF(" CheckCharacterLT(c=0x%04x%s, label[%08x]);\n",
|
||||
limit,
|
||||
|
@ -201,8 +205,7 @@ void RegExpMacroAssemblerTracer::CheckCharacterLT(uc16 limit, Label* on_less) {
|
|||
assembler_->CheckCharacterLT(limit, on_less);
|
||||
}
|
||||
|
||||
|
||||
void RegExpMacroAssemblerTracer::CheckCharacterGT(uc16 limit,
|
||||
void RegExpMacroAssemblerTracer::CheckCharacterGT(base::uc16 limit,
|
||||
Label* on_greater) {
|
||||
PrintablePrinter printable(limit);
|
||||
PrintF(" CheckCharacterGT(c=0x%04x%s, label[%08x]);\n",
|
||||
|
@ -212,7 +215,6 @@ void RegExpMacroAssemblerTracer::CheckCharacterGT(uc16 limit,
|
|||
assembler_->CheckCharacterGT(limit, on_greater);
|
||||
}
|
||||
|
||||
|
||||
void RegExpMacroAssemblerTracer::CheckCharacter(unsigned c, Label* on_equal) {
|
||||
PrintablePrinter printable(c);
|
||||
PrintF(" CheckCharacter(c=0x%04x%s, label[%08x]);\n",
|
||||
|
@ -275,12 +277,8 @@ void RegExpMacroAssemblerTracer::CheckNotCharacterAfterAnd(
|
|||
assembler_->CheckNotCharacterAfterAnd(c, mask, on_not_equal);
|
||||
}
|
||||
|
||||
|
||||
void RegExpMacroAssemblerTracer::CheckNotCharacterAfterMinusAnd(
|
||||
uc16 c,
|
||||
uc16 minus,
|
||||
uc16 mask,
|
||||
Label* on_not_equal) {
|
||||
base::uc16 c, base::uc16 minus, base::uc16 mask, Label* on_not_equal) {
|
||||
PrintF(" CheckNotCharacterAfterMinusAnd(c=0x%04x, minus=%04x, mask=0x%04x, "
|
||||
"label[%08x]);\n",
|
||||
c,
|
||||
|
@ -290,11 +288,9 @@ void RegExpMacroAssemblerTracer::CheckNotCharacterAfterMinusAnd(
|
|||
assembler_->CheckNotCharacterAfterMinusAnd(c, minus, mask, on_not_equal);
|
||||
}
|
||||
|
||||
|
||||
void RegExpMacroAssemblerTracer::CheckCharacterInRange(
|
||||
uc16 from,
|
||||
uc16 to,
|
||||
Label* on_not_in_range) {
|
||||
void RegExpMacroAssemblerTracer::CheckCharacterInRange(base::uc16 from,
|
||||
base::uc16 to,
|
||||
Label* on_not_in_range) {
|
||||
PrintablePrinter printable_from(from);
|
||||
PrintablePrinter printable_to(to);
|
||||
PrintF(" CheckCharacterInRange(from=0x%04x%s, to=0x%04x%s, label[%08x]);\n",
|
||||
|
@ -306,11 +302,9 @@ void RegExpMacroAssemblerTracer::CheckCharacterInRange(
|
|||
assembler_->CheckCharacterInRange(from, to, on_not_in_range);
|
||||
}
|
||||
|
||||
|
||||
void RegExpMacroAssemblerTracer::CheckCharacterNotInRange(
|
||||
uc16 from,
|
||||
uc16 to,
|
||||
Label* on_in_range) {
|
||||
void RegExpMacroAssemblerTracer::CheckCharacterNotInRange(base::uc16 from,
|
||||
base::uc16 to,
|
||||
Label* on_in_range) {
|
||||
PrintablePrinter printable_from(from);
|
||||
PrintablePrinter printable_to(to);
|
||||
PrintF(
|
||||
|
@ -323,6 +317,40 @@ void RegExpMacroAssemblerTracer::CheckCharacterNotInRange(
|
|||
assembler_->CheckCharacterNotInRange(from, to, on_in_range);
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
void PrintRangeArray(const ZoneList<CharacterRange>* ranges) {
|
||||
for (int i = 0; i < ranges->length(); i++) {
|
||||
base::uc16 from = ranges->at(i).from();
|
||||
base::uc16 to = ranges->at(i).to();
|
||||
PrintablePrinter printable_from(from);
|
||||
PrintablePrinter printable_to(to);
|
||||
PrintF(" [from=0x%04x%s, to=%04x%s],\n", from, *printable_from, to,
|
||||
*printable_to);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
bool RegExpMacroAssemblerTracer::CheckCharacterInRangeArray(
|
||||
const ZoneList<CharacterRange>* ranges, Label* on_in_range) {
|
||||
PrintF(
|
||||
" CheckCharacterInRangeArray(\n"
|
||||
" label[%08x]);\n",
|
||||
LabelToInt(on_in_range));
|
||||
PrintRangeArray(ranges);
|
||||
return assembler_->CheckCharacterInRangeArray(ranges, on_in_range);
|
||||
}
|
||||
|
||||
bool RegExpMacroAssemblerTracer::CheckCharacterNotInRangeArray(
|
||||
const ZoneList<CharacterRange>* ranges, Label* on_not_in_range) {
|
||||
PrintF(
|
||||
" CheckCharacterNotInRangeArray(\n"
|
||||
" label[%08x]);\n",
|
||||
LabelToInt(on_not_in_range));
|
||||
PrintRangeArray(ranges);
|
||||
return assembler_->CheckCharacterNotInRangeArray(ranges, on_not_in_range);
|
||||
}
|
||||
|
||||
void RegExpMacroAssemblerTracer::CheckBitInTable(
|
||||
Handle<ByteArray> table, Label* on_bit_set) {
|
||||
|
@ -362,20 +390,16 @@ void RegExpMacroAssemblerTracer::CheckPosition(int cp_offset,
|
|||
assembler_->CheckPosition(cp_offset, on_outside_input);
|
||||
}
|
||||
|
||||
|
||||
bool RegExpMacroAssemblerTracer::CheckSpecialCharacterClass(
|
||||
uc16 type,
|
||||
Label* on_no_match) {
|
||||
StandardCharacterSet type, Label* on_no_match) {
|
||||
bool supported = assembler_->CheckSpecialCharacterClass(type,
|
||||
on_no_match);
|
||||
PrintF(" CheckSpecialCharacterClass(type='%c', label[%08x]): %s;\n",
|
||||
type,
|
||||
LabelToInt(on_no_match),
|
||||
static_cast<char>(type), LabelToInt(on_no_match),
|
||||
supported ? "true" : "false");
|
||||
return supported;
|
||||
}
|
||||
|
||||
|
||||
void RegExpMacroAssemblerTracer::IfRegisterLT(int register_index,
|
||||
int comparand, Label* if_lt) {
|
||||
PrintF(" IfRegisterLT(register=%d, number=%d, label[%08x]);\n",
|
||||
|
|
|
@ -17,7 +17,9 @@ class RegExpMacroAssemblerTracer: public RegExpMacroAssembler {
|
|||
~RegExpMacroAssemblerTracer() override;
|
||||
void AbortedCodeGeneration() override;
|
||||
int stack_limit_slack() override { return assembler_->stack_limit_slack(); }
|
||||
bool CanReadUnaligned() override { return assembler_->CanReadUnaligned(); }
|
||||
bool CanReadUnaligned() const override {
|
||||
return assembler_->CanReadUnaligned();
|
||||
}
|
||||
void AdvanceCurrentPosition(int by) override; // Signed cp change.
|
||||
void AdvanceRegister(int reg, int by) override; // r[reg] += by.
|
||||
void Backtrack() override;
|
||||
|
@ -25,8 +27,8 @@ class RegExpMacroAssemblerTracer: public RegExpMacroAssembler {
|
|||
void CheckCharacter(unsigned c, Label* on_equal) override;
|
||||
void CheckCharacterAfterAnd(unsigned c, unsigned and_with,
|
||||
Label* on_equal) override;
|
||||
void CheckCharacterGT(uc16 limit, Label* on_greater) override;
|
||||
void CheckCharacterLT(uc16 limit, Label* on_less) override;
|
||||
void CheckCharacterGT(base::uc16 limit, Label* on_greater) override;
|
||||
void CheckCharacterLT(base::uc16 limit, Label* on_less) override;
|
||||
void CheckGreedyLoop(Label* on_tos_equals_current_position) override;
|
||||
void CheckAtStart(int cp_offset, Label* on_at_start) override;
|
||||
void CheckNotAtStart(int cp_offset, Label* on_not_at_start) override;
|
||||
|
@ -38,14 +40,21 @@ class RegExpMacroAssemblerTracer: public RegExpMacroAssembler {
|
|||
void CheckNotCharacter(unsigned c, Label* on_not_equal) override;
|
||||
void CheckNotCharacterAfterAnd(unsigned c, unsigned and_with,
|
||||
Label* on_not_equal) override;
|
||||
void CheckNotCharacterAfterMinusAnd(uc16 c, uc16 minus, uc16 and_with,
|
||||
void CheckNotCharacterAfterMinusAnd(base::uc16 c, base::uc16 minus,
|
||||
base::uc16 and_with,
|
||||
Label* on_not_equal) override;
|
||||
void CheckCharacterInRange(uc16 from, uc16 to, Label* on_in_range) override;
|
||||
void CheckCharacterNotInRange(uc16 from, uc16 to,
|
||||
void CheckCharacterInRange(base::uc16 from, base::uc16 to,
|
||||
Label* on_in_range) override;
|
||||
void CheckCharacterNotInRange(base::uc16 from, base::uc16 to,
|
||||
Label* on_not_in_range) override;
|
||||
bool CheckCharacterInRangeArray(const ZoneList<CharacterRange>* ranges,
|
||||
Label* on_in_range) override;
|
||||
bool CheckCharacterNotInRangeArray(const ZoneList<CharacterRange>* ranges,
|
||||
Label* on_not_in_range) override;
|
||||
void CheckBitInTable(Handle<ByteArray> table, Label* on_bit_set) override;
|
||||
void CheckPosition(int cp_offset, Label* on_outside_input) override;
|
||||
bool CheckSpecialCharacterClass(uc16 type, Label* on_no_match) override;
|
||||
bool CheckSpecialCharacterClass(StandardCharacterSet type,
|
||||
Label* on_no_match) override;
|
||||
void Fail() override;
|
||||
Handle<HeapObject> GetCode(Handle<String> source) override;
|
||||
void GoTo(Label* label) override;
|
||||
|
|
|
@ -17,12 +17,16 @@ namespace internal {
|
|||
|
||||
// Initializes the assembler with slow-safe compilation disabled, no backtrack
// limit (JSRegExp::kNoBacktrackLimit) and NOT_GLOBAL mode, and records the
// isolate and zone it was created with.
RegExpMacroAssembler::RegExpMacroAssembler(Isolate* isolate, Zone* zone)
|
||||
: slow_safe_compiler_(false),
|
||||
backtrack_limit_(JSRegExp::kNoBacktrackLimit),
|
||||
global_mode_(NOT_GLOBAL),
|
||||
isolate_(isolate),
|
||||
zone_(zone) {}
|
||||
|
||||
RegExpMacroAssembler::~RegExpMacroAssembler() = default;
|
||||
bool RegExpMacroAssembler::has_backtrack_limit() const {
|
||||
return backtrack_limit_ != JSRegExp::kNoBacktrackLimit;
|
||||
}
|
||||
|
||||
// static
|
||||
int RegExpMacroAssembler::CaseInsensitiveCompareNonUnicode(Address byte_offset1,
|
||||
Address byte_offset2,
|
||||
size_t byte_length,
|
||||
|
@ -34,8 +38,8 @@ int RegExpMacroAssembler::CaseInsensitiveCompareNonUnicode(Address byte_offset1,
|
|||
DisallowGarbageCollection no_gc;
|
||||
DCHECK_EQ(0, byte_length % 2);
|
||||
size_t length = byte_length / 2;
|
||||
uc16* substring1 = reinterpret_cast<uc16*>(byte_offset1);
|
||||
uc16* substring2 = reinterpret_cast<uc16*>(byte_offset2);
|
||||
base::uc16* substring1 = reinterpret_cast<base::uc16*>(byte_offset1);
|
||||
base::uc16* substring2 = reinterpret_cast<base::uc16*>(byte_offset2);
|
||||
|
||||
for (size_t i = 0; i < length; i++) {
|
||||
UChar32 c1 = RegExpCaseFolding::Canonicalize(substring1[i]);
|
||||
|
@ -51,6 +55,7 @@ int RegExpMacroAssembler::CaseInsensitiveCompareNonUnicode(Address byte_offset1,
|
|||
#endif
|
||||
}
|
||||
|
||||
// static
|
||||
int RegExpMacroAssembler::CaseInsensitiveCompareUnicode(Address byte_offset1,
|
||||
Address byte_offset2,
|
||||
size_t byte_length,
|
||||
|
@ -68,8 +73,8 @@ int RegExpMacroAssembler::CaseInsensitiveCompareUnicode(Address byte_offset1,
|
|||
return uni_str_1.caseCompare(reinterpret_cast<const char16_t*>(byte_offset2),
|
||||
length, U_FOLD_CASE_DEFAULT) == 0;
|
||||
#else
|
||||
uc16* substring1 = reinterpret_cast<uc16*>(byte_offset1);
|
||||
uc16* substring2 = reinterpret_cast<uc16*>(byte_offset2);
|
||||
base::uc16* substring1 = reinterpret_cast<base::uc16*>(byte_offset1);
|
||||
base::uc16* substring2 = reinterpret_cast<base::uc16*>(byte_offset2);
|
||||
size_t length = byte_length >> 1;
|
||||
DCHECK_NOT_NULL(isolate);
|
||||
unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
|
||||
|
@ -93,6 +98,130 @@ int RegExpMacroAssembler::CaseInsensitiveCompareUnicode(Address byte_offset1,
|
|||
#endif // V8_INTL_SUPPORT
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
uint32_t Hash(const ZoneList<CharacterRange>* ranges) {
|
||||
size_t seed = 0;
|
||||
for (int i = 0; i < ranges->length(); i++) {
|
||||
const CharacterRange& r = ranges->at(i);
|
||||
seed = base::hash_combine(seed, r.from(), r.to());
|
||||
}
|
||||
return static_cast<uint32_t>(seed);
|
||||
}
|
||||
|
||||
// Returns the low 16 bits of |c|.
//
// CharacterRanges may use 0x10ffff as the end-of-range marker regardless of
// whether the regexp IsUnicode; masking translates that marker to 0xffff.
// Any value above kMaxUInt16 is expected to be exactly that marker.
constexpr base::uc32 MaskEndOfRangeMarker(base::uc32 c) {
  DCHECK_IMPLIES(c > kMaxUInt16, c == String::kMaxCodePoint);
  constexpr base::uc32 kLow16BitsMask = 0xffff;
  return c & kLow16BitsMask;
}
|
||||
|
||||
int RangeArrayLengthFor(const ZoneList<CharacterRange>* ranges) {
|
||||
const int ranges_length = ranges->length();
|
||||
return MaskEndOfRangeMarker(ranges->at(ranges_length - 1).to()) == kMaxUInt16
|
||||
? ranges_length * 2 - 1
|
||||
: ranges_length * 2;
|
||||
}
|
||||
|
||||
bool Equals(const ZoneList<CharacterRange>* lhs, const Handle<ByteArray>& rhs) {
|
||||
DCHECK_EQ(rhs->length() % kUInt16Size, 0); // uc16 elements.
|
||||
const int rhs_length = rhs->length() / kUInt16Size;
|
||||
if (rhs_length != RangeArrayLengthFor(lhs)) return false;
|
||||
for (int i = 0; i < lhs->length(); i++) {
|
||||
const CharacterRange& r = lhs->at(i);
|
||||
if (rhs->get_uint16(i * 2 + 0) != r.from()) return false;
|
||||
if (i * 2 + 1 == rhs_length) break;
|
||||
if (rhs->get_uint16(i * 2 + 1) != r.to() + 1) return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Allocates a ByteArray through the isolate's factory and fills it with the
// flattened uint16 encoding of |ranges|: for each range, `from` followed by
// the exclusive upper bound `to + 1`. If the last range ends at kMaxUInt16
// (after masking the end-of-range marker), its upper bound is omitted.
Handle<ByteArray> MakeRangeArray(Isolate* isolate,
                                 const ZoneList<CharacterRange>* ranges) {
  const int ranges_length = ranges->length();
  const int byte_array_length = RangeArrayLengthFor(ranges);
  const int size_in_bytes = byte_array_length * kUInt16Size;
  Handle<ByteArray> range_array =
      isolate->factory()->NewByteArray(size_in_bytes);

  const int last_index = ranges_length - 1;
  for (int i = 0; i < ranges_length; i++) {
    const CharacterRange& range = ranges->at(i);
    const int from_slot = i * 2 + 0;
    const int to_slot = i * 2 + 1;

    DCHECK_LE(range.from(), kMaxUInt16);
    range_array->set_uint16(from_slot, range.from());

    const base::uc32 to = MaskEndOfRangeMarker(range.to());
    const bool is_open_ended_tail = (i == last_index) && (to == kMaxUInt16);
    if (is_open_ended_tail) {
      DCHECK_EQ(byte_array_length, ranges_length * 2 - 1);
      break;  // Avoid overflow by leaving the last range open-ended.
    }

    DCHECK_LT(to, kMaxUInt16);
    range_array->set_uint16(to_slot, to + 1);  // Exclusive.
  }
  return range_array;
}
|
||||
|
||||
} // namespace
|
||||
|
||||
// Returns a range array encoding `ranges`, reusing the entry stored in
// range_array_cache_ under Hash(ranges) when that entry compares Equals to
// `ranges`. Otherwise a new array is built with MakeRangeArray and stored
// under the same hash, replacing any previous entry for that hash.
Handle<ByteArray> NativeRegExpMacroAssembler::GetOrAddRangeArray(
|
||||
const ZoneList<CharacterRange>* ranges) {
|
||||
const uint32_t hash = Hash(ranges);
|
||||
|
||||
if (range_array_cache_.count(hash) != 0) {
|
||||
Handle<ByteArray> range_array = range_array_cache_[hash];
|
||||
// The cache is keyed by hash only, so the contents are compared to rule
|
||||
// out a different range list that produced the same hash.
if (Equals(ranges, range_array)) return range_array;
|
||||
}
|
||||
|
||||
Handle<ByteArray> range_array = MakeRangeArray(isolate(), ranges);
|
||||
range_array_cache_[hash] = range_array;
|
||||
return range_array;
|
||||
}
|
||||
|
||||
// Tests whether `current_char` falls inside one of the ranges encoded in the
// ByteArray at `raw_byte_array`. Returns 1 when it does and 0 otherwise.
// The array is read as uint16 elements and must contain at least one element.
// `isolate` is not referenced in this body.
// static
|
||||
uint32_t RegExpMacroAssembler::IsCharacterInRangeArray(uint32_t current_char,
|
||||
Address raw_byte_array,
|
||||
Isolate* isolate) {
|
||||
// Use uint32_t to avoid complexity around bool return types (which may be
|
||||
// optimized to use only the least significant byte).
|
||||
static constexpr uint32_t kTrue = 1;
|
||||
static constexpr uint32_t kFalse = 0;
|
||||
|
||||
ByteArray ranges = ByteArray::cast(Object(raw_byte_array));
|
||||
|
||||
DCHECK_EQ(ranges.length() % kUInt16Size, 0); // uc16 elements.
|
||||
const int length = ranges.length() / kUInt16Size;
|
||||
DCHECK_GE(length, 1);
|
||||
|
||||
// Shortcut for fully out of range chars.
|
||||
if (current_char < ranges.get_uint16(0)) return kFalse;
|
||||
if (current_char >= ranges.get_uint16(length - 1)) {
|
||||
// The last range may be open-ended.
|
||||
// An odd element count means the final element is a `from` bound with no
|
||||
// matching upper bound, so everything at or above it is in range.
return (length % 2) == 0 ? kFalse : kTrue;
|
||||
}
|
||||
|
||||
// Binary search for the matching range. `ranges` is encoded as
|
||||
// [from0, to0, from1, to1, ..., fromN, toN], or
|
||||
// [from0, to0, from1, to1, ..., fromN] (open-ended last interval).
|
||||
|
||||
int mid, lower = 0, upper = length;
|
||||
do {
|
||||
mid = lower + (upper - lower) / 2;
|
||||
const base::uc16 elem = ranges.get_uint16(mid);
|
||||
if (current_char < elem) {
|
||||
upper = mid;
|
||||
} else if (current_char > elem) {
|
||||
lower = mid + 1;
|
||||
} else {
|
||||
DCHECK_EQ(current_char, elem);
|
||||
break;
|
||||
}
|
||||
} while (lower < upper);
|
||||
|
||||
// `mid` is the last index probed. Pick the index of the closest boundary at
|
||||
// or below current_char: `mid` itself, or its predecessor when the element
|
||||
// at `mid` is greater than current_char.
const bool current_char_ge_last_elem = current_char >= ranges.get_uint16(mid);
|
||||
const int current_range_start_index =
|
||||
current_char_ge_last_elem ? mid : mid - 1;
|
||||
|
||||
// Ranges start at even indices and end at odd indices.
|
||||
return (current_range_start_index % 2) == 0 ? kTrue : kFalse;
|
||||
}
|
||||
|
||||
void RegExpMacroAssembler::CheckNotInSurrogatePair(int cp_offset,
|
||||
Label* on_failure) {
|
||||
Label ok;
|
||||
|
@ -124,17 +253,6 @@ void RegExpMacroAssembler::LoadCurrentCharacter(int cp_offset,
|
|||
eats_at_least);
|
||||
}
|
||||
|
||||
bool RegExpMacroAssembler::CheckSpecialCharacterClass(uc16 type,
|
||||
Label* on_no_match) {
|
||||
return false;
|
||||
}
|
||||
|
||||
NativeRegExpMacroAssembler::NativeRegExpMacroAssembler(Isolate* isolate,
|
||||
Zone* zone)
|
||||
: RegExpMacroAssembler(isolate, zone) {}
|
||||
|
||||
NativeRegExpMacroAssembler::~NativeRegExpMacroAssembler() = default;
|
||||
|
||||
void NativeRegExpMacroAssembler::LoadCurrentCharacterImpl(
|
||||
int cp_offset, Label* on_end_of_input, bool check_bounds, int characters,
|
||||
int eats_at_least) {
|
||||
|
@ -153,13 +271,14 @@ void NativeRegExpMacroAssembler::LoadCurrentCharacterImpl(
|
|||
LoadCurrentCharacterUnchecked(cp_offset, characters);
|
||||
}
|
||||
|
||||
bool NativeRegExpMacroAssembler::CanReadUnaligned() {
|
||||
bool NativeRegExpMacroAssembler::CanReadUnaligned() const {
|
||||
return FLAG_enable_regexp_unaligned_accesses && !slow_safe();
|
||||
}
|
||||
|
||||
#ifndef COMPILING_IRREGEXP_FOR_EXTERNAL_EMBEDDER
|
||||
|
||||
// This method may only be called after an interrupt.
|
||||
// static
|
||||
int NativeRegExpMacroAssembler::CheckStackGuardState(
|
||||
Isolate* isolate, int start_index, RegExp::CallOrigin call_origin,
|
||||
Address* return_address, Code re_code, Address* subject,
|
||||
|
@ -287,6 +406,15 @@ int NativeRegExpMacroAssembler::Match(Handle<JSRegExp> regexp,
|
|||
offsets_vector_length, isolate, *regexp);
|
||||
}
|
||||
|
||||
// Test entry point: forwards every argument unchanged to Execute() and
// returns its result.
// static
|
||||
int NativeRegExpMacroAssembler::ExecuteForTesting(
|
||||
String input, int start_offset, const byte* input_start,
|
||||
const byte* input_end, int* output, int output_size, Isolate* isolate,
|
||||
JSRegExp regexp) {
|
||||
return Execute(input, start_offset, input_start, input_end, output,
|
||||
output_size, isolate, regexp);
|
||||
}
|
||||
|
||||
// Returns a {Result} sentinel, or the number of successful matches.
|
||||
// TODO(pthier): The JSRegExp object is passed to native irregexp code to match
|
||||
// the signature of the interpreter. We should get rid of JS objects passed to
|
||||
|
@ -295,23 +423,21 @@ int NativeRegExpMacroAssembler::Execute(
|
|||
String input, // This needs to be the unpacked (sliced, cons) string.
|
||||
int start_offset, const byte* input_start, const byte* input_end,
|
||||
int* output, int output_size, Isolate* isolate, JSRegExp regexp) {
|
||||
// Ensure that the minimum stack has been allocated.
|
||||
RegExpStackScope stack_scope(isolate);
|
||||
Address stack_base = stack_scope.stack()->stack_base();
|
||||
|
||||
bool is_one_byte = String::IsOneByteRepresentationUnderneath(input);
|
||||
Code code = Code::cast(regexp.Code(is_one_byte));
|
||||
Code code = FromCodeT(CodeT::cast(regexp.code(is_one_byte)));
|
||||
RegExp::CallOrigin call_origin = RegExp::CallOrigin::kFromRuntime;
|
||||
|
||||
using RegexpMatcherSig = int(
|
||||
Address input_string, int start_offset, const byte* input_start,
|
||||
const byte* input_end, int* output, int output_size, Address stack_base,
|
||||
int call_origin, Isolate* isolate, Address regexp);
|
||||
using RegexpMatcherSig =
|
||||
// NOLINTNEXTLINE(readability/casting)
|
||||
int(Address input_string, int start_offset, const byte* input_start,
|
||||
const byte* input_end, int* output, int output_size, int call_origin,
|
||||
Isolate* isolate, Address regexp);
|
||||
|
||||
auto fn = GeneratedCode<RegexpMatcherSig>::FromCode(code);
|
||||
int result =
|
||||
fn.Call(input.ptr(), start_offset, input_start, input_end, output,
|
||||
output_size, stack_base, call_origin, isolate, regexp.ptr());
|
||||
int result = fn.Call(input.ptr(), start_offset, input_start, input_end,
|
||||
output, output_size, call_origin, isolate, regexp.ptr());
|
||||
DCHECK_GE(result, SMALLEST_REGEXP_RESULT);
|
||||
|
||||
if (result == EXCEPTION && !isolate->has_pending_exception()) {
|
||||
|
@ -371,22 +497,24 @@ const byte NativeRegExpMacroAssembler::word_character_map[] = {
|
|||
};
|
||||
// clang-format on
|
||||
|
||||
Address NativeRegExpMacroAssembler::GrowStack(Address stack_pointer,
|
||||
Address* stack_base,
|
||||
Isolate* isolate) {
|
||||
// static
|
||||
Address NativeRegExpMacroAssembler::GrowStack(Isolate* isolate) {
|
||||
DisallowGarbageCollection no_gc;
|
||||
|
||||
RegExpStack* regexp_stack = isolate->regexp_stack();
|
||||
size_t size = regexp_stack->stack_capacity();
|
||||
Address old_stack_base = regexp_stack->stack_base();
|
||||
DCHECK(old_stack_base == *stack_base);
|
||||
DCHECK(stack_pointer <= old_stack_base);
|
||||
DCHECK(static_cast<size_t>(old_stack_base - stack_pointer) <= size);
|
||||
Address new_stack_base = regexp_stack->EnsureCapacity(size * 2);
|
||||
if (new_stack_base == kNullAddress) {
|
||||
return kNullAddress;
|
||||
}
|
||||
*stack_base = new_stack_base;
|
||||
intptr_t stack_content_size = old_stack_base - stack_pointer;
|
||||
return new_stack_base - stack_content_size;
|
||||
const size_t old_size = regexp_stack->memory_size();
|
||||
|
||||
#ifdef DEBUG
|
||||
const Address old_stack_top = regexp_stack->memory_top();
|
||||
const Address old_stack_pointer = regexp_stack->stack_pointer();
|
||||
CHECK_LE(old_stack_pointer, old_stack_top);
|
||||
CHECK_LE(static_cast<size_t>(old_stack_top - old_stack_pointer), old_size);
|
||||
#endif // DEBUG
|
||||
|
||||
Address new_stack_base = regexp_stack->EnsureCapacity(old_size * 2);
|
||||
if (new_stack_base == kNullAddress) return kNullAddress;
|
||||
|
||||
return regexp_stack->stack_pointer();
|
||||
}
|
||||
|
||||
} // namespace internal
|
||||
|
|
|
@ -12,18 +12,17 @@
|
|||
namespace v8 {
|
||||
namespace internal {
|
||||
|
||||
static const uc32 kLeadSurrogateStart = 0xd800;
|
||||
static const uc32 kLeadSurrogateEnd = 0xdbff;
|
||||
static const uc32 kTrailSurrogateStart = 0xdc00;
|
||||
static const uc32 kTrailSurrogateEnd = 0xdfff;
|
||||
static const uc32 kNonBmpStart = 0x10000;
|
||||
static const uc32 kNonBmpEnd = 0x10ffff;
|
||||
|
||||
struct DisjunctDecisionRow {
|
||||
RegExpCharacterClass cc;
|
||||
Label* on_match;
|
||||
};
|
||||
class ByteArray;
|
||||
class JSRegExp;
|
||||
class Label;
|
||||
class String;
|
||||
|
||||
static const base::uc32 kLeadSurrogateStart = 0xd800;
|
||||
static const base::uc32 kLeadSurrogateEnd = 0xdbff;
|
||||
static const base::uc32 kTrailSurrogateStart = 0xdc00;
|
||||
static const base::uc32 kTrailSurrogateEnd = 0xdfff;
|
||||
static const base::uc32 kNonBmpStart = 0x10000;
|
||||
static const base::uc32 kNonBmpEnd = 0x10ffff;
|
||||
|
||||
class RegExpMacroAssembler {
|
||||
public:
|
||||
|
@ -39,11 +38,134 @@ class RegExpMacroAssembler {
|
|||
|
||||
static constexpr int kUseCharactersValue = -1;
|
||||
|
||||
RegExpMacroAssembler(Isolate* isolate, Zone* zone);
|
||||
virtual ~RegExpMacroAssembler() = default;
|
||||
|
||||
virtual Handle<HeapObject> GetCode(Handle<String> source) = 0;
|
||||
|
||||
// This function is called when code generation is aborted, so that
|
||||
// the assembler could clean up internal data structures.
|
||||
virtual void AbortedCodeGeneration() {}
|
||||
// The maximal number of pushes between stack checks. Users must supply
|
||||
// kCheckStackLimit flag to push operations (instead of kNoStackLimitCheck)
|
||||
// at least once for every stack_limit() pushes that are executed.
|
||||
virtual int stack_limit_slack() = 0;
|
||||
virtual bool CanReadUnaligned() const = 0;
|
||||
|
||||
virtual void AdvanceCurrentPosition(int by) = 0; // Signed cp change.
|
||||
virtual void AdvanceRegister(int reg, int by) = 0; // r[reg] += by.
|
||||
// Continues execution from the position pushed on the top of the backtrack
|
||||
// stack by an earlier PushBacktrack(Label*).
|
||||
virtual void Backtrack() = 0;
|
||||
virtual void Bind(Label* label) = 0;
|
||||
// Dispatch after looking the current character up in a 2-bits-per-entry
|
||||
// map. The destinations vector has up to 4 labels.
|
||||
virtual void CheckCharacter(unsigned c, Label* on_equal) = 0;
|
||||
// Bitwise and the current character with the given constant and then
|
||||
// check for a match with c.
|
||||
virtual void CheckCharacterAfterAnd(unsigned c,
|
||||
unsigned and_with,
|
||||
Label* on_equal) = 0;
|
||||
virtual void CheckCharacterGT(base::uc16 limit, Label* on_greater) = 0;
|
||||
virtual void CheckCharacterLT(base::uc16 limit, Label* on_less) = 0;
|
||||
virtual void CheckGreedyLoop(Label* on_tos_equals_current_position) = 0;
|
||||
virtual void CheckAtStart(int cp_offset, Label* on_at_start) = 0;
|
||||
virtual void CheckNotAtStart(int cp_offset, Label* on_not_at_start) = 0;
|
||||
virtual void CheckNotBackReference(int start_reg, bool read_backward,
|
||||
Label* on_no_match) = 0;
|
||||
virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
|
||||
bool read_backward, bool unicode,
|
||||
Label* on_no_match) = 0;
|
||||
// Check the current character for a match with a literal character. If we
|
||||
// fail to match then goto the on_failure label. End of input always
|
||||
// matches. If the label is nullptr then we should pop a backtrack address
|
||||
// off the stack and go to that.
|
||||
virtual void CheckNotCharacter(unsigned c, Label* on_not_equal) = 0;
|
||||
virtual void CheckNotCharacterAfterAnd(unsigned c,
|
||||
unsigned and_with,
|
||||
Label* on_not_equal) = 0;
|
||||
// Subtract a constant from the current character, then and with the given
|
||||
// constant and then check for a match with c.
|
||||
virtual void CheckNotCharacterAfterMinusAnd(base::uc16 c, base::uc16 minus,
|
||||
base::uc16 and_with,
|
||||
Label* on_not_equal) = 0;
|
||||
virtual void CheckCharacterInRange(base::uc16 from,
|
||||
base::uc16 to, // Both inclusive.
|
||||
Label* on_in_range) = 0;
|
||||
virtual void CheckCharacterNotInRange(base::uc16 from,
|
||||
base::uc16 to, // Both inclusive.
|
||||
Label* on_not_in_range) = 0;
|
||||
// Returns true if the check was emitted, false otherwise.
|
||||
virtual bool CheckCharacterInRangeArray(
|
||||
const ZoneList<CharacterRange>* ranges, Label* on_in_range) = 0;
|
||||
virtual bool CheckCharacterNotInRangeArray(
|
||||
const ZoneList<CharacterRange>* ranges, Label* on_not_in_range) = 0;
|
||||
|
||||
// The current character (modulo kTableSize) is looked up in the byte
|
||||
// array, and if the found byte is non-zero, we jump to the on_bit_set label.
|
||||
virtual void CheckBitInTable(Handle<ByteArray> table, Label* on_bit_set) = 0;
|
||||
|
||||
// Checks whether the given offset from the current position is before
|
||||
// the end of the string. May overwrite the current character.
|
||||
virtual void CheckPosition(int cp_offset, Label* on_outside_input);
|
||||
// Check whether a standard/default character class matches the current
|
||||
// character. Returns false if the type of special character class does
|
||||
// not have custom support.
|
||||
// May clobber the current loaded character.
|
||||
virtual bool CheckSpecialCharacterClass(StandardCharacterSet type,
|
||||
Label* on_no_match) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Control-flow integrity:
|
||||
// Define a jump target and bind a label.
|
||||
virtual void BindJumpTarget(Label* label) { Bind(label); }
|
||||
|
||||
virtual void Fail() = 0;
|
||||
virtual void GoTo(Label* label) = 0;
|
||||
// Check whether a register is >= a given constant and go to a label if it
|
||||
// is. Backtracks instead if the label is nullptr.
|
||||
virtual void IfRegisterGE(int reg, int comparand, Label* if_ge) = 0;
|
||||
// Check whether a register is < a given constant and go to a label if it is.
|
||||
// Backtracks instead if the label is nullptr.
|
||||
virtual void IfRegisterLT(int reg, int comparand, Label* if_lt) = 0;
|
||||
// Check whether a register is == to the current position and go to a
|
||||
// label if it is.
|
||||
virtual void IfRegisterEqPos(int reg, Label* if_eq) = 0;
|
||||
V8_EXPORT_PRIVATE void LoadCurrentCharacter(
|
||||
int cp_offset, Label* on_end_of_input, bool check_bounds = true,
|
||||
int characters = 1, int eats_at_least = kUseCharactersValue);
|
||||
virtual void LoadCurrentCharacterImpl(int cp_offset, Label* on_end_of_input,
|
||||
bool check_bounds, int characters,
|
||||
int eats_at_least) = 0;
|
||||
virtual void PopCurrentPosition() = 0;
|
||||
virtual void PopRegister(int register_index) = 0;
|
||||
// Pushes the label on the backtrack stack, so that a following Backtrack
|
||||
// will go to this label. Always checks the backtrack stack limit.
|
||||
virtual void PushBacktrack(Label* label) = 0;
|
||||
virtual void PushCurrentPosition() = 0;
|
||||
enum StackCheckFlag { kNoStackLimitCheck = false, kCheckStackLimit = true };
|
||||
virtual void PushRegister(int register_index,
|
||||
StackCheckFlag check_stack_limit) = 0;
|
||||
virtual void ReadCurrentPositionFromRegister(int reg) = 0;
|
||||
virtual void ReadStackPointerFromRegister(int reg) = 0;
|
||||
virtual void SetCurrentPositionFromEnd(int by) = 0;
|
||||
virtual void SetRegister(int register_index, int to) = 0;
|
||||
// Return whether the matching (with a global regexp) will be restarted.
|
||||
virtual bool Succeed() = 0;
|
||||
virtual void WriteCurrentPositionToRegister(int reg, int cp_offset) = 0;
|
||||
virtual void ClearRegisters(int reg_from, int reg_to) = 0;
|
||||
virtual void WriteStackPointerToRegister(int reg) = 0;
|
||||
|
||||
// Check that we are not in the middle of a surrogate pair.
|
||||
void CheckNotInSurrogatePair(int cp_offset, Label* on_failure);
|
||||
|
||||
#define IMPLEMENTATIONS_LIST(V) \
|
||||
V(IA32) \
|
||||
V(ARM) \
|
||||
V(ARM64) \
|
||||
V(MIPS) \
|
||||
V(LOONG64) \
|
||||
V(RISCV) \
|
||||
V(S390) \
|
||||
V(PPC) \
|
||||
|
@ -65,123 +187,11 @@ class RegExpMacroAssembler {
|
|||
return kNames[impl];
|
||||
}
|
||||
#undef IMPLEMENTATIONS_LIST
|
||||
|
||||
enum StackCheckFlag {
|
||||
kNoStackLimitCheck = false,
|
||||
kCheckStackLimit = true
|
||||
};
|
||||
|
||||
RegExpMacroAssembler(Isolate* isolate, Zone* zone);
|
||||
virtual ~RegExpMacroAssembler();
|
||||
// This function is called when code generation is aborted, so that
|
||||
// the assembler could clean up internal data structures.
|
||||
virtual void AbortedCodeGeneration() {}
|
||||
// The maximal number of pushes between stack checks. Users must supply
|
||||
// kCheckStackLimit flag to push operations (instead of kNoStackLimitCheck)
|
||||
// at least once for every stack_limit() pushes that are executed.
|
||||
virtual int stack_limit_slack() = 0;
|
||||
virtual bool CanReadUnaligned() = 0;
|
||||
virtual void AdvanceCurrentPosition(int by) = 0; // Signed cp change.
|
||||
virtual void AdvanceRegister(int reg, int by) = 0; // r[reg] += by.
|
||||
// Continues execution from the position pushed on the top of the backtrack
|
||||
// stack by an earlier PushBacktrack(Label*).
|
||||
virtual void Backtrack() = 0;
|
||||
virtual void Bind(Label* label) = 0;
|
||||
// Dispatch after looking the current character up in a 2-bits-per-entry
|
||||
// map. The destinations vector has up to 4 labels.
|
||||
virtual void CheckCharacter(unsigned c, Label* on_equal) = 0;
|
||||
// Bitwise and the current character with the given constant and then
|
||||
// check for a match with c.
|
||||
virtual void CheckCharacterAfterAnd(unsigned c,
|
||||
unsigned and_with,
|
||||
Label* on_equal) = 0;
|
||||
virtual void CheckCharacterGT(uc16 limit, Label* on_greater) = 0;
|
||||
virtual void CheckCharacterLT(uc16 limit, Label* on_less) = 0;
|
||||
virtual void CheckGreedyLoop(Label* on_tos_equals_current_position) = 0;
|
||||
virtual void CheckAtStart(int cp_offset, Label* on_at_start) = 0;
|
||||
virtual void CheckNotAtStart(int cp_offset, Label* on_not_at_start) = 0;
|
||||
virtual void CheckNotBackReference(int start_reg, bool read_backward,
|
||||
Label* on_no_match) = 0;
|
||||
virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
|
||||
bool read_backward, bool unicode,
|
||||
Label* on_no_match) = 0;
|
||||
// Check the current character for a match with a literal character. If we
|
||||
// fail to match then goto the on_failure label. End of input always
|
||||
// matches. If the label is nullptr then we should pop a backtrack address
|
||||
// off the stack and go to that.
|
||||
virtual void CheckNotCharacter(unsigned c, Label* on_not_equal) = 0;
|
||||
virtual void CheckNotCharacterAfterAnd(unsigned c,
|
||||
unsigned and_with,
|
||||
Label* on_not_equal) = 0;
|
||||
// Subtract a constant from the current character, then and with the given
|
||||
// constant and then check for a match with c.
|
||||
virtual void CheckNotCharacterAfterMinusAnd(uc16 c,
|
||||
uc16 minus,
|
||||
uc16 and_with,
|
||||
Label* on_not_equal) = 0;
|
||||
virtual void CheckCharacterInRange(uc16 from,
|
||||
uc16 to, // Both inclusive.
|
||||
Label* on_in_range) = 0;
|
||||
virtual void CheckCharacterNotInRange(uc16 from,
|
||||
uc16 to, // Both inclusive.
|
||||
Label* on_not_in_range) = 0;
|
||||
|
||||
// The current character (modulus the kTableSize) is looked up in the byte
|
||||
// array, and if the found byte is non-zero, we jump to the on_bit_set label.
|
||||
virtual void CheckBitInTable(Handle<ByteArray> table, Label* on_bit_set) = 0;
|
||||
|
||||
// Checks whether the given offset from the current position is before
|
||||
// the end of the string. May overwrite the current character.
|
||||
virtual void CheckPosition(int cp_offset, Label* on_outside_input);
|
||||
// Check whether a standard/default character class matches the current
|
||||
// character. Returns false if the type of special character class does
|
||||
// not have custom support.
|
||||
// May clobber the current loaded character.
|
||||
virtual bool CheckSpecialCharacterClass(uc16 type, Label* on_no_match);
|
||||
|
||||
// Control-flow integrity:
|
||||
// Define a jump target and bind a label.
|
||||
virtual void BindJumpTarget(Label* label) { Bind(label); }
|
||||
|
||||
virtual void Fail() = 0;
|
||||
virtual Handle<HeapObject> GetCode(Handle<String> source) = 0;
|
||||
virtual void GoTo(Label* label) = 0;
|
||||
// Check whether a register is >= a given constant and go to a label if it
|
||||
// is. Backtracks instead if the label is nullptr.
|
||||
virtual void IfRegisterGE(int reg, int comparand, Label* if_ge) = 0;
|
||||
// Check whether a register is < a given constant and go to a label if it is.
|
||||
// Backtracks instead if the label is nullptr.
|
||||
virtual void IfRegisterLT(int reg, int comparand, Label* if_lt) = 0;
|
||||
// Check whether a register is == to the current position and go to a
|
||||
// label if it is.
|
||||
virtual void IfRegisterEqPos(int reg, Label* if_eq) = 0;
|
||||
virtual IrregexpImplementation Implementation() = 0;
|
||||
V8_EXPORT_PRIVATE void LoadCurrentCharacter(
|
||||
int cp_offset, Label* on_end_of_input, bool check_bounds = true,
|
||||
int characters = 1, int eats_at_least = kUseCharactersValue);
|
||||
virtual void LoadCurrentCharacterImpl(int cp_offset, Label* on_end_of_input,
|
||||
bool check_bounds, int characters,
|
||||
int eats_at_least) = 0;
|
||||
virtual void PopCurrentPosition() = 0;
|
||||
virtual void PopRegister(int register_index) = 0;
|
||||
// Pushes the label on the backtrack stack, so that a following Backtrack
|
||||
// will go to this label. Always checks the backtrack stack limit.
|
||||
virtual void PushBacktrack(Label* label) = 0;
|
||||
virtual void PushCurrentPosition() = 0;
|
||||
virtual void PushRegister(int register_index,
|
||||
StackCheckFlag check_stack_limit) = 0;
|
||||
virtual void ReadCurrentPositionFromRegister(int reg) = 0;
|
||||
virtual void ReadStackPointerFromRegister(int reg) = 0;
|
||||
virtual void SetCurrentPositionFromEnd(int by) = 0;
|
||||
virtual void SetRegister(int register_index, int to) = 0;
|
||||
// Return whether the matching (with a global regexp) will be restarted.
|
||||
virtual bool Succeed() = 0;
|
||||
virtual void WriteCurrentPositionToRegister(int reg, int cp_offset) = 0;
|
||||
virtual void ClearRegisters(int reg_from, int reg_to) = 0;
|
||||
virtual void WriteStackPointerToRegister(int reg) = 0;
|
||||
|
||||
// Compare two-byte strings case insensitively.
|
||||
// Called from generated RegExp code.
|
||||
//
|
||||
// Called from generated code.
|
||||
static int CaseInsensitiveCompareNonUnicode(Address byte_offset1,
|
||||
Address byte_offset2,
|
||||
size_t byte_length,
|
||||
|
@ -191,12 +201,23 @@ class RegExpMacroAssembler {
|
|||
size_t byte_length,
|
||||
Isolate* isolate);
|
||||
|
||||
// Check that we are not in the middle of a surrogate pair.
|
||||
void CheckNotInSurrogatePair(int cp_offset, Label* on_failure);
|
||||
// `raw_byte_array` is a ByteArray containing a set of character ranges,
|
||||
// where ranges are encoded as uint16_t elements:
|
||||
//
|
||||
// [from0, to0, from1, to1, ..., fromN, toN], or
|
||||
// [from0, to0, from1, to1, ..., fromN] (open-ended last interval).
|
||||
//
|
||||
// fromN is inclusive, toN is exclusive. Returns zero if not in a range,
|
||||
// non-zero otherwise.
|
||||
//
|
||||
// Called from generated code.
|
||||
static uint32_t IsCharacterInRangeArray(uint32_t current_char,
|
||||
Address raw_byte_array,
|
||||
Isolate* isolate);
|
||||
|
||||
// Controls the generation of large inlined constants in the code.
|
||||
void set_slow_safe(bool ssc) { slow_safe_compiler_ = ssc; }
|
||||
bool slow_safe() { return slow_safe_compiler_; }
|
||||
bool slow_safe() const { return slow_safe_compiler_; }
|
||||
|
||||
// Controls after how many backtracks irregexp should abort execution. If it
|
||||
// can fall back to the experimental engine (see `set_can_fallback`), it will
|
||||
|
@ -220,30 +241,28 @@ class RegExpMacroAssembler {
|
|||
// Set whether the regular expression has the global flag. Exiting due to
|
||||
// a failure in a global regexp may still mean success overall.
|
||||
inline void set_global_mode(GlobalMode mode) { global_mode_ = mode; }
|
||||
inline bool global() { return global_mode_ != NOT_GLOBAL; }
|
||||
inline bool global_with_zero_length_check() {
|
||||
inline bool global() const { return global_mode_ != NOT_GLOBAL; }
|
||||
inline bool global_with_zero_length_check() const {
|
||||
return global_mode_ == GLOBAL || global_mode_ == GLOBAL_UNICODE;
|
||||
}
|
||||
inline bool global_unicode() { return global_mode_ == GLOBAL_UNICODE; }
|
||||
inline bool global_unicode() const { return global_mode_ == GLOBAL_UNICODE; }
|
||||
|
||||
Isolate* isolate() const { return isolate_; }
|
||||
Zone* zone() const { return zone_; }
|
||||
|
||||
protected:
|
||||
bool has_backtrack_limit() const {
|
||||
return backtrack_limit_ != JSRegExp::kNoBacktrackLimit;
|
||||
}
|
||||
bool has_backtrack_limit() const;
|
||||
uint32_t backtrack_limit() const { return backtrack_limit_; }
|
||||
|
||||
bool can_fallback() const { return can_fallback_; }
|
||||
|
||||
private:
|
||||
bool slow_safe_compiler_;
|
||||
uint32_t backtrack_limit_ = JSRegExp::kNoBacktrackLimit;
|
||||
uint32_t backtrack_limit_;
|
||||
bool can_fallback_ = false;
|
||||
GlobalMode global_mode_;
|
||||
Isolate* isolate_;
|
||||
Zone* zone_;
|
||||
Isolate* const isolate_;
|
||||
Zone* const zone_;
|
||||
};
|
||||
|
||||
class NativeRegExpMacroAssembler: public RegExpMacroAssembler {
|
||||
|
@ -271,44 +290,24 @@ class NativeRegExpMacroAssembler: public RegExpMacroAssembler {
|
|||
SMALLEST_REGEXP_RESULT = RegExp::kInternalRegExpSmallestResult,
|
||||
};
|
||||
|
||||
NativeRegExpMacroAssembler(Isolate* isolate, Zone* zone);
|
||||
~NativeRegExpMacroAssembler() override;
|
||||
bool CanReadUnaligned() override;
|
||||
NativeRegExpMacroAssembler(Isolate* isolate, Zone* zone)
|
||||
: RegExpMacroAssembler(isolate, zone), range_array_cache_(zone) {}
|
||||
~NativeRegExpMacroAssembler() override = default;
|
||||
|
||||
// Returns a {Result} sentinel, or the number of successful matches.
|
||||
static int Match(Handle<JSRegExp> regexp, Handle<String> subject,
|
||||
int* offsets_vector, int offsets_vector_length,
|
||||
int previous_index, Isolate* isolate);
|
||||
|
||||
// Called from RegExp if the backtrack stack limit is hit.
|
||||
// Tries to expand the stack. Returns the new stack-pointer if
|
||||
// successful, and updates the stack_top address, or returns 0 if unable
|
||||
// to grow the stack.
|
||||
// This function must not trigger a garbage collection.
|
||||
static Address GrowStack(Address stack_pointer, Address* stack_top,
|
||||
Isolate* isolate);
|
||||
V8_EXPORT_PRIVATE static int ExecuteForTesting(String input, int start_offset,
|
||||
const byte* input_start,
|
||||
const byte* input_end,
|
||||
int* output, int output_size,
|
||||
Isolate* isolate,
|
||||
JSRegExp regexp);
|
||||
|
||||
static int CheckStackGuardState(Isolate* isolate, int start_index,
|
||||
RegExp::CallOrigin call_origin,
|
||||
Address* return_address, Code re_code,
|
||||
Address* subject, const byte** input_start,
|
||||
const byte** input_end);
|
||||
bool CanReadUnaligned() const override;
|
||||
|
||||
// Byte map of one byte characters with a 0xff if the character is a word
|
||||
// character (digit, letter or underscore) and 0x00 otherwise.
|
||||
// Used by generated RegExp code.
|
||||
static const byte word_character_map[256];
|
||||
|
||||
static Address word_character_map_address() {
|
||||
return reinterpret_cast<Address>(&word_character_map[0]);
|
||||
}
|
||||
|
||||
// Returns a {Result} sentinel, or the number of successful matches.
|
||||
V8_EXPORT_PRIVATE static int Execute(String input, int start_offset,
|
||||
const byte* input_start,
|
||||
const byte* input_end, int* output,
|
||||
int output_size, Isolate* isolate,
|
||||
JSRegExp regexp);
|
||||
void LoadCurrentCharacterImpl(int cp_offset, Label* on_end_of_input,
|
||||
bool check_bounds, int characters,
|
||||
int eats_at_least) override;
|
||||
|
@ -316,6 +315,41 @@ class NativeRegExpMacroAssembler: public RegExpMacroAssembler {
|
|||
// current position, into the current-character register.
|
||||
virtual void LoadCurrentCharacterUnchecked(int cp_offset,
|
||||
int character_count) = 0;
|
||||
|
||||
// Called from RegExp if the backtrack stack limit is hit. Tries to expand
|
||||
// the stack. Returns the new stack-pointer if successful, or returns 0 if
|
||||
// unable to grow the stack.
|
||||
// This function must not trigger a garbage collection.
|
||||
//
|
||||
// Called from generated code.
|
||||
static Address GrowStack(Isolate* isolate);
|
||||
|
||||
// Called from generated code.
|
||||
static int CheckStackGuardState(Isolate* isolate, int start_index,
|
||||
RegExp::CallOrigin call_origin,
|
||||
Address* return_address, Code re_code,
|
||||
Address* subject, const byte** input_start,
|
||||
const byte** input_end);
|
||||
|
||||
static Address word_character_map_address() {
|
||||
return reinterpret_cast<Address>(&word_character_map[0]);
|
||||
}
|
||||
|
||||
protected:
|
||||
// Byte map of one byte characters with a 0xff if the character is a word
|
||||
// character (digit, letter or underscore) and 0x00 otherwise.
|
||||
// Used by generated RegExp code.
|
||||
static const byte word_character_map[256];
|
||||
|
||||
Handle<ByteArray> GetOrAddRangeArray(const ZoneList<CharacterRange>* ranges);
|
||||
|
||||
private:
|
||||
// Returns a {Result} sentinel, or the number of successful matches.
|
||||
static int Execute(String input, int start_offset, const byte* input_start,
|
||||
const byte* input_end, int* output, int output_size,
|
||||
Isolate* isolate, JSRegExp regexp);
|
||||
|
||||
ZoneUnorderedMap<uint32_t, Handle<ByteArray>> range_array_cache_;
|
||||
};
|
||||
|
||||
} // namespace internal
|
||||
|
|
|
@ -13,7 +13,6 @@ namespace internal {
|
|||
class AlternativeGenerationList;
|
||||
class BoyerMooreLookahead;
|
||||
class GreedyLoopState;
|
||||
class Label;
|
||||
class NodeVisitor;
|
||||
class QuickCheckDetails;
|
||||
class RegExpCompiler;
|
||||
|
@ -204,7 +203,9 @@ class RegExpNode : public ZoneObject {
|
|||
// If we know that the input is one-byte then there are some nodes that can
|
||||
// never match. This method returns a node that can be substituted for
|
||||
// itself, or nullptr if the node can never match.
|
||||
virtual RegExpNode* FilterOneByte(int depth) { return this; }
|
||||
virtual RegExpNode* FilterOneByte(int depth, RegExpFlags flags) {
|
||||
return this;
|
||||
}
|
||||
// Helper for FilterOneByte.
|
||||
RegExpNode* replacement() {
|
||||
DCHECK(info()->replacement_calculated);
|
||||
|
@ -293,7 +294,7 @@ class SeqRegExpNode : public RegExpNode {
|
|||
: RegExpNode(on_success->zone()), on_success_(on_success) {}
|
||||
RegExpNode* on_success() { return on_success_; }
|
||||
void set_on_success(RegExpNode* node) { on_success_ = node; }
|
||||
RegExpNode* FilterOneByte(int depth) override;
|
||||
RegExpNode* FilterOneByte(int depth, RegExpFlags flags) override;
|
||||
void FillInBMInfo(Isolate* isolate, int offset, int budget,
|
||||
BoyerMooreLookahead* bm, bool not_at_start) override {
|
||||
on_success_->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
|
||||
|
@ -301,7 +302,7 @@ class SeqRegExpNode : public RegExpNode {
|
|||
}
|
||||
|
||||
protected:
|
||||
RegExpNode* FilterSuccessor(int depth);
|
||||
RegExpNode* FilterSuccessor(int depth, RegExpFlags flags);
|
||||
|
||||
private:
|
||||
RegExpNode* on_success_;
|
||||
|
@ -405,15 +406,17 @@ class TextNode : public SeqRegExpNode {
|
|||
static TextNode* CreateForCharacterRanges(Zone* zone,
|
||||
ZoneList<CharacterRange>* ranges,
|
||||
bool read_backward,
|
||||
RegExpNode* on_success,
|
||||
JSRegExp::Flags flags);
|
||||
// Create TextNode for a surrogate pair with a range given for the
|
||||
// lead and the trail surrogate each.
|
||||
static TextNode* CreateForSurrogatePair(Zone* zone, CharacterRange lead,
|
||||
RegExpNode* on_success);
|
||||
// Create TextNode for a surrogate pair (i.e. match a sequence of two uc16
|
||||
// code unit ranges).
|
||||
static TextNode* CreateForSurrogatePair(
|
||||
Zone* zone, CharacterRange lead, ZoneList<CharacterRange>* trail_ranges,
|
||||
bool read_backward, RegExpNode* on_success);
|
||||
static TextNode* CreateForSurrogatePair(Zone* zone,
|
||||
ZoneList<CharacterRange>* lead_ranges,
|
||||
CharacterRange trail,
|
||||
bool read_backward,
|
||||
RegExpNode* on_success,
|
||||
JSRegExp::Flags flags);
|
||||
RegExpNode* on_success);
|
||||
void Accept(NodeVisitor* visitor) override;
|
||||
void Emit(RegExpCompiler* compiler, Trace* trace) override;
|
||||
void GetQuickCheckDetails(QuickCheckDetails* details,
|
||||
|
@ -421,14 +424,15 @@ class TextNode : public SeqRegExpNode {
|
|||
bool not_at_start) override;
|
||||
ZoneList<TextElement>* elements() { return elms_; }
|
||||
bool read_backward() { return read_backward_; }
|
||||
void MakeCaseIndependent(Isolate* isolate, bool is_one_byte);
|
||||
void MakeCaseIndependent(Isolate* isolate, bool is_one_byte,
|
||||
RegExpFlags flags);
|
||||
int GreedyLoopTextLength() override;
|
||||
RegExpNode* GetSuccessorOfOmnivorousTextNode(
|
||||
RegExpCompiler* compiler) override;
|
||||
void FillInBMInfo(Isolate* isolate, int offset, int budget,
|
||||
BoyerMooreLookahead* bm, bool not_at_start) override;
|
||||
void CalculateOffsets();
|
||||
RegExpNode* FilterOneByte(int depth) override;
|
||||
RegExpNode* FilterOneByte(int depth, RegExpFlags flags) override;
|
||||
int Length();
|
||||
|
||||
private:
|
||||
|
@ -496,7 +500,7 @@ class AssertionNode : public SeqRegExpNode {
|
|||
|
||||
class BackReferenceNode : public SeqRegExpNode {
|
||||
public:
|
||||
BackReferenceNode(int start_reg, int end_reg, JSRegExp::Flags flags,
|
||||
BackReferenceNode(int start_reg, int end_reg, RegExpFlags flags,
|
||||
bool read_backward, RegExpNode* on_success)
|
||||
: SeqRegExpNode(on_success),
|
||||
start_reg_(start_reg),
|
||||
|
@ -519,7 +523,7 @@ class BackReferenceNode : public SeqRegExpNode {
|
|||
private:
|
||||
int start_reg_;
|
||||
int end_reg_;
|
||||
JSRegExp::Flags flags_;
|
||||
RegExpFlags flags_;
|
||||
bool read_backward_;
|
||||
};
|
||||
|
||||
|
@ -621,7 +625,7 @@ class ChoiceNode : public RegExpNode {
|
|||
virtual bool try_to_emit_quick_check_for_alternative(bool is_first) {
|
||||
return true;
|
||||
}
|
||||
RegExpNode* FilterOneByte(int depth) override;
|
||||
RegExpNode* FilterOneByte(int depth, RegExpFlags flags) override;
|
||||
virtual bool read_backward() { return false; }
|
||||
|
||||
protected:
|
||||
|
@ -693,7 +697,7 @@ class NegativeLookaroundChoiceNode : public ChoiceNode {
|
|||
return !is_first;
|
||||
}
|
||||
void Accept(NodeVisitor* visitor) override;
|
||||
RegExpNode* FilterOneByte(int depth) override;
|
||||
RegExpNode* FilterOneByte(int depth, RegExpFlags flags) override;
|
||||
};
|
||||
|
||||
class LoopChoiceNode : public ChoiceNode {
|
||||
|
@ -726,7 +730,7 @@ class LoopChoiceNode : public ChoiceNode {
|
|||
int min_loop_iterations() const { return min_loop_iterations_; }
|
||||
bool read_backward() override { return read_backward_; }
|
||||
void Accept(NodeVisitor* visitor) override;
|
||||
RegExpNode* FilterOneByte(int depth) override;
|
||||
RegExpNode* FilterOneByte(int depth, RegExpFlags flags) override;
|
||||
|
||||
private:
|
||||
// AddAlternative is made private for loop nodes because alternatives
|
||||
|
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -5,363 +5,27 @@
|
|||
#ifndef V8_REGEXP_REGEXP_PARSER_H_
|
||||
#define V8_REGEXP_REGEXP_PARSER_H_
|
||||
|
||||
#include "irregexp/imported/regexp-ast.h"
|
||||
#include "irregexp/imported/regexp-error.h"
|
||||
#include "irregexp/RegExpShim.h"
|
||||
|
||||
namespace v8 {
|
||||
namespace internal {
|
||||
|
||||
class String;
|
||||
class Zone;
|
||||
|
||||
struct RegExpCompileData;
|
||||
|
||||
// A BufferedZoneList is an automatically growing list, just like (and backed
|
||||
// by) a ZoneList, that is optimized for the case of adding and removing
|
||||
// a single element. The last element added is stored outside the backing list,
|
||||
// and if no more than one element is ever added, the ZoneList isn't even
|
||||
// allocated.
|
||||
// Elements must not be nullptr pointers.
|
||||
template <typename T, int initial_size>
|
||||
class BufferedZoneList {
|
||||
class V8_EXPORT_PRIVATE RegExpParser : public AllStatic {
|
||||
public:
|
||||
BufferedZoneList() : list_(nullptr), last_(nullptr) {}
|
||||
static bool ParseRegExpFromHeapString(Isolate* isolate, Zone* zone,
|
||||
Handle<String> input, RegExpFlags flags,
|
||||
RegExpCompileData* result);
|
||||
|
||||
// Adds element at end of list. This element is buffered and can
|
||||
// be read using last() or removed using RemoveLast until a new Add or until
|
||||
// RemoveLast or GetList has been called.
|
||||
void Add(T* value, Zone* zone) {
|
||||
if (last_ != nullptr) {
|
||||
if (list_ == nullptr) {
|
||||
list_ = zone->New<ZoneList<T*>>(initial_size, zone);
|
||||
}
|
||||
list_->Add(last_, zone);
|
||||
}
|
||||
last_ = value;
|
||||
}
|
||||
|
||||
T* last() {
|
||||
DCHECK(last_ != nullptr);
|
||||
return last_;
|
||||
}
|
||||
|
||||
T* RemoveLast() {
|
||||
DCHECK(last_ != nullptr);
|
||||
T* result = last_;
|
||||
if ((list_ != nullptr) && (list_->length() > 0))
|
||||
last_ = list_->RemoveLast();
|
||||
else
|
||||
last_ = nullptr;
|
||||
return result;
|
||||
}
|
||||
|
||||
T* Get(int i) {
|
||||
DCHECK((0 <= i) && (i < length()));
|
||||
if (list_ == nullptr) {
|
||||
DCHECK_EQ(0, i);
|
||||
return last_;
|
||||
} else {
|
||||
if (i == list_->length()) {
|
||||
DCHECK(last_ != nullptr);
|
||||
return last_;
|
||||
} else {
|
||||
return list_->at(i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void Clear() {
|
||||
list_ = nullptr;
|
||||
last_ = nullptr;
|
||||
}
|
||||
|
||||
int length() {
|
||||
int length = (list_ == nullptr) ? 0 : list_->length();
|
||||
return length + ((last_ == nullptr) ? 0 : 1);
|
||||
}
|
||||
|
||||
ZoneList<T*>* GetList(Zone* zone) {
|
||||
if (list_ == nullptr) {
|
||||
list_ = zone->New<ZoneList<T*>>(initial_size, zone);
|
||||
}
|
||||
if (last_ != nullptr) {
|
||||
list_->Add(last_, zone);
|
||||
last_ = nullptr;
|
||||
}
|
||||
return list_;
|
||||
}
|
||||
|
||||
private:
|
||||
ZoneList<T*>* list_;
|
||||
T* last_;
|
||||
};
|
||||
|
||||
|
||||
// Accumulates RegExp atoms and assertions into lists of terms and alternatives.
|
||||
class RegExpBuilder : public ZoneObject {
|
||||
public:
|
||||
RegExpBuilder(Zone* zone, JSRegExp::Flags flags);
|
||||
void AddCharacter(uc16 character);
|
||||
void AddUnicodeCharacter(uc32 character);
|
||||
void AddEscapedUnicodeCharacter(uc32 character);
|
||||
// "Adds" an empty expression. Does nothing except consume a
|
||||
// following quantifier
|
||||
void AddEmpty();
|
||||
void AddCharacterClass(RegExpCharacterClass* cc);
|
||||
void AddCharacterClassForDesugaring(uc32 c);
|
||||
void AddAtom(RegExpTree* tree);
|
||||
void AddTerm(RegExpTree* tree);
|
||||
void AddAssertion(RegExpTree* tree);
|
||||
void NewAlternative(); // '|'
|
||||
bool AddQuantifierToAtom(int min, int max,
|
||||
RegExpQuantifier::QuantifierType type);
|
||||
void FlushText();
|
||||
RegExpTree* ToRegExp();
|
||||
JSRegExp::Flags flags() const { return flags_; }
|
||||
void set_flags(JSRegExp::Flags flags) { flags_ = flags; }
|
||||
|
||||
bool ignore_case() const { return (flags_ & JSRegExp::kIgnoreCase) != 0; }
|
||||
bool multiline() const { return (flags_ & JSRegExp::kMultiline) != 0; }
|
||||
bool dotall() const { return (flags_ & JSRegExp::kDotAll) != 0; }
|
||||
|
||||
private:
|
||||
static const uc16 kNoPendingSurrogate = 0;
|
||||
void AddLeadSurrogate(uc16 lead_surrogate);
|
||||
void AddTrailSurrogate(uc16 trail_surrogate);
|
||||
void FlushPendingSurrogate();
|
||||
void FlushCharacters();
|
||||
void FlushTerms();
|
||||
bool NeedsDesugaringForUnicode(RegExpCharacterClass* cc);
|
||||
bool NeedsDesugaringForIgnoreCase(uc32 c);
|
||||
Zone* zone() const { return zone_; }
|
||||
bool unicode() const { return (flags_ & JSRegExp::kUnicode) != 0; }
|
||||
|
||||
Zone* zone_;
|
||||
bool pending_empty_;
|
||||
JSRegExp::Flags flags_;
|
||||
ZoneList<uc16>* characters_;
|
||||
uc16 pending_surrogate_;
|
||||
BufferedZoneList<RegExpTree, 2> terms_;
|
||||
BufferedZoneList<RegExpTree, 2> text_;
|
||||
BufferedZoneList<RegExpTree, 2> alternatives_;
|
||||
#ifdef DEBUG
|
||||
enum { ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM } last_added_;
|
||||
#define LAST(x) last_added_ = x;
|
||||
#else
|
||||
#define LAST(x)
|
||||
#endif
|
||||
};
|
||||
|
||||
class V8_EXPORT_PRIVATE RegExpParser {
|
||||
public:
|
||||
RegExpParser(FlatStringReader* in, JSRegExp::Flags flags, Isolate* isolate,
|
||||
Zone* zone);
|
||||
|
||||
static bool ParseRegExp(Isolate* isolate, Zone* zone, FlatStringReader* input,
|
||||
JSRegExp::Flags flags, RegExpCompileData* result);
|
||||
|
||||
// Used by the SpiderMonkey embedding of irregexp.
|
||||
static bool VerifyRegExpSyntax(Isolate* isolate, Zone* zone,
|
||||
FlatStringReader* input, JSRegExp::Flags flags,
|
||||
RegExpCompileData* result,
|
||||
const DisallowGarbageCollection& nogc);
|
||||
|
||||
private:
|
||||
bool Parse(RegExpCompileData* result, const DisallowGarbageCollection&);
|
||||
|
||||
RegExpTree* ParsePattern();
|
||||
RegExpTree* ParseDisjunction();
|
||||
RegExpTree* ParseGroup();
|
||||
|
||||
// Parses a {...,...} quantifier and stores the range in the given
|
||||
// out parameters.
|
||||
bool ParseIntervalQuantifier(int* min_out, int* max_out);
|
||||
|
||||
// Parses and returns a single escaped character. The character
|
||||
// must not be 'b' or 'B' since they are usually handled specially.
|
||||
uc32 ParseClassCharacterEscape();
|
||||
|
||||
// Checks whether the following is a length-digit hexadecimal number,
|
||||
// and sets the value if it is.
|
||||
bool ParseHexEscape(int length, uc32* value);
|
||||
bool ParseUnicodeEscape(uc32* value);
|
||||
bool ParseUnlimitedLengthHexNumber(int max_value, uc32* value);
|
||||
|
||||
bool ParsePropertyClassName(ZoneVector<char>* name_1,
|
||||
ZoneVector<char>* name_2);
|
||||
bool AddPropertyClassRange(ZoneList<CharacterRange>* add_to, bool negate,
|
||||
const ZoneVector<char>& name_1,
|
||||
const ZoneVector<char>& name_2);
|
||||
|
||||
RegExpTree* GetPropertySequence(const ZoneVector<char>& name_1);
|
||||
RegExpTree* ParseCharacterClass(const RegExpBuilder* state);
|
||||
|
||||
uc32 ParseOctalLiteral();
|
||||
|
||||
// Tries to parse the input as a back reference. If successful it
|
||||
// stores the result in the output parameter and returns true. If
|
||||
// it fails it will push back the characters read so the same characters
|
||||
// can be reparsed.
|
||||
bool ParseBackReferenceIndex(int* index_out);
|
||||
|
||||
// Parse inside a class. Either add escaped class to the range, or return
|
||||
// false and pass parsed single character through |char_out|.
|
||||
void ParseClassEscape(ZoneList<CharacterRange>* ranges, Zone* zone,
|
||||
bool add_unicode_case_equivalents, uc32* char_out,
|
||||
bool* is_class_escape);
|
||||
|
||||
char ParseClassEscape();
|
||||
|
||||
RegExpTree* ReportError(RegExpError error);
|
||||
void Advance();
|
||||
void Advance(int dist);
|
||||
void Reset(int pos);
|
||||
|
||||
// Reports whether the pattern might be used as a literal search string.
|
||||
// Only use if the result of the parse is a single atom node.
|
||||
bool simple();
|
||||
bool contains_anchor() { return contains_anchor_; }
|
||||
void set_contains_anchor() { contains_anchor_ = true; }
|
||||
int captures_started() { return captures_started_; }
|
||||
int position() { return next_pos_ - 1; }
|
||||
bool failed() { return failed_; }
|
||||
// The Unicode flag can't be changed using in-regexp syntax, so it's OK to
|
||||
// just read the initial flag value here.
|
||||
bool unicode() const { return (top_level_flags_ & JSRegExp::kUnicode) != 0; }
|
||||
|
||||
static bool IsSyntaxCharacterOrSlash(uc32 c);
|
||||
|
||||
static const uc32 kEndMarker = (1 << 21);
|
||||
|
||||
private:
|
||||
enum SubexpressionType {
|
||||
INITIAL,
|
||||
CAPTURE, // All positive values represent captures.
|
||||
POSITIVE_LOOKAROUND,
|
||||
NEGATIVE_LOOKAROUND,
|
||||
GROUPING
|
||||
};
|
||||
|
||||
class RegExpParserState : public ZoneObject {
|
||||
public:
|
||||
// Push a state on the stack.
|
||||
RegExpParserState(RegExpParserState* previous_state,
|
||||
SubexpressionType group_type,
|
||||
RegExpLookaround::Type lookaround_type,
|
||||
int disjunction_capture_index,
|
||||
const ZoneVector<uc16>* capture_name,
|
||||
JSRegExp::Flags flags, Zone* zone)
|
||||
: previous_state_(previous_state),
|
||||
builder_(zone->New<RegExpBuilder>(zone, flags)),
|
||||
group_type_(group_type),
|
||||
lookaround_type_(lookaround_type),
|
||||
disjunction_capture_index_(disjunction_capture_index),
|
||||
capture_name_(capture_name) {}
|
||||
// Parser state of containing expression, if any.
|
||||
RegExpParserState* previous_state() const { return previous_state_; }
|
||||
bool IsSubexpression() { return previous_state_ != nullptr; }
|
||||
// RegExpBuilder building this regexp's AST.
|
||||
RegExpBuilder* builder() const { return builder_; }
|
||||
// Type of regexp being parsed (parenthesized group or entire regexp).
|
||||
SubexpressionType group_type() const { return group_type_; }
|
||||
// Lookahead or Lookbehind.
|
||||
RegExpLookaround::Type lookaround_type() const { return lookaround_type_; }
|
||||
// Index in captures array of first capture in this sub-expression, if any.
|
||||
// Also the capture index of this sub-expression itself, if group_type
|
||||
// is CAPTURE.
|
||||
int capture_index() const { return disjunction_capture_index_; }
|
||||
// The name of the current sub-expression, if group_type is CAPTURE. Only
|
||||
// used for named captures.
|
||||
const ZoneVector<uc16>* capture_name() const { return capture_name_; }
|
||||
|
||||
bool IsNamedCapture() const { return capture_name_ != nullptr; }
|
||||
|
||||
// Check whether the parser is inside a capture group with the given index.
|
||||
bool IsInsideCaptureGroup(int index);
|
||||
// Check whether the parser is inside a capture group with the given name.
|
||||
bool IsInsideCaptureGroup(const ZoneVector<uc16>* name);
|
||||
|
||||
private:
|
||||
// Linked list implementation of stack of states.
|
||||
RegExpParserState* const previous_state_;
|
||||
// Builder for the stored disjunction.
|
||||
RegExpBuilder* const builder_;
|
||||
// Stored disjunction type (capture, look-ahead or grouping), if any.
|
||||
const SubexpressionType group_type_;
|
||||
// Stored read direction.
|
||||
const RegExpLookaround::Type lookaround_type_;
|
||||
// Stored disjunction's capture index (if any).
|
||||
const int disjunction_capture_index_;
|
||||
// Stored capture name (if any).
|
||||
const ZoneVector<uc16>* const capture_name_;
|
||||
};
|
||||
|
||||
// Return the 1-indexed RegExpCapture object, allocate if necessary.
|
||||
RegExpCapture* GetCapture(int index);
|
||||
|
||||
// Creates a new named capture at the specified index. Must be called exactly
|
||||
// once for each named capture. Fails if a capture with the same name is
|
||||
// encountered.
|
||||
bool CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name, int index);
|
||||
|
||||
// Parses the name of a capture group (?<name>pattern). The name must adhere
|
||||
// to IdentifierName in the ECMAScript standard.
|
||||
const ZoneVector<uc16>* ParseCaptureGroupName();
|
||||
|
||||
bool ParseNamedBackReference(RegExpBuilder* builder,
|
||||
RegExpParserState* state);
|
||||
RegExpParserState* ParseOpenParenthesis(RegExpParserState* state);
|
||||
|
||||
// After the initial parsing pass, patch corresponding RegExpCapture objects
|
||||
// into all RegExpBackReferences. This is done after initial parsing in order
|
||||
// to avoid complicating cases in which references come before the capture.
|
||||
void PatchNamedBackReferences();
|
||||
|
||||
Handle<FixedArray> CreateCaptureNameMap();
|
||||
|
||||
// Returns true iff the pattern contains named captures. May call
|
||||
// ScanForCaptures to look ahead at the remaining pattern.
|
||||
bool HasNamedCaptures();
|
||||
|
||||
Isolate* isolate() { return isolate_; }
|
||||
Zone* zone() const { return zone_; }
|
||||
|
||||
uc32 current() { return current_; }
|
||||
bool has_more() { return has_more_; }
|
||||
bool has_next() { return next_pos_ < in()->length(); }
|
||||
uc32 Next();
|
||||
template <bool update_position>
|
||||
uc32 ReadNext();
|
||||
FlatStringReader* in() { return in_; }
|
||||
void ScanForCaptures();
|
||||
|
||||
struct RegExpCaptureNameLess {
|
||||
bool operator()(const RegExpCapture* lhs, const RegExpCapture* rhs) const {
|
||||
DCHECK_NOT_NULL(lhs);
|
||||
DCHECK_NOT_NULL(rhs);
|
||||
return *lhs->name() < *rhs->name();
|
||||
}
|
||||
};
|
||||
|
||||
Isolate* isolate_;
|
||||
Zone* zone_;
|
||||
RegExpError error_ = RegExpError::kNone;
|
||||
int error_pos_ = 0;
|
||||
ZoneList<RegExpCapture*>* captures_;
|
||||
ZoneSet<RegExpCapture*, RegExpCaptureNameLess>* named_captures_;
|
||||
ZoneList<RegExpBackReference*>* named_back_references_;
|
||||
FlatStringReader* in_;
|
||||
uc32 current_;
|
||||
// These are the flags specified outside the regexp syntax ie after the
|
||||
// terminating '/' or in the second argument to the constructor. The current
|
||||
// flags are stored on the RegExpBuilder.
|
||||
JSRegExp::Flags top_level_flags_;
|
||||
int next_pos_;
|
||||
int captures_started_;
|
||||
int capture_count_; // Only valid after we have scanned for captures.
|
||||
bool has_more_;
|
||||
bool simple_;
|
||||
bool contains_anchor_;
|
||||
bool is_scanned_for_captures_;
|
||||
bool has_named_captures_; // Only valid after we have scanned for captures.
|
||||
bool failed_;
|
||||
template <class CharT>
|
||||
static bool VerifyRegExpSyntax(Zone* zone, uintptr_t stack_limit,
|
||||
const CharT* input, int input_length,
|
||||
RegExpFlags flags, RegExpCompileData* result,
|
||||
const DisallowGarbageCollection& no_gc);
|
||||
};
|
||||
|
||||
} // namespace internal
|
||||
|
|
|
@ -9,23 +9,17 @@ namespace v8 {
|
|||
namespace internal {
|
||||
|
||||
RegExpStackScope::RegExpStackScope(Isolate* isolate)
|
||||
: regexp_stack_(isolate->regexp_stack()) {
|
||||
: regexp_stack_(isolate->regexp_stack()),
|
||||
old_sp_top_delta_(regexp_stack_->sp_top_delta()) {
|
||||
DCHECK(regexp_stack_->IsValid());
|
||||
// Irregexp is not reentrant in several ways; in particular, the
|
||||
// RegExpStackScope is not reentrant since the destructor frees allocated
|
||||
// memory. Protect against reentrancy here.
|
||||
CHECK(!regexp_stack_->is_in_use());
|
||||
regexp_stack_->set_is_in_use(true);
|
||||
}
|
||||
|
||||
|
||||
RegExpStackScope::~RegExpStackScope() {
|
||||
// Reset the buffer if it has grown.
|
||||
regexp_stack_->Reset();
|
||||
DCHECK(!regexp_stack_->is_in_use());
|
||||
CHECK_EQ(old_sp_top_delta_, regexp_stack_->sp_top_delta());
|
||||
regexp_stack_->ResetIfEmpty();
|
||||
}
|
||||
|
||||
RegExpStack::RegExpStack() : thread_local_(this), isolate_(nullptr) {}
|
||||
RegExpStack::RegExpStack() : thread_local_(this) {}
|
||||
|
||||
RegExpStack::~RegExpStack() { thread_local_.FreeAndInvalidate(); }
|
||||
|
||||
|
@ -50,18 +44,16 @@ char* RegExpStack::RestoreStack(char* from) {
|
|||
return from + kThreadLocalSize;
|
||||
}
|
||||
|
||||
void RegExpStack::Reset() { thread_local_.ResetToStaticStack(this); }
|
||||
|
||||
void RegExpStack::ThreadLocal::ResetToStaticStack(RegExpStack* regexp_stack) {
|
||||
if (owns_memory_) DeleteArray(memory_);
|
||||
|
||||
memory_ = regexp_stack->static_stack_;
|
||||
memory_top_ = regexp_stack->static_stack_ + kStaticStackSize;
|
||||
memory_size_ = kStaticStackSize;
|
||||
stack_pointer_ = memory_top_;
|
||||
limit_ = reinterpret_cast<Address>(regexp_stack->static_stack_) +
|
||||
kStackLimitSlack * kSystemPointerSize;
|
||||
owns_memory_ = false;
|
||||
is_in_use_ = false;
|
||||
}
|
||||
|
||||
void RegExpStack::ThreadLocal::FreeAndInvalidate() {
|
||||
|
@ -72,6 +64,7 @@ void RegExpStack::ThreadLocal::FreeAndInvalidate() {
|
|||
memory_ = nullptr;
|
||||
memory_top_ = nullptr;
|
||||
memory_size_ = 0;
|
||||
stack_pointer_ = nullptr;
|
||||
limit_ = kMemoryTop;
|
||||
}
|
||||
|
||||
|
@ -86,9 +79,11 @@ Address RegExpStack::EnsureCapacity(size_t size) {
|
|||
thread_local_.memory_, thread_local_.memory_size_);
|
||||
if (thread_local_.owns_memory_) DeleteArray(thread_local_.memory_);
|
||||
}
|
||||
ptrdiff_t delta = sp_top_delta();
|
||||
thread_local_.memory_ = new_memory;
|
||||
thread_local_.memory_top_ = new_memory + size;
|
||||
thread_local_.memory_size_ = size;
|
||||
thread_local_.stack_pointer_ = thread_local_.memory_top_ + delta;
|
||||
thread_local_.limit_ = reinterpret_cast<Address>(new_memory) +
|
||||
kStackLimitSlack * kSystemPointerSize;
|
||||
thread_local_.owns_memory_ = true;
|
||||
|
|
|
@ -14,10 +14,7 @@ class RegExpStack;
|
|||
|
||||
// Maintains a per-v8thread stack area that can be used by irregexp
|
||||
// implementation for its backtracking stack.
|
||||
// Since there is only one stack area, the Irregexp implementation is not
|
||||
// re-entrant. I.e., no regular expressions may be executed in the same thread
|
||||
// during a preempted Irregexp execution.
|
||||
class V8_NODISCARD RegExpStackScope {
|
||||
class V8_NODISCARD RegExpStackScope final {
|
||||
public:
|
||||
// Create and delete an instance to control the life-time of a growing stack.
|
||||
|
||||
|
@ -30,46 +27,45 @@ class V8_NODISCARD RegExpStackScope {
|
|||
RegExpStack* stack() const { return regexp_stack_; }
|
||||
|
||||
private:
|
||||
RegExpStack* regexp_stack_;
|
||||
RegExpStack* const regexp_stack_;
|
||||
const ptrdiff_t old_sp_top_delta_;
|
||||
};
|
||||
|
||||
class RegExpStack {
|
||||
class RegExpStack final {
|
||||
public:
|
||||
RegExpStack();
|
||||
~RegExpStack();
|
||||
RegExpStack(const RegExpStack&) = delete;
|
||||
RegExpStack& operator=(const RegExpStack&) = delete;
|
||||
|
||||
// Number of allocated locations on the stack below the limit.
|
||||
// No sequence of pushes must be longer that this without doing a stack-limit
|
||||
// check.
|
||||
// Number of allocated locations on the stack below the limit. No sequence of
|
||||
// pushes must be longer than this without doing a stack-limit check.
|
||||
static constexpr int kStackLimitSlack = 32;
|
||||
|
||||
// Gives the top of the memory used as stack.
|
||||
Address stack_base() {
|
||||
Address memory_top() const {
|
||||
DCHECK_NE(0, thread_local_.memory_size_);
|
||||
DCHECK_EQ(thread_local_.memory_top_,
|
||||
thread_local_.memory_ + thread_local_.memory_size_);
|
||||
return reinterpret_cast<Address>(thread_local_.memory_top_);
|
||||
}
|
||||
|
||||
// The total size of the memory allocated for the stack.
|
||||
size_t stack_capacity() { return thread_local_.memory_size_; }
|
||||
Address stack_pointer() const {
|
||||
return reinterpret_cast<Address>(thread_local_.stack_pointer_);
|
||||
}
|
||||
|
||||
size_t memory_size() const { return thread_local_.memory_size_; }
|
||||
|
||||
// If the stack pointer gets below the limit, we should react and
|
||||
// either grow the stack or report an out-of-stack exception.
|
||||
// There is only a limited number of locations below the stack limit,
|
||||
// so users of the stack should check the stack limit during any
|
||||
// sequence of pushes longer than this.
|
||||
Address* limit_address_address() { return &(thread_local_.limit_); }
|
||||
Address* limit_address_address() { return &thread_local_.limit_; }
|
||||
|
||||
// Ensures that there is a memory area with at least the specified size.
|
||||
// If passing zero, the default/minimum size buffer is allocated.
|
||||
Address EnsureCapacity(size_t size);
|
||||
|
||||
bool is_in_use() const { return thread_local_.is_in_use_; }
|
||||
void set_is_in_use(bool v) { thread_local_.is_in_use_ = v; }
|
||||
|
||||
// Thread local archiving.
|
||||
static constexpr int ArchiveSpacePerThread() {
|
||||
return static_cast<int>(kThreadLocalSize);
|
||||
|
@ -99,46 +95,61 @@ class RegExpStack {
|
|||
2 * kStackLimitSlack * kSystemPointerSize;
|
||||
byte static_stack_[kStaticStackSize] = {0};
|
||||
|
||||
STATIC_ASSERT(kStaticStackSize <= kMaximumStackSize);
|
||||
static_assert(kStaticStackSize <= kMaximumStackSize);
|
||||
|
||||
// Structure holding the allocated memory, size and limit.
|
||||
// Structure holding the allocated memory, size and limit. Thread switching
|
||||
// archives and restores this struct.
|
||||
struct ThreadLocal {
|
||||
explicit ThreadLocal(RegExpStack* regexp_stack) {
|
||||
ResetToStaticStack(regexp_stack);
|
||||
}
|
||||
|
||||
// If memory_size_ > 0 then memory_ and memory_top_ must be non-nullptr
|
||||
// and memory_top_ = memory_ + memory_size_
|
||||
// If memory_size_ > 0 then
|
||||
// - memory_, memory_top_, stack_pointer_ must be non-nullptr
|
||||
// - memory_top_ = memory_ + memory_size_
|
||||
// - memory_ <= stack_pointer_ <= memory_top_
|
||||
byte* memory_ = nullptr;
|
||||
byte* memory_top_ = nullptr;
|
||||
size_t memory_size_ = 0;
|
||||
byte* stack_pointer_ = nullptr;
|
||||
Address limit_ = kNullAddress;
|
||||
bool owns_memory_ = false; // Whether memory_ is owned and must be freed.
|
||||
bool is_in_use_ = false; // To guard against reentrancy.
|
||||
|
||||
void ResetToStaticStack(RegExpStack* regexp_stack);
|
||||
void ResetToStaticStackIfEmpty(RegExpStack* regexp_stack) {
|
||||
if (stack_pointer_ == memory_top_) ResetToStaticStack(regexp_stack);
|
||||
}
|
||||
void FreeAndInvalidate();
|
||||
};
|
||||
static constexpr size_t kThreadLocalSize = sizeof(ThreadLocal);
|
||||
|
||||
// Address of top of memory used as stack.
|
||||
Address memory_top_address_address() {
|
||||
return reinterpret_cast<Address>(&thread_local_.memory_top_);
|
||||
}
|
||||
|
||||
// Resets the buffer if it has grown beyond the default/minimum size.
|
||||
// After this, the buffer is either the default size, or it is empty, so
|
||||
// you have to call EnsureCapacity before using it again.
|
||||
void Reset();
|
||||
Address stack_pointer_address() {
|
||||
return reinterpret_cast<Address>(&thread_local_.stack_pointer_);
|
||||
}
|
||||
|
||||
// A position-independent representation of the stack pointer.
|
||||
ptrdiff_t sp_top_delta() const {
|
||||
ptrdiff_t result =
|
||||
reinterpret_cast<intptr_t>(thread_local_.stack_pointer_) -
|
||||
reinterpret_cast<intptr_t>(thread_local_.memory_top_);
|
||||
DCHECK_LE(result, 0);
|
||||
return result;
|
||||
}
|
||||
|
||||
// Resets the buffer if it has grown beyond the default/minimum size and is
|
||||
// empty.
|
||||
void ResetIfEmpty() { thread_local_.ResetToStaticStackIfEmpty(this); }
|
||||
|
||||
// Whether the ThreadLocal storage has been invalidated.
|
||||
bool IsValid() const { return thread_local_.memory_ != nullptr; }
|
||||
|
||||
ThreadLocal thread_local_;
|
||||
Isolate* isolate_;
|
||||
|
||||
friend class ExternalReference;
|
||||
friend class Isolate;
|
||||
friend class RegExpStackScope;
|
||||
};
|
||||
|
||||
|
|
|
@ -11,6 +11,9 @@
|
|||
namespace v8 {
|
||||
namespace internal {
|
||||
|
||||
class JSRegExp;
|
||||
class RegExpCapture;
|
||||
class RegExpMatchInfo;
|
||||
class RegExpNode;
|
||||
class RegExpTree;
|
||||
|
||||
|
@ -37,9 +40,9 @@ struct RegExpCompileData {
|
|||
// True, iff the pattern is anchored at the start of the string with '^'.
|
||||
bool contains_anchor = false;
|
||||
|
||||
// Only use if the pattern contains named captures. If so, this contains a
|
||||
// mapping of capture names to capture indices.
|
||||
Handle<FixedArray> capture_name_map;
|
||||
// Only set if the pattern contains named captures.
|
||||
// Note: the lifetime equals that of the parse/compile zone.
|
||||
ZoneVector<RegExpCapture*>* named_captures = nullptr;
|
||||
|
||||
// The error message. Only used if an error occurred during parsing or
|
||||
// compilation.
|
||||
|
@ -62,9 +65,15 @@ struct RegExpCompileData {
|
|||
class RegExp final : public AllStatic {
|
||||
public:
|
||||
// Whether the irregexp engine generates interpreter bytecode.
|
||||
static bool CanGenerateBytecode() {
|
||||
return FLAG_regexp_interpret_all || FLAG_regexp_tier_up;
|
||||
}
|
||||
static bool CanGenerateBytecode();
|
||||
|
||||
// Verify the given pattern, i.e. check that parsing succeeds. If
|
||||
// verification fails, `regexp_error_out` is set.
|
||||
template <class CharT>
|
||||
static bool VerifySyntax(Zone* zone, uintptr_t stack_limit,
|
||||
const CharT* input, int input_length,
|
||||
RegExpFlags flags, RegExpError* regexp_error_out,
|
||||
const DisallowGarbageCollection& no_gc);
|
||||
|
||||
// Parses the RegExp pattern and prepares the JSRegExp object with
|
||||
// generic data and choice of implementation - as well as what
|
||||
|
@ -72,7 +81,7 @@ class RegExp final : public AllStatic {
|
|||
// Returns false if compilation fails.
|
||||
V8_WARN_UNUSED_RESULT static MaybeHandle<Object> Compile(
|
||||
Isolate* isolate, Handle<JSRegExp> re, Handle<String> pattern,
|
||||
JSRegExp::Flags flags, uint32_t backtrack_limit);
|
||||
RegExpFlags flags, uint32_t backtrack_limit);
|
||||
|
||||
// Ensures that a regexp is fully compiled and ready to be executed on a
|
||||
// subject string. Returns true on success. Return false on failure, and
|
||||
|
@ -131,12 +140,9 @@ class RegExp final : public AllStatic {
|
|||
Isolate* isolate, Handle<RegExpMatchInfo> last_match_info,
|
||||
Handle<String> subject, int capture_count, int32_t* match);
|
||||
|
||||
V8_EXPORT_PRIVATE static bool CompileForTesting(Isolate* isolate, Zone* zone,
|
||||
RegExpCompileData* input,
|
||||
JSRegExp::Flags flags,
|
||||
Handle<String> pattern,
|
||||
Handle<String> sample_subject,
|
||||
bool is_one_byte);
|
||||
V8_EXPORT_PRIVATE static bool CompileForTesting(
|
||||
Isolate* isolate, Zone* zone, RegExpCompileData* input, RegExpFlags flags,
|
||||
Handle<String> pattern, Handle<String> sample_subject, bool is_one_byte);
|
||||
|
||||
V8_EXPORT_PRIVATE static void DotPrintForTesting(const char* label,
|
||||
RegExpNode* node);
|
||||
|
@ -152,6 +158,9 @@ class RegExp final : public AllStatic {
|
|||
RegExpError error_text);
|
||||
|
||||
static bool IsUnmodifiedRegExp(Isolate* isolate, Handle<JSRegExp> regexp);
|
||||
|
||||
static Handle<FixedArray> CreateCaptureNameMap(
|
||||
Isolate* isolate, ZoneVector<RegExpCapture*>* named_captures);
|
||||
};
|
||||
|
||||
// Uses a special global mode of irregexp-generated code to perform a global
|
||||
|
|
Загрузка…
Ссылка в новой задаче