Bug 1779849: Re-import irregexp r=mgaudet

This patch was generated by running import-irregexp.py.

Depends on D152901

Differential Revision: https://phabricator.services.mozilla.com/D152902
This commit is contained in:
Iain Ireland 2022-07-28 18:58:30 +00:00
Родитель 9e068493ac
Коммит a1e55c9f5a
28 изменённых файлов: 2588 добавлений и 2283 удалений

Просмотреть файл

@ -1,2 +1,2 @@
Imported using import-irregexp.py from:
https://github.com/v8/v8/tree/8732b2ee52b567ad4e15ca91d141fd6e27499e99/src/regexp
https://github.com/v8/v8/tree/b8fe2724fc25af2c165180b2cd2930b2119ad831/src/regexp

Просмотреть файл

@ -12,9 +12,9 @@
namespace v8 {
namespace internal {
static const uc32 kSurrogateStart = 0xd800;
static const uc32 kSurrogateEnd = 0xdfff;
static const uc32 kNonBmpStart = 0x10000;
static const base::uc32 kSurrogateStart = 0xd800;
static const base::uc32 kSurrogateEnd = 0xdfff;
static const base::uc32 kNonBmpStart = 0x10000;
// The following code generates "src/regexp/special-case.cc".
void PrintSet(std::ofstream& out, const char* name,

Просмотреть файл

@ -42,7 +42,7 @@ const generateData = (property) => {
buffer.push(' ' + codePoints.join(', ') + ', 0,');
}
const output =
`const uc32 UnicodePropertySequences::k${ id }[] = {\n` +
`const base::uc32 UnicodePropertySequences::k${ id }[] = {\n` +
`${ buffer.join('\n') }\n 0 // null-terminating the list\n};\n`;
return output;
};
@ -60,7 +60,7 @@ for (const property of properties) {
*/
// clang-format off
const uc32 UnicodePropertySequences::kEmojiFlagSequences[] = {
const base::uc32 UnicodePropertySequences::kEmojiFlagSequences[] = {
0x01F1E6, 0x01F1E8, 0,
0x01F1FF, 0x01F1FC, 0,
0x01F1E6, 0x01F1EA, 0,
@ -322,14 +322,14 @@ const uc32 UnicodePropertySequences::kEmojiFlagSequences[] = {
0 // null-terminating the list
};
const uc32 UnicodePropertySequences::kEmojiTagSequences[] = {
const base::uc32 UnicodePropertySequences::kEmojiTagSequences[] = {
0x01F3F4, 0x0E0067, 0x0E0062, 0x0E0065, 0x0E006E, 0x0E0067, 0x0E007F, 0,
0x01F3F4, 0x0E0067, 0x0E0062, 0x0E0073, 0x0E0063, 0x0E0074, 0x0E007F, 0,
0x01F3F4, 0x0E0067, 0x0E0062, 0x0E0077, 0x0E006C, 0x0E0073, 0x0E007F, 0,
0 // null-terminating the list
};
const uc32 UnicodePropertySequences::kEmojiZWJSequences[] = {
const base::uc32 UnicodePropertySequences::kEmojiZWJSequences[] = {
0x01F468, 0x00200D, 0x002764, 0x00FE0F, 0x00200D, 0x01F468, 0,
0x01F441, 0x00FE0F, 0x00200D, 0x01F5E8, 0x00FE0F, 0,
0x01F468, 0x00200D, 0x01F466, 0,

Просмотреть файл

@ -14,9 +14,9 @@ namespace internal {
class UnicodePropertySequences : public AllStatic {
public:
static const uc32 kEmojiFlagSequences[];
static const uc32 kEmojiTagSequences[];
static const uc32 kEmojiZWJSequences[];
static const base::uc32 kEmojiFlagSequences[];
static const base::uc32 kEmojiTagSequences[];
static const base::uc32 kEmojiZWJSequences[];
};
} // namespace internal

Просмотреть файл

@ -26,14 +26,16 @@ FOR_EACH_REG_EXP_TREE_TYPE(MAKE_TYPE_CASE)
FOR_EACH_REG_EXP_TREE_TYPE(MAKE_TYPE_CASE)
#undef MAKE_TYPE_CASE
namespace {
static Interval ListCaptureRegisters(ZoneList<RegExpTree*>* children) {
Interval ListCaptureRegisters(ZoneList<RegExpTree*>* children) {
Interval result = Interval::Empty();
for (int i = 0; i < children->length(); i++)
result = result.Union(children->at(i)->CaptureRegisters());
return result;
}
} // namespace
Interval RegExpAlternative::CaptureRegisters() {
return ListCaptureRegisters(nodes());
@ -62,12 +64,12 @@ Interval RegExpQuantifier::CaptureRegisters() {
bool RegExpAssertion::IsAnchoredAtStart() {
return assertion_type() == RegExpAssertion::START_OF_INPUT;
return assertion_type() == RegExpAssertion::Type::START_OF_INPUT;
}
bool RegExpAssertion::IsAnchoredAtEnd() {
return assertion_type() == RegExpAssertion::END_OF_INPUT;
return assertion_type() == RegExpAssertion::Type::END_OF_INPUT;
}
@ -129,6 +131,7 @@ bool RegExpCapture::IsAnchoredAtStart() { return body()->IsAnchoredAtStart(); }
bool RegExpCapture::IsAnchoredAtEnd() { return body()->IsAnchoredAtEnd(); }
namespace {
// Convert regular expression trees to a simple sexp representation.
// This representation should be different from the input grammar
@ -147,6 +150,7 @@ class RegExpUnparser final : public RegExpVisitor {
Zone* zone_;
};
} // namespace
void* RegExpUnparser::VisitDisjunction(RegExpDisjunction* that, void* data) {
os_ << "(|";
@ -193,22 +197,22 @@ void* RegExpUnparser::VisitCharacterClass(RegExpCharacterClass* that,
void* RegExpUnparser::VisitAssertion(RegExpAssertion* that, void* data) {
switch (that->assertion_type()) {
case RegExpAssertion::START_OF_INPUT:
case RegExpAssertion::Type::START_OF_INPUT:
os_ << "@^i";
break;
case RegExpAssertion::END_OF_INPUT:
case RegExpAssertion::Type::END_OF_INPUT:
os_ << "@$i";
break;
case RegExpAssertion::START_OF_LINE:
case RegExpAssertion::Type::START_OF_LINE:
os_ << "@^l";
break;
case RegExpAssertion::END_OF_LINE:
case RegExpAssertion::Type::END_OF_LINE:
os_ << "@$l";
break;
case RegExpAssertion::BOUNDARY:
case RegExpAssertion::Type::BOUNDARY:
os_ << "@b";
break;
case RegExpAssertion::NON_BOUNDARY:
case RegExpAssertion::Type::NON_BOUNDARY:
os_ << "@B";
break;
}
@ -218,7 +222,7 @@ void* RegExpUnparser::VisitAssertion(RegExpAssertion* that, void* data) {
void* RegExpUnparser::VisitAtom(RegExpAtom* that, void* data) {
os_ << "'";
Vector<const uc16> chardata = that->data();
base::Vector<const base::uc16> chardata = that->data();
for (int i = 0; i < chardata.length(); i++) {
os_ << AsUC16(chardata[i]);
}
@ -311,8 +315,9 @@ RegExpDisjunction::RegExpDisjunction(ZoneList<RegExpTree*>* alternatives)
}
}
namespace {
static int IncreaseBy(int previous, int increase) {
int IncreaseBy(int previous, int increase) {
if (RegExpTree::kInfinity - previous < increase) {
return RegExpTree::kInfinity;
} else {
@ -320,6 +325,7 @@ static int IncreaseBy(int previous, int increase) {
}
}
} // namespace
RegExpAlternative::RegExpAlternative(ZoneList<RegExpTree*>* nodes)
: nodes_(nodes) {

Просмотреть файл

@ -41,29 +41,25 @@ class RegExpVisitor {
#undef MAKE_CASE
};
// A simple closed interval.
class Interval {
public:
Interval() : from_(kNone), to_(kNone - 1) {} // '- 1' for branchless size().
Interval(int from, int to) : from_(from), to_(to) {}
Interval Union(Interval that) {
if (that.from_ == kNone)
return *this;
else if (from_ == kNone)
return that;
else
return Interval(std::min(from_, that.from_), std::max(to_, that.to_));
if (that.from_ == kNone) return *this;
if (from_ == kNone) return that;
return Interval(std::min(from_, that.from_), std::max(to_, that.to_));
}
bool Contains(int value) { return (from_ <= value) && (value <= to_); }
bool is_empty() { return from_ == kNone; }
static Interval Empty() { return Interval(); }
bool Contains(int value) const { return (from_ <= value) && (value <= to_); }
bool is_empty() const { return from_ == kNone; }
int from() const { return from_; }
int to() const { return to_; }
int size() const { return to_ - from_ + 1; }
static Interval Empty() { return Interval(); }
static constexpr int kNone = -1;
private:
@ -71,32 +67,39 @@ class Interval {
int to_;
};
// Named standard character sets.
//
// The underlying type is char and every enumerator's value is a mnemonic
// character. For the whitespace/word/digit sets the value is the letter of the
// corresponding regexp class escape (e.g. 's' for /\s/); the remaining values
// ('n', '.', '*') are mnemonics for the sets described on their lines.
enum class StandardCharacterSet : char {
kWhitespace = 's', // Like /\s/.
kNotWhitespace = 'S', // Like /\S/.
kWord = 'w', // Like /\w/.
kNotWord = 'W', // Like /\W/.
kDigit = 'd', // Like /\d/.
kNotDigit = 'D', // Like /\D/.
kLineTerminator = 'n', // The inverse of /./.
kNotLineTerminator = '.', // Like /./.
kEverything = '*', // Matches every character, like /./s.
};
// Represents code points (with values up to 0x10FFFF) in the range from from_
// to to_, both ends are inclusive.
class CharacterRange {
public:
CharacterRange() : from_(0), to_(0) {}
// For compatibility with the CHECK_OK macro
CharacterRange() = default;
// For compatibility with the CHECK_OK macro.
CharacterRange(void* null) { DCHECK_NULL(null); } // NOLINT
V8_EXPORT_PRIVATE static void AddClassEscape(char type,
ZoneList<CharacterRange>* ranges,
Zone* zone);
// Add class escapes. Add case equivalent closure for \w and \W if necessary.
V8_EXPORT_PRIVATE static void AddClassEscape(
char type, ZoneList<CharacterRange>* ranges,
bool add_unicode_case_equivalents, Zone* zone);
static Vector<const int> GetWordBounds();
static inline CharacterRange Singleton(uc32 value) {
static inline CharacterRange Singleton(base::uc32 value) {
return CharacterRange(value, value);
}
static inline CharacterRange Range(uc32 from, uc32 to) {
DCHECK(0 <= from && to <= String::kMaxCodePoint);
static inline CharacterRange Range(base::uc32 from, base::uc32 to) {
DCHECK(0 <= from && to <= kMaxCodePoint);
DCHECK(static_cast<uint32_t>(from) <= static_cast<uint32_t>(to));
return CharacterRange(from, to);
}
static inline CharacterRange Everything() {
return CharacterRange(0, String::kMaxCodePoint);
return CharacterRange(0, kMaxCodePoint);
}
static inline ZoneList<CharacterRange>* List(Zone* zone,
CharacterRange range) {
ZoneList<CharacterRange>* list =
@ -104,17 +107,21 @@ class CharacterRange {
list->Add(range, zone);
return list;
}
bool Contains(uc32 i) { return from_ <= i && i <= to_; }
uc32 from() const { return from_; }
void set_from(uc32 value) { from_ = value; }
uc32 to() const { return to_; }
void set_to(uc32 value) { to_ = value; }
bool is_valid() { return from_ <= to_; }
bool IsEverything(uc32 max) { return from_ == 0 && to_ >= max; }
bool IsSingleton() { return (from_ == to_); }
// Add class escapes. Add case equivalent closure for \w and \W if necessary.
V8_EXPORT_PRIVATE static void AddClassEscape(
StandardCharacterSet standard_character_set,
ZoneList<CharacterRange>* ranges, bool add_unicode_case_equivalents,
Zone* zone);
V8_EXPORT_PRIVATE static void AddCaseEquivalents(
Isolate* isolate, Zone* zone, ZoneList<CharacterRange>* ranges,
bool is_one_byte);
bool Contains(base::uc32 i) const { return from_ <= i && i <= to_; }
base::uc32 from() const { return from_; }
base::uc32 to() const { return to_; }
bool IsEverything(base::uc32 max) const { return from_ == 0 && to_ >= max; }
bool IsSingleton() const { return from_ == to_; }
// Whether a range list is in canonical form: Ranges ordered by from value,
// and ranges non-overlapping and non-adjacent.
V8_EXPORT_PRIVATE static bool IsCanonical(ZoneList<CharacterRange>* ranges);
@ -126,35 +133,214 @@ class CharacterRange {
// Negate the contents of a character range in canonical form.
static void Negate(ZoneList<CharacterRange>* src,
ZoneList<CharacterRange>* dst, Zone* zone);
static const int kStartMarker = (1 << 24);
static const int kPayloadMask = (1 << 24) - 1;
// Remove all ranges outside the one-byte range.
static void ClampToOneByte(ZoneList<CharacterRange>* ranges);
private:
CharacterRange(uc32 from, uc32 to) : from_(from), to_(to) {}
CharacterRange(base::uc32 from, base::uc32 to) : from_(from), to_(to) {}
uc32 from_;
uc32 to_;
static constexpr int kMaxCodePoint = 0x10ffff;
base::uc32 from_ = 0;
base::uc32 to_ = 0;
};
// Declares the four overrides that a RegExpTree subclass named RegExp<Name>
// needs: Accept(), ToNode(), As<Name>() and Is<Name>().
// The expansion intentionally ends without a semicolon so that use sites are
// written as a normal declaration: `DECL_BOILERPLATE(Name);`.
#define DECL_BOILERPLATE(Name) \
void* Accept(RegExpVisitor* visitor, void* data) override; \
RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) \
override; \
RegExp##Name* As##Name() override; \
bool Is##Name() override
// Abstract base class of the regexp syntax-tree node classes (for example
// RegExpDisjunction, RegExpAlternative and RegExpAssertion). Derives from
// ZoneObject. Subclasses must implement Accept(), ToNode(), min_match() and
// max_match(); the other virtuals come with defaults.
class RegExpTree : public ZoneObject {
public:
// Largest match length a node can report (equal to kMaxInt).
// NOTE(review): presumably stands for "unbounded" — confirm against users.
static const int kInfinity = kMaxInt;
virtual ~RegExpTree() = default;
// Visitor entry point; `data` is an opaque pointer supplied by the caller.
// NOTE(review): presumably forwards to the visitor's Visit<Name>() method —
// the implementations are not part of this class body; confirm.
virtual void* Accept(RegExpVisitor* visitor, void* data) = 0;
// NOTE(review): presumably builds the RegExpNode graph for this subtree,
// continuing at `on_success` once the subtree has matched — confirm against
// the out-of-line definitions.
virtual RegExpNode* ToNode(RegExpCompiler* compiler,
RegExpNode* on_success) = 0;
// Classification predicates. The base class answers false for all of them;
// subclasses override the ones that apply.
virtual bool IsTextElement() { return false; }
virtual bool IsAnchoredAtStart() { return false; }
virtual bool IsAnchoredAtEnd() { return false; }
// Lower and upper bound on the length of a match of this subtree.
// NOTE(review): the unit appears to be code units — confirm.
virtual int min_match() = 0;
virtual int max_match() = 0;
// Returns the interval of registers used for captures within this
// expression. The default is the empty interval.
virtual Interval CaptureRegisters() { return Interval::Empty(); }
// Declared here, defined out of line.
virtual void AppendToText(RegExpText* text, Zone* zone);
V8_EXPORT_PRIVATE std::ostream& Print(std::ostream& os, Zone* zone);
// For every tree type listed by FOR_EACH_REG_EXP_TREE_TYPE, declare a virtual
// As<Name>() returning RegExp<Name>* and a virtual Is<Name>() returning bool.
// The definitions of these base versions are not in this class body.
#define MAKE_ASTYPE(Name) \
virtual RegExp##Name* As##Name(); \
virtual bool Is##Name();
FOR_EACH_REG_EXP_TREE_TYPE(MAKE_ASTYPE)
#undef MAKE_ASTYPE
};
// Tree node that holds a list of alternative subtrees (`alternatives_`).
class RegExpDisjunction final : public RegExpTree {
public:
// Defined out of line. NOTE(review): presumably computes min_match_ and
// max_match_ from the alternatives — confirm in the .cc file.
explicit RegExpDisjunction(ZoneList<RegExpTree*>* alternatives);
DECL_BOILERPLATE(Disjunction);
Interval CaptureRegisters() override;
bool IsAnchoredAtStart() override;
bool IsAnchoredAtEnd() override;
// Return the stored bounds; nothing is recomputed on access.
int min_match() override { return min_match_; }
int max_match() override { return max_match_; }
// Returns the list pointer handed to the constructor (not a copy).
ZoneList<RegExpTree*>* alternatives() const { return alternatives_; }
private:
// Helpers defined out of line; their behavior is not visible here.
bool SortConsecutiveAtoms(RegExpCompiler* compiler);
void RationalizeConsecutiveAtoms(RegExpCompiler* compiler);
void FixSingleCharacterDisjunctions(RegExpCompiler* compiler);
ZoneList<RegExpTree*>* alternatives_;
// No in-class initializers: these must be set by the constructor.
int min_match_;
int max_match_;
};
// Tree node that holds a list of child subtrees (`nodes_`).
// NOTE(review): presumably the children are matched in sequence — confirm.
class RegExpAlternative final : public RegExpTree {
public:
// Defined out of line. NOTE(review): presumably computes min_match_ and
// max_match_ from `nodes` — confirm in the .cc file.
explicit RegExpAlternative(ZoneList<RegExpTree*>* nodes);
DECL_BOILERPLATE(Alternative);
Interval CaptureRegisters() override;
bool IsAnchoredAtStart() override;
bool IsAnchoredAtEnd() override;
// Return the stored bounds; nothing is recomputed on access.
int min_match() override { return min_match_; }
int max_match() override { return max_match_; }
// Returns the stored list pointer (not a copy).
ZoneList<RegExpTree*>* nodes() const { return nodes_; }
private:
ZoneList<RegExpTree*>* nodes_;
// No in-class initializers: these must be set by the constructor.
int min_match_;
int max_match_;
};
// Tree node for an assertion. It reports a match length of exactly zero
// (min_match() and max_match() both return 0).
class RegExpAssertion final : public RegExpTree {
public:
// Kind of assertion. Being a scoped enum, values must be written as
// RegExpAssertion::Type::<NAME>.
enum class Type {
START_OF_LINE = 0,
START_OF_INPUT = 1,
END_OF_LINE = 2,
END_OF_INPUT = 3,
BOUNDARY = 4,
NON_BOUNDARY = 5,
// Alias of the highest-valued enumerator; keep in sync when adding types.
LAST_ASSERTION_TYPE = NON_BOUNDARY,
};
explicit RegExpAssertion(Type type) : assertion_type_(type) {}
DECL_BOILERPLATE(Assertion);
bool IsAnchoredAtStart() override;
bool IsAnchoredAtEnd() override;
int min_match() override { return 0; }
int max_match() override { return 0; }
Type assertion_type() const { return assertion_type_; }
private:
// Fixed at construction.
const Type assertion_type_;
};
class CharacterSet final {
public:
explicit CharacterSet(uc16 standard_set_type)
: ranges_(nullptr), standard_set_type_(standard_set_type) {}
explicit CharacterSet(ZoneList<CharacterRange>* ranges)
: ranges_(ranges), standard_set_type_(0) {}
explicit CharacterSet(StandardCharacterSet standard_set_type)
: standard_set_type_(standard_set_type) {}
explicit CharacterSet(ZoneList<CharacterRange>* ranges) : ranges_(ranges) {}
ZoneList<CharacterRange>* ranges(Zone* zone);
uc16 standard_set_type() const { return standard_set_type_; }
void set_standard_set_type(uc16 special_set_type) {
standard_set_type_ = special_set_type;
StandardCharacterSet standard_set_type() const {
return standard_set_type_.value();
}
bool is_standard() { return standard_set_type_ != 0; }
void set_standard_set_type(StandardCharacterSet standard_set_type) {
standard_set_type_ = standard_set_type;
}
bool is_standard() const { return standard_set_type_.has_value(); }
V8_EXPORT_PRIVATE void Canonicalize();
private:
ZoneList<CharacterRange>* ranges_;
// If non-zero, the value represents a standard set (e.g., all whitespace
// characters) without having to expand the ranges.
uc16 standard_set_type_;
ZoneList<CharacterRange>* ranges_ = nullptr;
base::Optional<StandardCharacterSet> standard_set_type_;
};
// Tree node for a character class. The class content is kept in a CharacterSet
// (`set_`), built either from an explicit range list or from a
// StandardCharacterSet value, together with a small set of flags.
class RegExpCharacterClass final : public RegExpTree {
public:
// NEGATED: The character class is negated and should match everything but
// the specified ranges.
// CONTAINS_SPLIT_SURROGATE: The character class contains part of a split
// surrogate and should not be unicode-desugared (crbug.com/641091).
enum Flag {
NEGATED = 1 << 0,
CONTAINS_SPLIT_SURROGATE = 1 << 1,
};
using CharacterClassFlags = base::Flags<Flag>;
// Builds a class from an explicit range list.
// Note that an empty `ranges` list is modified in place: the Everything()
// range is appended to the caller's list (allocating from `zone`) and the
// NEGATED flag is toggled, so "empty" and "not everything" share one form.
RegExpCharacterClass(
Zone* zone, ZoneList<CharacterRange>* ranges,
CharacterClassFlags character_class_flags = CharacterClassFlags())
: set_(ranges), character_class_flags_(character_class_flags) {
// Convert the empty set of ranges to the negated Everything() range.
if (ranges->is_empty()) {
ranges->Add(CharacterRange::Everything(), zone);
character_class_flags_ ^= NEGATED;
}
}
// Builds a class from a named standard set; no flags are set.
explicit RegExpCharacterClass(StandardCharacterSet standard_set_type)
: set_(standard_set_type), character_class_flags_() {}
DECL_BOILERPLATE(CharacterClass);
bool IsTextElement() override { return true; }
int min_match() override { return 1; }
// The character class may match two code units for unicode regexps.
// TODO(yangguo): we should split this class for usage in TextElement, and
// make max_match() dependent on the character class content.
int max_match() override { return 2; }
void AppendToText(RegExpText* text, Zone* zone) override;
// TODO(lrn): Remove need for complex version if is_standard that
// recognizes a mangled standard set and just do { return set_.is_special(); }
bool is_standard(Zone* zone);
// Returns a value representing the standard character set if is_standard()
// returns true.
// NOTE(review): simply forwards to CharacterSet::standard_set_type(); it
// looks like callers must check is_standard() first — confirm what that
// accessor does when no standard set is stored.
StandardCharacterSet standard_type() const {
return set_.standard_set_type();
}
// Returns the CharacterSet by value (a copy of `set_`).
CharacterSet character_set() const { return set_; }
// Forwards to CharacterSet::ranges(); non-const because that call is
// non-const.
ZoneList<CharacterRange>* ranges(Zone* zone) { return set_.ranges(zone); }
bool is_negated() const { return (character_class_flags_ & NEGATED) != 0; }
bool contains_split_surrogate() const {
return (character_class_flags_ & CONTAINS_SPLIT_SURROGATE) != 0;
}
private:
CharacterSet set_;
// Not const: the range-list constructor may toggle NEGATED.
CharacterClassFlags character_class_flags_;
};
// Tree node wrapping a vector of base::uc16 values (`data_`). It reports a
// fixed match length: min_match() and max_match() both equal data_.length().
class RegExpAtom final : public RegExpTree {
public:
// Stores `data` as passed.
// NOTE(review): base::Vector is presumably a non-owning view, in which case
// the underlying storage must outlive this node — confirm.
explicit RegExpAtom(base::Vector<const base::uc16> data) : data_(data) {}
DECL_BOILERPLATE(Atom);
bool IsTextElement() override { return true; }
int min_match() override { return data_.length(); }
int max_match() override { return data_.length(); }
void AppendToText(RegExpText* text, Zone* zone) override;
base::Vector<const base::uc16> data() const { return data_; }
// Number of elements in the stored vector.
int length() const { return data_.length(); }
private:
base::Vector<const base::uc16> data_;
};
class TextElement final {
@ -191,206 +377,12 @@ class TextElement final {
RegExpTree* tree_;
};
class RegExpTree : public ZoneObject {
public:
static const int kInfinity = kMaxInt;
virtual ~RegExpTree() = default;
virtual void* Accept(RegExpVisitor* visitor, void* data) = 0;
virtual RegExpNode* ToNode(RegExpCompiler* compiler,
RegExpNode* on_success) = 0;
virtual bool IsTextElement() { return false; }
virtual bool IsAnchoredAtStart() { return false; }
virtual bool IsAnchoredAtEnd() { return false; }
virtual int min_match() = 0;
virtual int max_match() = 0;
// Returns the interval of registers used for captures within this
// expression.
virtual Interval CaptureRegisters() { return Interval::Empty(); }
virtual void AppendToText(RegExpText* text, Zone* zone);
V8_EXPORT_PRIVATE std::ostream& Print(std::ostream& os, Zone* zone);
#define MAKE_ASTYPE(Name) \
virtual RegExp##Name* As##Name(); \
virtual bool Is##Name();
FOR_EACH_REG_EXP_TREE_TYPE(MAKE_ASTYPE)
#undef MAKE_ASTYPE
};
class RegExpDisjunction final : public RegExpTree {
public:
explicit RegExpDisjunction(ZoneList<RegExpTree*>* alternatives);
void* Accept(RegExpVisitor* visitor, void* data) override;
RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
RegExpDisjunction* AsDisjunction() override;
Interval CaptureRegisters() override;
bool IsDisjunction() override;
bool IsAnchoredAtStart() override;
bool IsAnchoredAtEnd() override;
int min_match() override { return min_match_; }
int max_match() override { return max_match_; }
ZoneList<RegExpTree*>* alternatives() { return alternatives_; }
private:
bool SortConsecutiveAtoms(RegExpCompiler* compiler);
void RationalizeConsecutiveAtoms(RegExpCompiler* compiler);
void FixSingleCharacterDisjunctions(RegExpCompiler* compiler);
ZoneList<RegExpTree*>* alternatives_;
int min_match_;
int max_match_;
};
class RegExpAlternative final : public RegExpTree {
public:
explicit RegExpAlternative(ZoneList<RegExpTree*>* nodes);
void* Accept(RegExpVisitor* visitor, void* data) override;
RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
RegExpAlternative* AsAlternative() override;
Interval CaptureRegisters() override;
bool IsAlternative() override;
bool IsAnchoredAtStart() override;
bool IsAnchoredAtEnd() override;
int min_match() override { return min_match_; }
int max_match() override { return max_match_; }
ZoneList<RegExpTree*>* nodes() { return nodes_; }
private:
ZoneList<RegExpTree*>* nodes_;
int min_match_;
int max_match_;
};
class RegExpAssertion final : public RegExpTree {
public:
enum AssertionType {
START_OF_LINE = 0,
START_OF_INPUT = 1,
END_OF_LINE = 2,
END_OF_INPUT = 3,
BOUNDARY = 4,
NON_BOUNDARY = 5,
LAST_TYPE = NON_BOUNDARY,
};
RegExpAssertion(AssertionType type, JSRegExp::Flags flags)
: assertion_type_(type), flags_(flags) {}
void* Accept(RegExpVisitor* visitor, void* data) override;
RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
RegExpAssertion* AsAssertion() override;
bool IsAssertion() override;
bool IsAnchoredAtStart() override;
bool IsAnchoredAtEnd() override;
int min_match() override { return 0; }
int max_match() override { return 0; }
AssertionType assertion_type() const { return assertion_type_; }
JSRegExp::Flags flags() const { return flags_; }
private:
const AssertionType assertion_type_;
const JSRegExp::Flags flags_;
};
class RegExpCharacterClass final : public RegExpTree {
public:
// NEGATED: The character class is negated and should match everything but
// the specified ranges.
// CONTAINS_SPLIT_SURROGATE: The character class contains part of a split
// surrogate and should not be unicode-desugared (crbug.com/641091).
enum Flag {
NEGATED = 1 << 0,
CONTAINS_SPLIT_SURROGATE = 1 << 1,
};
using CharacterClassFlags = base::Flags<Flag>;
RegExpCharacterClass(
Zone* zone, ZoneList<CharacterRange>* ranges, JSRegExp::Flags flags,
CharacterClassFlags character_class_flags = CharacterClassFlags())
: set_(ranges),
flags_(flags),
character_class_flags_(character_class_flags) {
// Convert the empty set of ranges to the negated Everything() range.
if (ranges->is_empty()) {
ranges->Add(CharacterRange::Everything(), zone);
character_class_flags_ ^= NEGATED;
}
}
RegExpCharacterClass(uc16 type, JSRegExp::Flags flags)
: set_(type),
flags_(flags),
character_class_flags_(CharacterClassFlags()) {}
void* Accept(RegExpVisitor* visitor, void* data) override;
RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
RegExpCharacterClass* AsCharacterClass() override;
bool IsCharacterClass() override;
bool IsTextElement() override { return true; }
int min_match() override { return 1; }
// The character class may match two code units for unicode regexps.
// TODO(yangguo): we should split this class for usage in TextElement, and
// make max_match() dependent on the character class content.
int max_match() override { return 2; }
void AppendToText(RegExpText* text, Zone* zone) override;
CharacterSet character_set() { return set_; }
// TODO(lrn): Remove need for complex version if is_standard that
// recognizes a mangled standard set and just do { return set_.is_special(); }
bool is_standard(Zone* zone);
// Returns a value representing the standard character set if is_standard()
// returns true.
// Currently used values are:
// s : unicode whitespace
// S : unicode non-whitespace
// w : ASCII word character (digit, letter, underscore)
// W : non-ASCII word character
// d : ASCII digit
// D : non-ASCII digit
// . : non-newline
// * : All characters, for advancing unanchored regexp
uc16 standard_type() const { return set_.standard_set_type(); }
ZoneList<CharacterRange>* ranges(Zone* zone) { return set_.ranges(zone); }
bool is_negated() const { return (character_class_flags_ & NEGATED) != 0; }
JSRegExp::Flags flags() const { return flags_; }
bool contains_split_surrogate() const {
return (character_class_flags_ & CONTAINS_SPLIT_SURROGATE) != 0;
}
private:
CharacterSet set_;
const JSRegExp::Flags flags_;
CharacterClassFlags character_class_flags_;
};
class RegExpAtom final : public RegExpTree {
public:
explicit RegExpAtom(Vector<const uc16> data, JSRegExp::Flags flags)
: data_(data), flags_(flags) {}
void* Accept(RegExpVisitor* visitor, void* data) override;
RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
RegExpAtom* AsAtom() override;
bool IsAtom() override;
bool IsTextElement() override { return true; }
int min_match() override { return data_.length(); }
int max_match() override { return data_.length(); }
void AppendToText(RegExpText* text, Zone* zone) override;
Vector<const uc16> data() { return data_; }
int length() { return data_.length(); }
JSRegExp::Flags flags() const { return flags_; }
bool ignore_case() const { return (flags_ & JSRegExp::kIgnoreCase) != 0; }
private:
Vector<const uc16> data_;
const JSRegExp::Flags flags_;
};
class RegExpText final : public RegExpTree {
public:
explicit RegExpText(Zone* zone) : elements_(2, zone), length_(0) {}
void* Accept(RegExpVisitor* visitor, void* data) override;
RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
RegExpText* AsText() override;
bool IsText() override;
explicit RegExpText(Zone* zone) : elements_(2, zone) {}
DECL_BOILERPLATE(Text);
bool IsTextElement() override { return true; }
int min_match() override { return length_; }
int max_match() override { return length_; }
@ -403,7 +395,7 @@ class RegExpText final : public RegExpTree {
private:
ZoneList<TextElement> elements_;
int length_;
int length_ = 0;
};
@ -426,23 +418,22 @@ class RegExpQuantifier final : public RegExpTree {
max_match_ = max * body->max_match();
}
}
void* Accept(RegExpVisitor* visitor, void* data) override;
RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
DECL_BOILERPLATE(Quantifier);
static RegExpNode* ToNode(int min, int max, bool is_greedy, RegExpTree* body,
RegExpCompiler* compiler, RegExpNode* on_success,
bool not_at_start = false);
RegExpQuantifier* AsQuantifier() override;
Interval CaptureRegisters() override;
bool IsQuantifier() override;
int min_match() override { return min_match_; }
int max_match() override { return max_match_; }
int min() const { return min_; }
int max() const { return max_; }
QuantifierType quantifier_type() const { return quantifier_type_; }
bool is_possessive() const { return quantifier_type_ == POSSESSIVE; }
bool is_non_greedy() { return quantifier_type_ == NON_GREEDY; }
bool is_non_greedy() const { return quantifier_type_ == NON_GREEDY; }
bool is_greedy() const { return quantifier_type_ == GREEDY; }
RegExpTree* body() { return body_; }
RegExpTree* body() const { return body_; }
private:
RegExpTree* body_;
@ -462,15 +453,14 @@ class RegExpCapture final : public RegExpTree {
min_match_(0),
max_match_(0),
name_(nullptr) {}
void* Accept(RegExpVisitor* visitor, void* data) override;
RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
DECL_BOILERPLATE(Capture);
static RegExpNode* ToNode(RegExpTree* body, int index,
RegExpCompiler* compiler, RegExpNode* on_success);
RegExpCapture* AsCapture() override;
bool IsAnchoredAtStart() override;
bool IsAnchoredAtEnd() override;
Interval CaptureRegisters() override;
bool IsCapture() override;
int min_match() override { return min_match_; }
int max_match() override { return max_match_; }
RegExpTree* body() { return body_; }
@ -480,17 +470,17 @@ class RegExpCapture final : public RegExpTree {
max_match_ = body->max_match();
}
int index() const { return index_; }
const ZoneVector<uc16>* name() const { return name_; }
void set_name(const ZoneVector<uc16>* name) { name_ = name; }
const ZoneVector<base::uc16>* name() const { return name_; }
void set_name(const ZoneVector<base::uc16>* name) { name_ = name; }
static int StartRegister(int index) { return index * 2; }
static int EndRegister(int index) { return index * 2 + 1; }
private:
RegExpTree* body_;
RegExpTree* body_ = nullptr;
int index_;
int min_match_;
int max_match_;
const ZoneVector<uc16>* name_;
int min_match_ = 0;
int max_match_ = 0;
const ZoneVector<base::uc16>* name_ = nullptr;
};
class RegExpGroup final : public RegExpTree {
@ -499,19 +489,15 @@ class RegExpGroup final : public RegExpTree {
: body_(body),
min_match_(body->min_match()),
max_match_(body->max_match()) {}
void* Accept(RegExpVisitor* visitor, void* data) override;
RegExpNode* ToNode(RegExpCompiler* compiler,
RegExpNode* on_success) override {
return body_->ToNode(compiler, on_success);
}
RegExpGroup* AsGroup() override;
DECL_BOILERPLATE(Group);
bool IsAnchoredAtStart() override { return body_->IsAnchoredAtStart(); }
bool IsAnchoredAtEnd() override { return body_->IsAnchoredAtEnd(); }
bool IsGroup() override;
int min_match() override { return min_match_; }
int max_match() override { return max_match_; }
Interval CaptureRegisters() override { return body_->CaptureRegisters(); }
RegExpTree* body() { return body_; }
RegExpTree* body() const { return body_; }
private:
RegExpTree* body_;
@ -531,26 +517,24 @@ class RegExpLookaround final : public RegExpTree {
capture_from_(capture_from),
type_(type) {}
void* Accept(RegExpVisitor* visitor, void* data) override;
RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
RegExpLookaround* AsLookaround() override;
DECL_BOILERPLATE(Lookaround);
Interval CaptureRegisters() override;
bool IsLookaround() override;
bool IsAnchoredAtStart() override;
int min_match() override { return 0; }
int max_match() override { return 0; }
RegExpTree* body() { return body_; }
bool is_positive() { return is_positive_; }
int capture_count() { return capture_count_; }
int capture_from() { return capture_from_; }
Type type() { return type_; }
RegExpTree* body() const { return body_; }
bool is_positive() const { return is_positive_; }
int capture_count() const { return capture_count_; }
int capture_from() const { return capture_from_; }
Type type() const { return type_; }
class Builder {
public:
Builder(bool is_positive, RegExpNode* on_success,
int stack_pointer_register, int position_register,
int capture_register_count = 0, int capture_register_start = 0);
RegExpNode* on_match_success() { return on_match_success_; }
RegExpNode* on_match_success() const { return on_match_success_; }
RegExpNode* ForMatch(RegExpNode* match);
private:
@ -572,38 +556,32 @@ class RegExpLookaround final : public RegExpTree {
class RegExpBackReference final : public RegExpTree {
public:
explicit RegExpBackReference(JSRegExp::Flags flags)
: capture_(nullptr), name_(nullptr), flags_(flags) {}
RegExpBackReference(RegExpCapture* capture, JSRegExp::Flags flags)
: capture_(capture), name_(nullptr), flags_(flags) {}
void* Accept(RegExpVisitor* visitor, void* data) override;
RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
RegExpBackReference* AsBackReference() override;
bool IsBackReference() override;
explicit RegExpBackReference(RegExpFlags flags) : flags_(flags) {}
RegExpBackReference(RegExpCapture* capture, RegExpFlags flags)
: capture_(capture), flags_(flags) {}
DECL_BOILERPLATE(BackReference);
int min_match() override { return 0; }
// The back reference may be recursive, e.g. /(\2)(\1)/. To avoid infinite
// recursion, we give up. Ignorance is bliss.
int max_match() override { return kInfinity; }
int index() { return capture_->index(); }
RegExpCapture* capture() { return capture_; }
int index() const { return capture_->index(); }
RegExpCapture* capture() const { return capture_; }
void set_capture(RegExpCapture* capture) { capture_ = capture; }
const ZoneVector<uc16>* name() const { return name_; }
void set_name(const ZoneVector<uc16>* name) { name_ = name; }
const ZoneVector<base::uc16>* name() const { return name_; }
void set_name(const ZoneVector<base::uc16>* name) { name_ = name; }
private:
RegExpCapture* capture_;
const ZoneVector<uc16>* name_;
const JSRegExp::Flags flags_;
RegExpCapture* capture_ = nullptr;
const ZoneVector<base::uc16>* name_ = nullptr;
const RegExpFlags flags_;
};
// AST node for the empty pattern: it carries no state and reports both its
// minimum and maximum match length as 0.
class RegExpEmpty final : public RegExpTree {
public:
RegExpEmpty() = default;
void* Accept(RegExpVisitor* visitor, void* data) override;
RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
RegExpEmpty* AsEmpty() override;
bool IsEmpty() override;
DECL_BOILERPLATE(Empty);
// An empty node never consumes input.
int min_match() override { return 0; }
int max_match() override { return 0; }
};
@ -611,4 +589,6 @@ class RegExpEmpty final : public RegExpTree {
} // namespace internal
} // namespace v8
#undef DECL_BOILERPLATE
#endif // V8_REGEXP_REGEXP_AST_H_

Просмотреть файл

@ -23,29 +23,29 @@ void RegExpBytecodeGenerator::Emit(uint32_t byte, int32_t twenty_four_bits) {
}
// Writes |word| as a 16-bit value at offset pc_ of the bytecode buffer and
// advances pc_ by two bytes, growing the buffer first when needed.
void RegExpBytecodeGenerator::Emit16(uint32_t word) {
// pc_ may equal the buffer size (buffer exactly full) but must not exceed it.
DCHECK(pc_ <= buffer_.length());
// Both bytes pc_ and pc_ + 1 have to be inside the buffer; grow otherwise.
if (pc_ + 1 >= buffer_.length()) {
Expand();
DCHECK(pc_ <= static_cast<int>(buffer_.size()));
if (pc_ + 1 >= static_cast<int>(buffer_.size())) {
ExpandBuffer();
}
// The store goes through a uint16_t pointer, so the uint32_t argument is
// narrowed to its low 16 bits.
*reinterpret_cast<uint16_t*>(buffer_.begin() + pc_) = word;
*reinterpret_cast<uint16_t*>(buffer_.data() + pc_) = word;
pc_ += 2;
}
// Writes |word| as a single byte at offset pc_ of the bytecode buffer and
// advances pc_ by one, growing the buffer first when it is exactly full.
void RegExpBytecodeGenerator::Emit8(uint32_t word) {
// pc_ may equal the buffer size (buffer exactly full) but must not exceed it.
DCHECK(pc_ <= buffer_.length());
// Only one byte is needed, so the buffer is grown only when pc_ sits at its
// end.
if (pc_ == buffer_.length()) {
Expand();
DCHECK(pc_ <= static_cast<int>(buffer_.size()));
if (pc_ == static_cast<int>(buffer_.size())) {
ExpandBuffer();
}
// The store goes through an unsigned char pointer, so the uint32_t argument
// is narrowed to its low 8 bits.
*reinterpret_cast<unsigned char*>(buffer_.begin() + pc_) = word;
*reinterpret_cast<unsigned char*>(buffer_.data() + pc_) = word;
pc_ += 1;
}
// Writes |word| as a 32-bit value at offset pc_ of the bytecode buffer and
// advances pc_ by four bytes, growing the buffer first when needed.
void RegExpBytecodeGenerator::Emit32(uint32_t word) {
// pc_ may equal the buffer size (buffer exactly full) but must not exceed it.
DCHECK(pc_ <= buffer_.length());
// All four bytes pc_ .. pc_ + 3 have to be inside the buffer; grow otherwise.
if (pc_ + 3 >= buffer_.length()) {
Expand();
DCHECK(pc_ <= static_cast<int>(buffer_.size()));
if (pc_ + 3 >= static_cast<int>(buffer_.size())) {
ExpandBuffer();
}
// Store the full 32-bit value directly at the current write offset.
*reinterpret_cast<uint32_t*>(buffer_.begin() + pc_) = word;
*reinterpret_cast<uint32_t*>(buffer_.data() + pc_) = word;
pc_ += 4;
}

Просмотреть файл

@ -14,7 +14,7 @@ namespace internal {
RegExpBytecodeGenerator::RegExpBytecodeGenerator(Isolate* isolate, Zone* zone)
: RegExpMacroAssembler(isolate, zone),
buffer_(Vector<byte>::New(1024)),
buffer_(kInitialBufferSize, zone),
pc_(0),
advance_current_end_(kInvalidPC),
jump_edges_(zone),
@ -22,7 +22,6 @@ RegExpBytecodeGenerator::RegExpBytecodeGenerator(Isolate* isolate, Zone* zone)
// Destructor: cleans up the backtrack label and the bytecode buffer.
RegExpBytecodeGenerator::~RegExpBytecodeGenerator() {
// A backtrack label that is still linked is marked unused before the
// generator goes away.
// NOTE(review): presumably this avoids a label-lifetime check in Label's
// destructor -- confirm against the Label implementation.
if (backtrack_.is_linked()) backtrack_.Unuse();
// Explicitly dispose of the bytecode buffer.
buffer_.Dispose();
}
RegExpBytecodeGenerator::IrregexpImplementation
@ -37,8 +36,8 @@ void RegExpBytecodeGenerator::Bind(Label* l) {
int pos = l->pos();
while (pos != 0) {
int fixup = pos;
pos = *reinterpret_cast<int32_t*>(buffer_.begin() + fixup);
*reinterpret_cast<uint32_t*>(buffer_.begin() + fixup) = pc_;
pos = *reinterpret_cast<int32_t*>(buffer_.data() + fixup);
*reinterpret_cast<uint32_t*>(buffer_.data() + fixup) = pc_;
jump_edges_.emplace(fixup, pc_);
}
}
@ -218,12 +217,14 @@ void RegExpBytecodeGenerator::LoadCurrentCharacterImpl(int cp_offset,
if (check_bounds) EmitOrLink(on_failure);
}
// Emits a BC_CHECK_LT instruction with |limit| as its argument, followed by
// the jump target for |on_less| (written or linked by EmitOrLink).
void RegExpBytecodeGenerator::CheckCharacterLT(uc16 limit, Label* on_less) {
void RegExpBytecodeGenerator::CheckCharacterLT(base::uc16 limit,
Label* on_less) {
Emit(BC_CHECK_LT, limit);
EmitOrLink(on_less);
}
// Emits a BC_CHECK_GT instruction with |limit| as its argument, followed by
// the jump target for |on_greater| (written or linked by EmitOrLink).
void RegExpBytecodeGenerator::CheckCharacterGT(uc16 limit, Label* on_greater) {
void RegExpBytecodeGenerator::CheckCharacterGT(base::uc16 limit,
Label* on_greater) {
Emit(BC_CHECK_GT, limit);
EmitOrLink(on_greater);
}
@ -286,14 +287,15 @@ void RegExpBytecodeGenerator::CheckNotCharacterAfterAnd(uint32_t c,
}
// Emits a BC_MINUS_AND_CHECK_NOT_CHAR instruction: |c| is the instruction
// argument, |minus| and |mask| follow as two 16-bit operands, and the jump
// target for |on_not_equal| comes last.
void RegExpBytecodeGenerator::CheckNotCharacterAfterMinusAnd(
uc16 c, uc16 minus, uc16 mask, Label* on_not_equal) {
base::uc16 c, base::uc16 minus, base::uc16 mask, Label* on_not_equal) {
Emit(BC_MINUS_AND_CHECK_NOT_CHAR, c);
// Operand order matters: first the value to subtract, then the mask.
Emit16(minus);
Emit16(mask);
EmitOrLink(on_not_equal);
}
void RegExpBytecodeGenerator::CheckCharacterInRange(uc16 from, uc16 to,
void RegExpBytecodeGenerator::CheckCharacterInRange(base::uc16 from,
base::uc16 to,
Label* on_in_range) {
Emit(BC_CHECK_CHAR_IN_RANGE, 0);
Emit16(from);
@ -301,7 +303,8 @@ void RegExpBytecodeGenerator::CheckCharacterInRange(uc16 from, uc16 to,
EmitOrLink(on_in_range);
}
void RegExpBytecodeGenerator::CheckCharacterNotInRange(uc16 from, uc16 to,
void RegExpBytecodeGenerator::CheckCharacterNotInRange(base::uc16 from,
base::uc16 to,
Label* on_not_in_range) {
Emit(BC_CHECK_CHAR_NOT_IN_RANGE, 0);
Emit16(from);
@ -377,7 +380,7 @@ Handle<HeapObject> RegExpBytecodeGenerator::GetCode(Handle<String> source) {
Handle<ByteArray> array;
if (FLAG_regexp_peephole_optimization) {
array = RegExpBytecodePeepholeOptimization::OptimizeBytecode(
isolate_, zone(), source, buffer_.begin(), length(), jump_edges_);
isolate_, zone(), source, buffer_.data(), length(), jump_edges_);
} else {
array = isolate_->factory()->NewByteArray(length());
Copy(array->GetDataStartAddress());
@ -389,14 +392,13 @@ Handle<HeapObject> RegExpBytecodeGenerator::GetCode(Handle<String> source) {
// Returns pc_, i.e. the current write offset into the bytecode buffer.
int RegExpBytecodeGenerator::length() { return pc_; }
// Copies the first length() bytes of the bytecode buffer to |a|.
// NOTE(review): |a| presumably must have room for at least length() bytes --
// confirm at the call sites.
void RegExpBytecodeGenerator::Copy(byte* a) {
MemCopy(a, buffer_.begin(), length());
MemCopy(a, buffer_.data(), length());
}
// Doubles the size of the bytecode buffer while keeping its current contents.
void RegExpBytecodeGenerator::Expand() {
// Manual growth: allocate a vector twice as long, copy the old bytes over,
// then dispose of the old storage.
Vector<byte> old_buffer = buffer_;
buffer_ = Vector<byte>::New(old_buffer.length() * 2);
MemCopy(buffer_.begin(), old_buffer.begin(), old_buffer.length());
old_buffer.Dispose();
void RegExpBytecodeGenerator::ExpandBuffer() {
// TODO(jgruber): The growth strategy could be smarter for large sizes.
// TODO(jgruber): It's not necessary to default-initialize new elements.
// Growth via the container: resize to twice the current size.
buffer_.resize(buffer_.size() * 2);
}
} // namespace internal

Просмотреть файл

@ -25,7 +25,7 @@ class V8_EXPORT_PRIVATE RegExpBytecodeGenerator : public RegExpMacroAssembler {
~RegExpBytecodeGenerator() override;
// The byte-code interpreter checks on each push anyway.
int stack_limit_slack() override { return 1; }
bool CanReadUnaligned() override { return false; }
bool CanReadUnaligned() const override { return false; }
void Bind(Label* label) override;
void AdvanceCurrentPosition(int by) override; // Signed cp change.
void PopCurrentPosition() override;
@ -52,19 +52,36 @@ class V8_EXPORT_PRIVATE RegExpBytecodeGenerator : public RegExpMacroAssembler {
void CheckCharacter(unsigned c, Label* on_equal) override;
void CheckCharacterAfterAnd(unsigned c, unsigned mask,
Label* on_equal) override;
void CheckCharacterGT(uc16 limit, Label* on_greater) override;
void CheckCharacterLT(uc16 limit, Label* on_less) override;
void CheckCharacterGT(base::uc16 limit, Label* on_greater) override;
void CheckCharacterLT(base::uc16 limit, Label* on_less) override;
void CheckGreedyLoop(Label* on_tos_equals_current_position) override;
void CheckAtStart(int cp_offset, Label* on_at_start) override;
void CheckNotAtStart(int cp_offset, Label* on_not_at_start) override;
void CheckNotCharacter(unsigned c, Label* on_not_equal) override;
void CheckNotCharacterAfterAnd(unsigned c, unsigned mask,
Label* on_not_equal) override;
void CheckNotCharacterAfterMinusAnd(uc16 c, uc16 minus, uc16 mask,
void CheckNotCharacterAfterMinusAnd(base::uc16 c, base::uc16 minus,
base::uc16 mask,
Label* on_not_equal) override;
void CheckCharacterInRange(uc16 from, uc16 to, Label* on_in_range) override;
void CheckCharacterNotInRange(uc16 from, uc16 to,
void CheckCharacterInRange(base::uc16 from, base::uc16 to,
Label* on_in_range) override;
void CheckCharacterNotInRange(base::uc16 from, base::uc16 to,
Label* on_not_in_range) override;
// Range-array check hook. The bytecode generator never emits anything here
// and always returns false; EmitCharClass treats a false result as "not
// handled" and goes on to generate the range checks itself.
bool CheckCharacterInRangeArray(const ZoneList<CharacterRange>* ranges,
Label* on_in_range) override {
// Disabled in the interpreter, because 1) there is no constant pool that
// could store the ByteArray pointer, 2) bytecode size limits are not as
// restrictive as code (e.g. branch distances on arm), 3) bytecode for
// large character classes is already quite compact.
// TODO(jgruber): Consider using BytecodeArrays (with a constant pool)
// instead of plain ByteArrays; then we could implement
// CheckCharacterInRangeArray in the interpreter.
return false;
}
// Negated counterpart of CheckCharacterInRangeArray. It likewise emits nothing
// and always returns false, so EmitCharClass falls back to generating the
// range checks itself.
bool CheckCharacterNotInRangeArray(const ZoneList<CharacterRange>* ranges,
Label* on_not_in_range) override {
return false;
}
void CheckBitInTable(Handle<ByteArray> table, Label* on_bit_set) override;
void CheckNotBackReference(int start_reg, bool read_backward,
Label* on_no_match) override;
@ -79,7 +96,8 @@ class V8_EXPORT_PRIVATE RegExpBytecodeGenerator : public RegExpMacroAssembler {
Handle<HeapObject> GetCode(Handle<String> source) override;
private:
void Expand();
void ExpandBuffer();
// Code and bitmap emission.
inline void EmitOrLink(Label* label);
inline void Emit32(uint32_t x);
@ -92,7 +110,9 @@ class V8_EXPORT_PRIVATE RegExpBytecodeGenerator : public RegExpMacroAssembler {
void Copy(byte* a);
// The buffer into which code and relocation info are generated.
Vector<byte> buffer_;
static constexpr int kInitialBufferSize = 1024;
ZoneVector<byte> buffer_;
// The program counter.
int pc_;
Label backtrack_;

Просмотреть файл

@ -258,13 +258,10 @@ int32_t GetArgumentValue(const byte* bytecode, int offset, int length) {
switch (length) {
case 1:
return GetValue<byte>(bytecode, offset);
break;
case 2:
return GetValue<int16_t>(bytecode, offset);
break;
case 4:
return GetValue<int32_t>(bytecode, offset);
break;
default:
UNREACHABLE();
}

Просмотреть файл

@ -22,8 +22,9 @@ constexpr int BYTECODE_MASK = kRegExpPaddedBytecodeCount - 1;
// positive values.
const unsigned int MAX_FIRST_ARG = 0x7fffffu;
const int BYTECODE_SHIFT = 8;
STATIC_ASSERT(1 << BYTECODE_SHIFT > BYTECODE_MASK);
static_assert(1 << BYTECODE_SHIFT > BYTECODE_MASK);
// The list of bytecodes, in format: V(Name, Code, ByteLength).
// TODO(pthier): Argument offsets of bytecodes should be easily accessible by
// name or at least by position.
// TODO(jgruber): More precise types (e.g. int32/uint32 instead of value32).
@ -85,12 +86,14 @@ STATIC_ASSERT(1 << BYTECODE_SHIFT > BYTECODE_MASK);
/* 0x10 - 0x1F: Character to match against (after mask applied) */ \
/* 0x20 - 0x3F: Bitmask bitwise and combined with current character */ \
/* 0x40 - 0x5F: Address of bytecode when matched */ \
V(AND_CHECK_CHAR, 28, 12) /* bc8 pad8 uint16 uint32 addr32 */ \
V(AND_CHECK_NOT_4_CHARS, 29, 16) /* bc8 pad24 uint32 uint32 addr32 */ \
V(AND_CHECK_NOT_CHAR, 30, 12) /* bc8 pad8 uint16 uint32 addr32 */ \
V(MINUS_AND_CHECK_NOT_CHAR, 31, 12) /* bc8 pad8 uc16 uc16 uc16 addr32 */ \
V(CHECK_CHAR_IN_RANGE, 32, 12) /* bc8 pad24 uc16 uc16 addr32 */ \
V(CHECK_CHAR_NOT_IN_RANGE, 33, 12) /* bc8 pad24 uc16 uc16 addr32 */ \
V(AND_CHECK_CHAR, 28, 12) /* bc8 pad8 uint16 uint32 addr32 */ \
V(AND_CHECK_NOT_4_CHARS, 29, 16) /* bc8 pad24 uint32 uint32 addr32 */ \
V(AND_CHECK_NOT_CHAR, 30, 12) /* bc8 pad8 uint16 uint32 addr32 */ \
V(MINUS_AND_CHECK_NOT_CHAR, 31, \
12) /* bc8 pad8 base::uc16 base::uc16 base::uc16 addr32 */ \
V(CHECK_CHAR_IN_RANGE, 32, 12) /* bc8 pad24 base::uc16 base::uc16 addr32 */ \
V(CHECK_CHAR_NOT_IN_RANGE, 33, \
12) /* bc8 pad24 base::uc16 base::uc16 addr32 */ \
/* Checks if the current character matches any of the characters encoded */ \
/* in a bit table. Similar to/inspired by Boyer-Moore string search */ \
/* Bit Layout: */ \
@ -99,8 +102,8 @@ STATIC_ASSERT(1 << BYTECODE_SHIFT > BYTECODE_MASK);
/* 0x20 - 0x3F: Address of bytecode when bit is set */ \
/* 0x40 - 0xBF: Bit table */ \
V(CHECK_BIT_IN_TABLE, 34, 24) /* bc8 pad24 addr32 bits128 */ \
V(CHECK_LT, 35, 8) /* bc8 pad8 uc16 addr32 */ \
V(CHECK_GT, 36, 8) /* bc8 pad8 uc16 addr32 */ \
V(CHECK_LT, 35, 8) /* bc8 pad8 base::uc16 addr32 */ \
V(CHECK_GT, 36, 8) /* bc8 pad8 base::uc16 addr32 */ \
V(CHECK_NOT_BACK_REF, 37, 8) /* bc8 reg_idx24 addr32 */ \
V(CHECK_NOT_BACK_REF_NO_CASE, 38, 8) /* bc8 reg_idx24 addr32 */ \
V(CHECK_NOT_BACK_REF_NO_CASE_UNICODE, 39, 8) \
@ -215,7 +218,7 @@ static constexpr int kRegExpBytecodeCount = BYTECODE_ITERATOR(COUNT);
// contiguous, strictly increasing, and start at 0.
// TODO(jgruber): Do not explicitly assign values, instead generate them
// implicitly from the list order.
STATIC_ASSERT(kRegExpBytecodeCount == 59);
static_assert(kRegExpBytecodeCount == 59);
#define DECLARE_BYTECODES(name, code, length) \
static constexpr int BC_##name = code;

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -5,11 +5,9 @@
#include "irregexp/imported/regexp-compiler.h"
#include "irregexp/imported/regexp-macro-assembler-arch.h"
#ifdef V8_INTL_SUPPORT
#include "irregexp/imported/special-case.h"
#endif // V8_INTL_SUPPORT
#ifdef V8_INTL_SUPPORT
#include "irregexp/imported/special-case.h"
#include "unicode/locid.h"
#include "unicode/uniset.h"
#include "unicode/utypes.h"
@ -171,17 +169,17 @@ using namespace regexp_compiler_constants; // NOLINT(build/namespaces)
namespace {
// Returns the largest code unit value for the given subject width:
// String::kMaxOneByteCharCodeU when |one_byte| is true, otherwise
// String::kMaxUtf16CodeUnitU. The compile-time assertions check that both
// constants fit into a uint16_t.
constexpr uc32 MaxCodeUnit(const bool one_byte) {
STATIC_ASSERT(String::kMaxOneByteCharCodeU <=
constexpr base::uc32 MaxCodeUnit(const bool one_byte) {
static_assert(String::kMaxOneByteCharCodeU <=
std::numeric_limits<uint16_t>::max());
STATIC_ASSERT(String::kMaxUtf16CodeUnitU <=
static_assert(String::kMaxUtf16CodeUnitU <=
std::numeric_limits<uint16_t>::max());
return one_byte ? String::kMaxOneByteCharCodeU : String::kMaxUtf16CodeUnitU;
}
// Returns MaxCodeUnit(one_byte) for use as a bit mask. The compile-time
// assertions check that each maximum plus one is a power of two, i.e. that the
// maximum is a contiguous run of low one-bits and therefore a valid mask.
constexpr uint32_t CharMask(const bool one_byte) {
STATIC_ASSERT(base::bits::IsPowerOfTwo(String::kMaxOneByteCharCodeU + 1));
STATIC_ASSERT(base::bits::IsPowerOfTwo(String::kMaxUtf16CodeUnitU + 1));
static_assert(base::bits::IsPowerOfTwo(String::kMaxOneByteCharCodeU + 1));
static_assert(base::bits::IsPowerOfTwo(String::kMaxUtf16CodeUnitU + 1));
return MaxCodeUnit(one_byte);
}
@ -235,12 +233,13 @@ class RecursionCheck {
// Attempts to compile the regexp using an Irregexp code generator. Returns
// a fixed array or a null handle depending on whether it succeeded.
RegExpCompiler::RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
bool one_byte)
RegExpFlags flags, bool one_byte)
: next_register_(JSRegExp::RegistersForCaptureCount(capture_count)),
unicode_lookaround_stack_register_(kNoRegister),
unicode_lookaround_position_register_(kNoRegister),
work_list_(nullptr),
recursion_depth_(0),
flags_(flags),
one_byte_(one_byte),
reg_exp_too_big_(false),
limiting_recursion_(false),
@ -274,6 +273,9 @@ RegExpCompiler::CompilationResult RegExpCompiler::Assemble(
if (!node->label()->is_bound()) node->Emit(this, &new_trace);
}
if (reg_exp_too_big_) {
if (FLAG_correctness_fuzzer_suppressions) {
FATAL("Aborting on excess zone allocation");
}
macro_assembler_->AbortedCodeGeneration();
return CompilationResult::RegExpTooBig();
}
@ -480,7 +482,6 @@ void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
}
default:
UNREACHABLE();
break;
}
}
}
@ -734,7 +735,7 @@ namespace {
#ifdef DEBUG
bool ContainsOnlyUtf16CodeUnits(unibrow::uchar* chars, int length) {
STATIC_ASSERT(sizeof(unibrow::uchar) == 4);
static_assert(sizeof(unibrow::uchar) == 4);
for (int i = 0; i < length; i++) {
if (chars[i] > String::kMaxUtf16CodeUnit) return false;
}
@ -742,14 +743,11 @@ bool ContainsOnlyUtf16CodeUnits(unibrow::uchar* chars, int length) {
}
#endif // DEBUG
} // namespace
// Returns the number of characters in the equivalence class, omitting those
// that cannot occur in the source string because it is Latin1.
static int GetCaseIndependentLetters(Isolate* isolate, uc16 character,
bool one_byte_subject,
unibrow::uchar* letters,
int letter_length) {
int GetCaseIndependentLetters(Isolate* isolate, base::uc16 character,
bool one_byte_subject, unibrow::uchar* letters,
int letter_length) {
#ifdef V8_INTL_SUPPORT
if (RegExpCaseFolding::IgnoreSet().contains(character)) {
letters[0] = character;
@ -809,10 +807,9 @@ static int GetCaseIndependentLetters(Isolate* isolate, uc16 character,
#endif // V8_INTL_SUPPORT
}
static inline bool EmitSimpleCharacter(Isolate* isolate,
RegExpCompiler* compiler, uc16 c,
Label* on_failure, int cp_offset,
bool check, bool preloaded) {
inline bool EmitSimpleCharacter(Isolate* isolate, RegExpCompiler* compiler,
base::uc16 c, Label* on_failure, int cp_offset,
bool check, bool preloaded) {
RegExpMacroAssembler* assembler = compiler->macro_assembler();
bool bound_checked = false;
if (!preloaded) {
@ -825,9 +822,9 @@ static inline bool EmitSimpleCharacter(Isolate* isolate,
// Only emits non-letters (things that don't have case). Only used for case
// independent matches.
static inline bool EmitAtomNonLetter(Isolate* isolate, RegExpCompiler* compiler,
uc16 c, Label* on_failure, int cp_offset,
bool check, bool preloaded) {
inline bool EmitAtomNonLetter(Isolate* isolate, RegExpCompiler* compiler,
base::uc16 c, Label* on_failure, int cp_offset,
bool check, bool preloaded) {
RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
bool one_byte = compiler->one_byte();
unibrow::uchar chars[4];
@ -854,28 +851,28 @@ static inline bool EmitAtomNonLetter(Isolate* isolate, RegExpCompiler* compiler,
return checked;
}
static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
bool one_byte, uc16 c1, uc16 c2,
Label* on_failure) {
bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
bool one_byte, base::uc16 c1, base::uc16 c2,
Label* on_failure) {
const uint32_t char_mask = CharMask(one_byte);
uc16 exor = c1 ^ c2;
base::uc16 exor = c1 ^ c2;
// Check whether exor has only one bit set.
if (((exor - 1) & exor) == 0) {
// If c1 and c2 differ only by one bit.
// Ecma262UnCanonicalize always gives the highest number last.
DCHECK(c2 > c1);
uc16 mask = char_mask ^ exor;
base::uc16 mask = char_mask ^ exor;
macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
return true;
}
DCHECK(c2 > c1);
uc16 diff = c2 - c1;
base::uc16 diff = c2 - c1;
if (((diff - 1) & diff) == 0 && c1 >= diff) {
// If the characters differ by 2^n but don't differ by one bit then
// subtract the difference from the found character, then do the or
// trick. We avoid the theoretical case where negative numbers are
// involved in order to simplify code generation.
uc16 mask = char_mask ^ diff;
base::uc16 mask = char_mask ^ diff;
macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff, diff, mask,
on_failure);
return true;
@ -885,9 +882,9 @@ static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
// Only emits letters (things that have case). Only used for case independent
// matches.
static inline bool EmitAtomLetter(Isolate* isolate, RegExpCompiler* compiler,
uc16 c, Label* on_failure, int cp_offset,
bool check, bool preloaded) {
inline bool EmitAtomLetter(Isolate* isolate, RegExpCompiler* compiler,
base::uc16 c, Label* on_failure, int cp_offset,
bool check, bool preloaded) {
RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
bool one_byte = compiler->one_byte();
unibrow::uchar chars[4];
@ -925,9 +922,9 @@ static inline bool EmitAtomLetter(Isolate* isolate, RegExpCompiler* compiler,
return true;
}
static void EmitBoundaryTest(RegExpMacroAssembler* masm, int border,
Label* fall_through, Label* above_or_equal,
Label* below) {
void EmitBoundaryTest(RegExpMacroAssembler* masm, int border,
Label* fall_through, Label* above_or_equal,
Label* below) {
if (below != fall_through) {
masm->CheckCharacterLT(border, below);
if (above_or_equal != fall_through) masm->GoTo(above_or_equal);
@ -936,9 +933,9 @@ static void EmitBoundaryTest(RegExpMacroAssembler* masm, int border,
}
}
static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm, int first,
int last, Label* fall_through,
Label* in_range, Label* out_of_range) {
void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm, int first, int last,
Label* fall_through, Label* in_range,
Label* out_of_range) {
if (in_range == fall_through) {
if (first == last) {
masm->CheckNotCharacter(first, out_of_range);
@ -957,15 +954,15 @@ static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm, int first,
// even_label is for ranges[i] to ranges[i + 1] where i - start_index is even.
// odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd.
static void EmitUseLookupTable(RegExpMacroAssembler* masm,
ZoneList<uc32>* ranges, uint32_t start_index,
uint32_t end_index, uc32 min_char,
Label* fall_through, Label* even_label,
Label* odd_label) {
void EmitUseLookupTable(RegExpMacroAssembler* masm,
ZoneList<base::uc32>* ranges, uint32_t start_index,
uint32_t end_index, base::uc32 min_char,
Label* fall_through, Label* even_label,
Label* odd_label) {
static const uint32_t kSize = RegExpMacroAssembler::kTableSize;
static const uint32_t kMask = RegExpMacroAssembler::kTableMask;
uc32 base = (min_char & ~kMask);
base::uc32 base = (min_char & ~kMask);
USE(base);
// Assert that everything is on one kTableSize page.
@ -1012,10 +1009,9 @@ static void EmitUseLookupTable(RegExpMacroAssembler* masm,
if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear);
}
static void CutOutRange(RegExpMacroAssembler* masm, ZoneList<uc32>* ranges,
uint32_t start_index, uint32_t end_index,
uint32_t cut_index, Label* even_label,
Label* odd_label) {
void CutOutRange(RegExpMacroAssembler* masm, ZoneList<base::uc32>* ranges,
uint32_t start_index, uint32_t end_index, uint32_t cut_index,
Label* even_label, Label* odd_label) {
bool odd = (((cut_index - start_index) & 1) == 1);
Label* in_range_label = odd ? odd_label : even_label;
Label dummy;
@ -1036,14 +1032,14 @@ static void CutOutRange(RegExpMacroAssembler* masm, ZoneList<uc32>* ranges,
// Unicode case. Split the search space into kSize spaces that are handled
// with recursion.
static void SplitSearchSpace(ZoneList<uc32>* ranges, uint32_t start_index,
uint32_t end_index, uint32_t* new_start_index,
uint32_t* new_end_index, uc32* border) {
void SplitSearchSpace(ZoneList<base::uc32>* ranges, uint32_t start_index,
uint32_t end_index, uint32_t* new_start_index,
uint32_t* new_end_index, base::uc32* border) {
static const uint32_t kSize = RegExpMacroAssembler::kTableSize;
static const uint32_t kMask = RegExpMacroAssembler::kTableMask;
uc32 first = ranges->at(start_index);
uc32 last = ranges->at(end_index) - 1;
base::uc32 first = ranges->at(start_index);
base::uc32 last = ranges->at(end_index) - 1;
*new_start_index = start_index;
*border = (ranges->at(start_index) & ~kMask) + kSize;
@ -1102,15 +1098,16 @@ static void SplitSearchSpace(ZoneList<uc32>* ranges, uint32_t start_index,
// know that the character is in the range of min_char to max_char inclusive.
// Either label can be nullptr indicating backtracking. Either label can also
// be equal to the fall_through label.
static void GenerateBranches(RegExpMacroAssembler* masm, ZoneList<uc32>* ranges,
uint32_t start_index, uint32_t end_index,
uc32 min_char, uc32 max_char, Label* fall_through,
Label* even_label, Label* odd_label) {
void GenerateBranches(RegExpMacroAssembler* masm, ZoneList<base::uc32>* ranges,
uint32_t start_index, uint32_t end_index,
base::uc32 min_char, base::uc32 max_char,
Label* fall_through, Label* even_label,
Label* odd_label) {
DCHECK_LE(min_char, String::kMaxUtf16CodeUnit);
DCHECK_LE(max_char, String::kMaxUtf16CodeUnit);
uc32 first = ranges->at(start_index);
uc32 last = ranges->at(end_index) - 1;
base::uc32 first = ranges->at(start_index);
base::uc32 last = ranges->at(end_index) - 1;
DCHECK_LT(min_char, first);
@ -1170,7 +1167,7 @@ static void GenerateBranches(RegExpMacroAssembler* masm, ZoneList<uc32>* ranges,
uint32_t new_start_index = 0;
uint32_t new_end_index = 0;
uc32 border = 0;
base::uc32 border = 0;
SplitSearchSpace(ranges, start_index, end_index, &new_start_index,
&new_end_index, &border);
@ -1213,24 +1210,19 @@ static void GenerateBranches(RegExpMacroAssembler* masm, ZoneList<uc32>* ranges,
}
}
static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
RegExpCharacterClass* cc, bool one_byte,
Label* on_failure, int cp_offset, bool check_offset,
bool preloaded, Zone* zone) {
void EmitCharClass(RegExpMacroAssembler* macro_assembler,
RegExpCharacterClass* cc, bool one_byte, Label* on_failure,
int cp_offset, bool check_offset, bool preloaded,
Zone* zone) {
ZoneList<CharacterRange>* ranges = cc->ranges(zone);
CharacterRange::Canonicalize(ranges);
const uc32 max_char = MaxCodeUnit(one_byte);
int range_count = ranges->length();
// Now that all processing (like case-insensitivity) is done, clamp the
// ranges to the set of ranges that may actually occur in the subject string.
if (one_byte) CharacterRange::ClampToOneByte(ranges);
int last_valid_range = range_count - 1;
while (last_valid_range >= 0) {
CharacterRange& range = ranges->at(last_valid_range);
if (range.from() <= max_char) break;
last_valid_range--;
}
if (last_valid_range < 0) {
const int ranges_length = ranges->length();
if (ranges_length == 0) {
if (!cc->is_negated()) {
macro_assembler->GoTo(on_failure);
}
@ -1240,7 +1232,8 @@ static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
return;
}
if (last_valid_range == 0 && ranges->at(0).IsEverything(max_char)) {
const base::uc32 max_char = MaxCodeUnit(one_byte);
if (ranges_length == 1 && ranges->at(0).IsEverything(max_char)) {
if (cc->is_negated()) {
macro_assembler->GoTo(on_failure);
} else {
@ -1261,18 +1254,33 @@ static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
return;
}
// A new list with ascending entries. Each entry is a code unit
// where there is a boundary between code units that are part of
// the class and code units that are not. Normally we insert an
// entry at zero which goes to the failure label, but if there
// was already one there we fall through for success on that entry.
// Subsequent entries have alternating meaning (success/failure).
ZoneList<uc32>* range_boundaries =
zone->New<ZoneList<uc32>>(last_valid_range, zone);
static constexpr int kMaxRangesForInlineBranchGeneration = 16;
if (ranges_length > kMaxRangesForInlineBranchGeneration) {
// For large range sets, emit a more compact instruction sequence to avoid
// a potentially problematic increase in code size.
// Note the flipped logic below (we check InRange if negated, NotInRange if
// not negated); this is necessary since the method falls through on
// failure whereas we want to fall through on success.
if (cc->is_negated()) {
if (macro_assembler->CheckCharacterInRangeArray(ranges, on_failure)) {
return;
}
} else {
if (macro_assembler->CheckCharacterNotInRangeArray(ranges, on_failure)) {
return;
}
}
}
// Generate a flat list of range boundaries for consumption by
// GenerateBranches. See the comment on that function for how the list should
// be structured.
ZoneList<base::uc32>* range_boundaries =
zone->New<ZoneList<base::uc32>>(ranges_length * 2, zone);
bool zeroth_entry_is_failure = !cc->is_negated();
for (int i = 0; i <= last_valid_range; i++) {
for (int i = 0; i < ranges_length; i++) {
CharacterRange& range = ranges->at(i);
if (range.from() == 0) {
DCHECK_EQ(i, 0);
@ -1280,6 +1288,8 @@ static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
} else {
range_boundaries->Add(range.from(), zone);
}
// `+ 1` to convert from inclusive to exclusive `to`.
// [from, to] == [from, to+1[.
range_boundaries->Add(range.to() + 1, zone);
}
int end_index = range_boundaries->length() - 1;
@ -1298,6 +1308,8 @@ static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
macro_assembler->Bind(&fall_through);
}
} // namespace
RegExpNode::~RegExpNode() = default;
RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
@ -1385,8 +1397,10 @@ void NegativeLookaroundChoiceNode::GetQuickCheckDetails(
return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
}
namespace {
// Takes the left-most 1-bit and smears it out, setting all bits to its right.
static inline uint32_t SmearBitsRight(uint32_t v) {
inline uint32_t SmearBitsRight(uint32_t v) {
v |= v >> 1;
v |= v >> 2;
v |= v >> 4;
@ -1395,6 +1409,8 @@ static inline uint32_t SmearBitsRight(uint32_t v) {
return v;
}
} // namespace
bool QuickCheckDetails::Rationalize(bool asc) {
bool found_useful_op = false;
const uint32_t char_mask = CharMask(asc);
@ -1574,12 +1590,12 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
for (int k = 0; k < elements()->length(); k++) {
TextElement elm = elements()->at(k);
if (elm.text_type() == TextElement::ATOM) {
Vector<const uc16> quarks = elm.atom()->data();
base::Vector<const base::uc16> quarks = elm.atom()->data();
for (int i = 0; i < characters && i < quarks.length(); i++) {
QuickCheckDetails::Position* pos =
details->positions(characters_filled_in);
uc16 c = quarks[i];
if (elm.atom()->ignore_case()) {
base::uc16 c = quarks[i];
if (IsIgnoreCase(compiler->flags())) {
unibrow::uchar chars[4];
int length = GetCaseIndependentLetters(
isolate, c, compiler->one_byte(), chars, 4);
@ -1640,12 +1656,14 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
details->positions(characters_filled_in);
RegExpCharacterClass* tree = elm.char_class();
ZoneList<CharacterRange>* ranges = tree->ranges(zone());
DCHECK(!ranges->is_empty());
if (tree->is_negated()) {
if (tree->is_negated() || ranges->is_empty()) {
// A quick check uses multi-character mask and compare. There is no
// useful way to incorporate a negative char class into this scheme
// so we just conservatively create a mask and value that will always
// succeed.
// Likewise for empty ranges (empty ranges can occur e.g. when
// compiling for one-byte subjects and impossible (non-one-byte) ranges
// have been removed).
pos->mask = 0;
pos->value = 0;
} else {
@ -1659,8 +1677,9 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
}
}
CharacterRange range = ranges->at(first_range);
const uc32 first_from = range.from();
const uc32 first_to = (range.to() > char_mask) ? char_mask : range.to();
const base::uc32 first_from = range.from();
const base::uc32 first_to =
(range.to() > char_mask) ? char_mask : range.to();
const uint32_t differing_bits = (first_from ^ first_to);
// A mask and compare is only perfect if the differing bits form a
// number like 00011111 with one single block of trailing 1s.
@ -1671,10 +1690,11 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
uint32_t common_bits = ~SmearBitsRight(differing_bits);
uint32_t bits = (first_from & common_bits);
for (int i = first_range + 1; i < ranges->length(); i++) {
CharacterRange range = ranges->at(i);
const uc32 from = range.from();
range = ranges->at(i);
const base::uc32 from = range.from();
if (from > char_mask) continue;
const uc32 to = (range.to() > char_mask) ? char_mask : range.to();
const base::uc32 to =
(range.to() > char_mask) ? char_mask : range.to();
// Here we are combining more ranges into the mask and compare
// value. With each new range the mask becomes more sparse and
// so the chances of a false positive rise. A character class
@ -1685,8 +1705,8 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
new_common_bits = ~SmearBitsRight(new_common_bits);
common_bits &= new_common_bits;
bits &= new_common_bits;
uint32_t differing_bits = (from & common_bits) ^ bits;
common_bits ^= differing_bits;
uint32_t new_differing_bits = (from & common_bits) ^ bits;
common_bits ^= new_differing_bits;
bits &= common_bits;
}
pos->mask = common_bits;
@ -1807,16 +1827,16 @@ class IterationDecrementer {
LoopChoiceNode* node_;
};
// One-byte filtering for a sequential node: the node itself is never rejected
// here, the decision is delegated to its successor via FilterSuccessor().
// |depth| is a recursion budget that is decremented on the way down.
RegExpNode* SeqRegExpNode::FilterOneByte(int depth) {
RegExpNode* SeqRegExpNode::FilterOneByte(int depth, RegExpFlags flags) {
// Reuse a previously computed replacement instead of filtering again.
if (info()->replacement_calculated) return replacement();
// Budget exhausted: stop recursing and keep this node unchanged.
if (depth < 0) return this;
// This node must not already be on the current visit path.
DCHECK(!info()->visited);
// NOTE(review): VisitMarker presumably sets info()->visited for the
// duration of this scope -- confirm against its definition.
VisitMarker marker(info());
return FilterSuccessor(depth - 1);
return FilterSuccessor(depth - 1, flags);
}
RegExpNode* SeqRegExpNode::FilterSuccessor(int depth) {
RegExpNode* next = on_success_->FilterOneByte(depth - 1);
RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, RegExpFlags flags) {
RegExpNode* next = on_success_->FilterOneByte(depth - 1, flags);
if (next == nullptr) return set_replacement(nullptr);
on_success_ = next;
return set_replacement(this);
@ -1829,7 +1849,9 @@ bool RangeContainsLatin1Equivalents(CharacterRange range) {
range.Contains(0x0178);
}
static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {
namespace {
bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {
for (int i = 0; i < ranges->length(); i++) {
// TODO(dcarney): this could be a lot more efficient.
if (RangeContainsLatin1Equivalents(ranges->at(i))) return true;
@ -1837,7 +1859,9 @@ static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {
return false;
}
RegExpNode* TextNode::FilterOneByte(int depth) {
} // namespace
RegExpNode* TextNode::FilterOneByte(int depth, RegExpFlags flags) {
if (info()->replacement_calculated) return replacement();
if (depth < 0) return this;
DCHECK(!info()->visited);
@ -1846,15 +1870,15 @@ RegExpNode* TextNode::FilterOneByte(int depth) {
for (int i = 0; i < element_count; i++) {
TextElement elm = elements()->at(i);
if (elm.text_type() == TextElement::ATOM) {
Vector<const uc16> quarks = elm.atom()->data();
base::Vector<const base::uc16> quarks = elm.atom()->data();
for (int j = 0; j < quarks.length(); j++) {
uc16 c = quarks[j];
if (elm.atom()->ignore_case()) {
base::uc16 c = quarks[j];
if (IsIgnoreCase(flags)) {
c = unibrow::Latin1::TryConvertToLatin1(c);
}
if (c > unibrow::Latin1::kMaxChar) return set_replacement(nullptr);
// Replace quark in case we converted to Latin-1.
uc16* writable_quarks = const_cast<uc16*>(quarks.begin());
base::uc16* writable_quarks = const_cast<base::uc16*>(quarks.begin());
writable_quarks[j] = c;
}
} else {
@ -1868,8 +1892,7 @@ RegExpNode* TextNode::FilterOneByte(int depth) {
if (range_count != 0 && ranges->at(0).from() == 0 &&
ranges->at(0).to() >= String::kMaxOneByteCharCode) {
// This will be handled in a later filter.
if (IgnoreCase(cc->flags()) &&
RangesContainLatin1Equivalents(ranges)) {
if (IsIgnoreCase(flags) && RangesContainLatin1Equivalents(ranges)) {
continue;
}
return set_replacement(nullptr);
@ -1878,8 +1901,7 @@ RegExpNode* TextNode::FilterOneByte(int depth) {
if (range_count == 0 ||
ranges->at(0).from() > String::kMaxOneByteCharCode) {
// This will be handled in a later filter.
if (IgnoreCase(cc->flags()) &&
RangesContainLatin1Equivalents(ranges)) {
if (IsIgnoreCase(flags) && RangesContainLatin1Equivalents(ranges)) {
continue;
}
return set_replacement(nullptr);
@ -1887,26 +1909,27 @@ RegExpNode* TextNode::FilterOneByte(int depth) {
}
}
}
return FilterSuccessor(depth - 1);
return FilterSuccessor(depth - 1, flags);
}
// One-byte filtering for a loop node. The continuation after the loop is
// filtered first; if it has no replacement (nullptr) the whole loop node is
// replaced by nullptr as well. Otherwise the work is handed to
// ChoiceNode::FilterOneByte().
// This hunk shows both the variant without `flags` and the variant that
// passes RegExpFlags down to every recursive call.
RegExpNode* LoopChoiceNode::FilterOneByte(int depth) {
RegExpNode* LoopChoiceNode::FilterOneByte(int depth, RegExpFlags flags) {
// Reuse a replacement that was already calculated.
if (info()->replacement_calculated) return replacement();
// Depth budget exhausted, or node already being visited: leave it as-is.
if (depth < 0) return this;
if (info()->visited) return this;
{
// The marker is scoped to this block, so it is gone again before the
// ChoiceNode::FilterOneByte() call below.
VisitMarker marker(info());
RegExpNode* continue_replacement = continue_node_->FilterOneByte(depth - 1);
RegExpNode* continue_replacement =
continue_node_->FilterOneByte(depth - 1, flags);
// If we can't continue after the loop then there is no sense in doing the
// loop.
if (continue_replacement == nullptr) return set_replacement(nullptr);
}
return ChoiceNode::FilterOneByte(depth - 1);
return ChoiceNode::FilterOneByte(depth - 1, flags);
}
RegExpNode* ChoiceNode::FilterOneByte(int depth) {
RegExpNode* ChoiceNode::FilterOneByte(int depth, RegExpFlags flags) {
if (info()->replacement_calculated) return replacement();
if (depth < 0) return this;
if (info()->visited) return this;
@ -1926,7 +1949,8 @@ RegExpNode* ChoiceNode::FilterOneByte(int depth) {
RegExpNode* survivor = nullptr;
for (int i = 0; i < choice_count; i++) {
GuardedAlternative alternative = alternatives_->at(i);
RegExpNode* replacement = alternative.node()->FilterOneByte(depth - 1);
RegExpNode* replacement =
alternative.node()->FilterOneByte(depth - 1, flags);
DCHECK(replacement != this); // No missing EMPTY_MATCH_CHECK.
if (replacement != nullptr) {
alternatives_->at(i).set_node(replacement);
@ -1946,7 +1970,7 @@ RegExpNode* ChoiceNode::FilterOneByte(int depth) {
zone()->New<ZoneList<GuardedAlternative>>(surviving, zone());
for (int i = 0; i < choice_count; i++) {
RegExpNode* replacement =
alternatives_->at(i).node()->FilterOneByte(depth - 1);
alternatives_->at(i).node()->FilterOneByte(depth - 1, flags);
if (replacement != nullptr) {
alternatives_->at(i).set_node(replacement);
new_alternatives->Add(alternatives_->at(i), zone());
@ -1956,7 +1980,8 @@ RegExpNode* ChoiceNode::FilterOneByte(int depth) {
return this;
}
RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth) {
RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth,
RegExpFlags flags) {
if (info()->replacement_calculated) return replacement();
if (depth < 0) return this;
if (info()->visited) return this;
@ -1964,12 +1989,12 @@ RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth) {
// Alternative 0 is the negative lookahead, alternative 1 is what comes
// afterwards.
RegExpNode* node = continue_node();
RegExpNode* replacement = node->FilterOneByte(depth - 1);
RegExpNode* replacement = node->FilterOneByte(depth - 1, flags);
if (replacement == nullptr) return set_replacement(nullptr);
alternatives_->at(kContinueIndex).set_node(replacement);
RegExpNode* neg_node = lookaround_node();
RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1);
RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1, flags);
// If the negative lookahead is always going to fail then
// we don't need to check it.
if (neg_replacement == nullptr) return set_replacement(replacement);
@ -2062,7 +2087,8 @@ namespace {
void EmitWordCheck(RegExpMacroAssembler* assembler, Label* word,
Label* non_word, bool fall_through_on_word) {
if (assembler->CheckSpecialCharacterClass(
fall_through_on_word ? 'w' : 'W',
fall_through_on_word ? StandardCharacterSet::kWord
: StandardCharacterSet::kNotWord,
fall_through_on_word ? non_word : word)) {
// Optimized implementation available.
return;
@ -2108,7 +2134,8 @@ void EmitHat(RegExpCompiler* compiler, RegExpNode* on_success, Trace* trace) {
const bool can_skip_bounds_check = !may_be_at_or_before_subject_string_start;
assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,
new_trace.backtrack(), can_skip_bounds_check);
if (!assembler->CheckSpecialCharacterClass('n', new_trace.backtrack())) {
if (!assembler->CheckSpecialCharacterClass(
StandardCharacterSet::kLineTerminator, new_trace.backtrack())) {
// Newline means \n, \r, 0x2028 or 0x2029.
if (!compiler->one_byte()) {
assembler->CheckCharacterAfterAnd(0x2028, 0xFFFE, &ok);
@ -2253,18 +2280,22 @@ void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
on_success()->Emit(compiler, trace);
}
// File-local helpers. This hunk shows them both as `static` functions and as
// members of an anonymous namespace.
//
// DeterminedAlready: true only when a quick check is supplied, `offset` lies
// within the characters it covers, and the position at `offset` has
// determines_perfectly set.
static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
namespace {
bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
// No quick check at all: nothing has been determined.
if (quick_check == nullptr) return false;
// Offset is beyond the characters the quick check covers.
if (offset >= quick_check->characters()) return false;
return quick_check->positions(offset)->determines_perfectly;
}
// UpdateBoundsCheck: raises *checked_up_to to `index` when `index` is larger;
// never lowers it.
static void UpdateBoundsCheck(int index, int* checked_up_to) {
void UpdateBoundsCheck(int index, int* checked_up_to) {
if (index > *checked_up_to) {
*checked_up_to = index;
}
}
} // namespace
// We call this repeatedly to generate code for each pass over the text node.
// The passes are in increasing order of difficulty because we hope one
// of the first passes will fail in which case we are saved the work of the
@ -2308,13 +2339,13 @@ void TextNode::TextEmitPass(RegExpCompiler* compiler, TextEmitPassType pass,
TextElement elm = elements()->at(i);
int cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset;
if (elm.text_type() == TextElement::ATOM) {
if (SkipPass(pass, elm.atom()->ignore_case())) continue;
Vector<const uc16> quarks = elm.atom()->data();
if (SkipPass(pass, IsIgnoreCase(compiler->flags()))) continue;
base::Vector<const base::uc16> quarks = elm.atom()->data();
for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
if (first_element_checked && i == 0 && j == 0) continue;
if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;
uc16 quark = quarks[j];
if (elm.atom()->ignore_case()) {
base::uc16 quark = quarks[j];
if (IsIgnoreCase(compiler->flags())) {
// Everywhere else we assume that a non-Latin-1 character cannot match
// a Latin-1 character. Avoid the cases where this assumption is
// invalid by using the Latin1 equivalent instead.
@ -2383,29 +2414,38 @@ bool TextNode::SkipPass(TextEmitPassType pass, bool ignore_case) {
// Factory helpers that allocate TextNodes in `zone`.
// NOTE(review): this is a rendered diff hunk with old and new lines
// interleaved. The lines that pass `flags` to RegExpCharacterClass or take a
// JSRegExp::Flags parameter appear to be the pre-change side -- confirm
// against the hunk header before relying on any single line.
//
// CreateForCharacterRanges: returns a TextNode over a single
// RegExpCharacterClass built from `ranges`; `ranges` must not be null.
TextNode* TextNode::CreateForCharacterRanges(Zone* zone,
ZoneList<CharacterRange>* ranges,
bool read_backward,
RegExpNode* on_success,
JSRegExp::Flags flags) {
RegExpNode* on_success) {
DCHECK_NOT_NULL(ranges);
ZoneList<TextElement>* elms = zone->New<ZoneList<TextElement>>(1, zone);
// TODO(jgruber): There's no fundamental need to create this
// RegExpCharacterClass; we could refactor to avoid the allocation.
return zone->New<TextNode>(zone->New<RegExpCharacterClass>(zone, ranges),
read_backward, on_success);
}
// CreateForSurrogatePair (single lead range, list of trail ranges): wraps
// `lead` in a one-element range list and returns a TextNode with two
// character-class elements, lead first and trail second.
TextNode* TextNode::CreateForSurrogatePair(
Zone* zone, CharacterRange lead, ZoneList<CharacterRange>* trail_ranges,
bool read_backward, RegExpNode* on_success) {
ZoneList<CharacterRange>* lead_ranges = CharacterRange::List(zone, lead);
ZoneList<TextElement>* elms = zone->New<ZoneList<TextElement>>(2, zone);
elms->Add(TextElement::CharClass(
zone->New<RegExpCharacterClass>(zone, ranges, flags)),
zone->New<RegExpCharacterClass>(zone, lead_ranges)),
zone);
elms->Add(TextElement::CharClass(
zone->New<RegExpCharacterClass>(zone, trail_ranges)),
zone);
return zone->New<TextNode>(elms, read_backward, on_success);
}
// CreateForSurrogatePair (list of lead ranges, single trail range): the
// mirror image of the overload above; `trail` is wrapped in a one-element
// range list. The signature taking two single CharacterRanges plus
// JSRegExp::Flags is also shown in this hunk.
TextNode* TextNode::CreateForSurrogatePair(Zone* zone, CharacterRange lead,
CharacterRange trail,
bool read_backward,
RegExpNode* on_success,
JSRegExp::Flags flags) {
ZoneList<CharacterRange>* lead_ranges = CharacterRange::List(zone, lead);
TextNode* TextNode::CreateForSurrogatePair(
Zone* zone, ZoneList<CharacterRange>* lead_ranges, CharacterRange trail,
bool read_backward, RegExpNode* on_success) {
ZoneList<CharacterRange>* trail_ranges = CharacterRange::List(zone, trail);
ZoneList<TextElement>* elms = zone->New<ZoneList<TextElement>>(2, zone);
elms->Add(TextElement::CharClass(
zone->New<RegExpCharacterClass>(zone, lead_ranges, flags)),
zone->New<RegExpCharacterClass>(zone, lead_ranges)),
zone);
elms->Add(TextElement::CharClass(
zone->New<RegExpCharacterClass>(zone, trail_ranges, flags)),
zone->New<RegExpCharacterClass>(zone, trail_ranges)),
zone);
return zone->New<TextNode>(elms, read_backward, on_success);
}
@ -2479,26 +2519,23 @@ void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
bound_checked_up_to_ = std::max(0, bound_checked_up_to_ - by);
}
// Adds case equivalents to the ranges of every non-standard character-class
// element of this text node.
// This hunk interleaves two variants: one decides per element from
// cc->flags(), the other takes RegExpFlags and decides once up front.
void TextNode::MakeCaseIndependent(Isolate* isolate, bool is_one_byte) {
void TextNode::MakeCaseIndependent(Isolate* isolate, bool is_one_byte,
RegExpFlags flags) {
// Nothing to do unless the ignore-case flag is set.
if (!IsIgnoreCase(flags)) return;
#ifdef V8_INTL_SUPPORT
// With unicode + ignore-case no equivalents are added here.
if (NeedsUnicodeCaseEquivalents(flags)) return;
#endif
int element_count = elements()->length();
for (int i = 0; i < element_count; i++) {
TextElement elm = elements()->at(i);
// Only character classes are widened; other element types are skipped.
if (elm.text_type() == TextElement::CHAR_CLASS) {
RegExpCharacterClass* cc = elm.char_class();
#ifdef V8_INTL_SUPPORT
bool case_equivalents_already_added =
NeedsUnicodeCaseEquivalents(cc->flags());
#else
bool case_equivalents_already_added = false;
#endif
if (IgnoreCase(cc->flags()) && !case_equivalents_already_added) {
// None of the standard character classes is different in the case
// independent case and it slows us down if we don't know that.
if (cc->is_standard(zone())) continue;
ZoneList<CharacterRange>* ranges = cc->ranges(zone());
CharacterRange::AddCaseEquivalents(isolate, zone(), ranges,
is_one_byte);
}
// None of the standard character classes is different in the case
// independent case and it slows us down if we don't know that.
if (cc->is_standard(zone())) continue;
ZoneList<CharacterRange>* ranges = cc->ranges(zone());
CharacterRange::AddCaseEquivalents(isolate, zone(), ranges, is_one_byte);
}
}
}
@ -2518,7 +2555,7 @@ RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(
return ranges->length() == 0 ? on_success() : nullptr;
}
if (ranges->length() != 1) return nullptr;
const uc32 max_char = MaxCodeUnit(compiler->one_byte());
const base::uc32 max_char = MaxCodeUnit(compiler->one_byte());
return ranges->at(0).IsEverything(max_char) ? on_success() : nullptr;
}
@ -2681,7 +2718,7 @@ ContainedInLattice AddRange(ContainedInLattice containment, const int* ranges,
}
int BitsetFirstSetBit(BoyerMoorePositionInfo::Bitset bitset) {
STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize ==
static_assert(BoyerMoorePositionInfo::kMapSize ==
2 * kInt64Size * kBitsPerByte);
// Slight fiddling is needed here, since the bitset is of length 128 while
@ -2692,7 +2729,7 @@ int BitsetFirstSetBit(BoyerMoorePositionInfo::Bitset bitset) {
{
static constexpr BoyerMoorePositionInfo::Bitset mask(~uint64_t{0});
BoyerMoorePositionInfo::Bitset masked_bitset = bitset & mask;
STATIC_ASSERT(kInt64Size >= sizeof(decltype(masked_bitset.to_ullong())));
static_assert(kInt64Size >= sizeof(decltype(masked_bitset.to_ullong())));
uint64_t lsb = masked_bitset.to_ullong();
if (lsb != 0) return base::bits::CountTrailingZeros(lsb);
}
@ -3436,7 +3473,7 @@ void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
RecursionCheck rc(compiler);
DCHECK_EQ(start_reg_ + 1, end_reg_);
if (IgnoreCase(flags_)) {
if (IsIgnoreCase(flags_)) {
bool unicode = IsUnicode(flags_);
assembler->CheckNotBackReferenceIgnoreCase(start_reg_, read_backward(),
unicode, trace->backtrack());
@ -3626,9 +3663,10 @@ class EatsAtLeastPropagator : public AllStatic {
template <typename... Propagators>
class Analysis : public NodeVisitor {
public:
Analysis(Isolate* isolate, bool is_one_byte)
Analysis(Isolate* isolate, bool is_one_byte, RegExpFlags flags)
: isolate_(isolate),
is_one_byte_(is_one_byte),
flags_(flags),
error_(RegExpError::kNone) {}
void EnsureAnalyzed(RegExpNode* that) {
@ -3669,7 +3707,7 @@ class Analysis : public NodeVisitor {
} while (false)
void VisitText(TextNode* that) override {
that->MakeCaseIndependent(isolate(), is_one_byte_);
that->MakeCaseIndependent(isolate(), is_one_byte_, flags_);
EnsureAnalyzed(that->on_success());
if (has_failed()) return;
that->CalculateOffsets();
@ -3736,16 +3774,17 @@ class Analysis : public NodeVisitor {
private:
Isolate* isolate_;
bool is_one_byte_;
const bool is_one_byte_;
const RegExpFlags flags_;
RegExpError error_;
DISALLOW_IMPLICIT_CONSTRUCTORS(Analysis);
};
RegExpError AnalyzeRegExp(Isolate* isolate, bool is_one_byte,
RegExpError AnalyzeRegExp(Isolate* isolate, bool is_one_byte, RegExpFlags flags,
RegExpNode* node) {
Analysis<AssertionPropagator, EatsAtLeastPropagator> analysis(isolate,
is_one_byte);
Analysis<AssertionPropagator, EatsAtLeastPropagator> analysis(
isolate, is_one_byte, flags);
DCHECK_EQ(node->info()->been_analyzed, false);
analysis.EnsureAnalyzed(node);
DCHECK_IMPLIES(analysis.has_failed(), analysis.error() != RegExpError::kNone);
@ -3761,7 +3800,7 @@ void BackReferenceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
SaveBMInfo(bm, not_at_start, offset);
}
STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize ==
static_assert(BoyerMoorePositionInfo::kMapSize ==
RegExpMacroAssembler::kTableSize);
void ChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
@ -3798,14 +3837,14 @@ void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget,
if (initial_offset == 0) set_bm_info(not_at_start, bm);
return;
}
uc16 character = atom->data()[j];
if (IgnoreCase(atom->flags())) {
base::uc16 character = atom->data()[j];
if (IsIgnoreCase(bm->compiler()->flags())) {
unibrow::uchar chars[4];
int length = GetCaseIndependentLetters(
isolate, character, bm->max_char() == String::kMaxOneByteCharCode,
chars, 4);
for (int j = 0; j < length; j++) {
bm->Set(offset, chars[j]);
for (int k = 0; k < length; k++) {
bm->Set(offset, chars[k]);
}
} else {
if (character <= max_char) bm->Set(offset, character);
@ -3838,7 +3877,7 @@ void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget,
}
RegExpNode* RegExpCompiler::OptionallyStepBackToLeadSurrogate(
RegExpNode* on_success, JSRegExp::Flags flags) {
RegExpNode* on_success) {
DCHECK(!read_backward());
ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
zone(), CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
@ -3850,11 +3889,11 @@ RegExpNode* RegExpCompiler::OptionallyStepBackToLeadSurrogate(
int stack_register = UnicodeLookaroundStackRegister();
int position_register = UnicodeLookaroundPositionRegister();
RegExpNode* step_back = TextNode::CreateForCharacterRanges(
zone(), lead_surrogates, true, on_success, flags);
zone(), lead_surrogates, true, on_success);
RegExpLookaround::Builder builder(true, step_back, stack_register,
position_register);
RegExpNode* match_trail = TextNode::CreateForCharacterRanges(
zone(), trail_surrogates, false, builder.on_match_success(), flags);
zone(), trail_surrogates, false, builder.on_match_success());
optional_step_back->AddAlternative(
GuardedAlternative(builder.ForMatch(match_trail)));
@ -3864,7 +3903,7 @@ RegExpNode* RegExpCompiler::OptionallyStepBackToLeadSurrogate(
}
RegExpNode* RegExpCompiler::PreprocessRegExp(RegExpCompileData* data,
JSRegExp::Flags flags,
RegExpFlags flags,
bool is_one_byte) {
// Wrap the body of the regexp in capture #0.
RegExpNode* captured_body =
@ -3873,11 +3912,10 @@ RegExpNode* RegExpCompiler::PreprocessRegExp(RegExpCompileData* data,
if (!data->tree->IsAnchoredAtStart() && !IsSticky(flags)) {
// Add a .*? at the beginning, outside the body capture, unless
// this expression is anchored at the beginning or sticky.
JSRegExp::Flags default_flags = JSRegExp::Flags();
RegExpNode* loop_node = RegExpQuantifier::ToNode(
0, RegExpTree::kInfinity, false,
zone()->New<RegExpCharacterClass>('*', default_flags), this,
captured_body, data->contains_anchor);
zone()->New<RegExpCharacterClass>(StandardCharacterSet::kEverything),
this, captured_body, data->contains_anchor);
if (data->contains_anchor) {
// Unroll loop once, to take care of the case that might start
@ -3885,27 +3923,33 @@ RegExpNode* RegExpCompiler::PreprocessRegExp(RegExpCompileData* data,
ChoiceNode* first_step_node = zone()->New<ChoiceNode>(2, zone());
first_step_node->AddAlternative(GuardedAlternative(captured_body));
first_step_node->AddAlternative(GuardedAlternative(zone()->New<TextNode>(
zone()->New<RegExpCharacterClass>('*', default_flags), false,
loop_node)));
zone()->New<RegExpCharacterClass>(StandardCharacterSet::kEverything),
false, loop_node)));
node = first_step_node;
} else {
node = loop_node;
}
}
if (is_one_byte) {
node = node->FilterOneByte(RegExpCompiler::kMaxRecursion);
node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, flags);
// Do it again to propagate the new nodes to places where they were not
// put because they had not been calculated yet.
if (node != nullptr) {
node = node->FilterOneByte(RegExpCompiler::kMaxRecursion);
node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, flags);
}
} else if (IsUnicode(flags) && (IsGlobal(flags) || IsSticky(flags))) {
node = OptionallyStepBackToLeadSurrogate(node, flags);
node = OptionallyStepBackToLeadSurrogate(node);
}
if (node == nullptr) node = zone()->New<EndNode>(EndNode::BACKTRACK, zone());
return node;
}
void RegExpCompiler::ToNodeCheckForStackOverflow() {
if (StackLimitCheck{isolate()}.HasOverflowed()) {
V8::FatalProcessOutOfMemory(isolate(), "RegExpCompiler");
}
}
} // namespace internal
} // namespace v8

Просмотреть файл

@ -20,7 +20,7 @@ namespace regexp_compiler_constants {
// The '2' variant has inclusive from and exclusive to.
// This covers \s as defined in ECMA-262 5.1, 15.10.2.12,
// which include WhiteSpace (7.2) or LineTerminator (7.3) values.
constexpr uc32 kRangeEndMarker = 0x110000;
constexpr base::uc32 kRangeEndMarker = 0x110000;
constexpr int kSpaceRanges[] = {
'\t', '\r' + 1, ' ', ' ' + 1, 0x00A0, 0x00A1, 0x1680,
0x1681, 0x2000, 0x200B, 0x2028, 0x202A, 0x202F, 0x2030,
@ -47,34 +47,10 @@ constexpr int kPatternTooShortForBoyerMoore = 2;
} // namespace regexp_compiler_constants
// Flag predicates. Each of the first six helpers tests exactly one
// JSRegExp flag bit and returns whether it is set.
// NOTE(review): the hunk header shows this region shrinking from 34 to 10
// lines, so the JSRegExp::Flags helpers are presumably the removed side and
// only the RegExpFlags-based NeedsUnicodeCaseEquivalents remains -- confirm.
inline bool IgnoreCase(JSRegExp::Flags flags) {
return (flags & JSRegExp::kIgnoreCase) != 0;
}
inline bool IsUnicode(JSRegExp::Flags flags) {
return (flags & JSRegExp::kUnicode) != 0;
}
inline bool IsSticky(JSRegExp::Flags flags) {
return (flags & JSRegExp::kSticky) != 0;
}
inline bool IsGlobal(JSRegExp::Flags flags) {
return (flags & JSRegExp::kGlobal) != 0;
}
inline bool DotAll(JSRegExp::Flags flags) {
return (flags & JSRegExp::kDotAll) != 0;
}
inline bool Multiline(JSRegExp::Flags flags) {
return (flags & JSRegExp::kMultiline) != 0;
}
// True when both the unicode and the ignore-case flags are set.
inline bool NeedsUnicodeCaseEquivalents(JSRegExp::Flags flags) {
inline bool NeedsUnicodeCaseEquivalents(RegExpFlags flags) {
// Both unicode and ignore_case flags are set. We need to use ICU to find
// the closure over case equivalents.
return IsUnicode(flags) && IgnoreCase(flags);
return IsUnicode(flags) && IsIgnoreCase(flags);
}
// Details of a quick mask-compare check that can look ahead in the
@ -95,8 +71,8 @@ class QuickCheckDetails {
void set_cannot_match() { cannot_match_ = true; }
// One entry of a quick check. A default-constructed Position has mask and
// value set to 0 and determines_perfectly set to false.
// This hunk lists the mask/value fields twice: once typed `uc32`, once typed
// `base::uc32`.
struct Position {
Position() : mask(0), value(0), determines_perfectly(false) {}
uc32 mask;
uc32 value;
base::uc32 mask;
base::uc32 value;
// NOTE(review): presumably set when the mask/value comparison alone fully
// decides this position -- confirm against the users of this field.
bool determines_perfectly;
};
int characters() { return characters_; }
@ -422,7 +398,8 @@ struct PreloadState {
// Analysis performs assertion propagation and computes eats_at_least_ values.
// See the comments on AssertionPropagator and EatsAtLeastPropagator for more
// details.
RegExpError AnalyzeRegExp(Isolate* isolate, bool is_one_byte, RegExpNode* node);
RegExpError AnalyzeRegExp(Isolate* isolate, bool is_one_byte, RegExpFlags flags,
RegExpNode* node);
class FrequencyCollator {
public:
@ -472,7 +449,7 @@ class FrequencyCollator {
class RegExpCompiler {
public:
RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
bool is_one_byte);
RegExpFlags flags, bool is_one_byte);
int AllocateRegister() {
if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
@ -524,13 +501,12 @@ class RegExpCompiler {
// - Inserting the implicit .* before/after the regexp if necessary.
// - If the input is a one-byte string, filtering out nodes that can't match.
// - Fixing up regexp matches that start within a surrogate pair.
RegExpNode* PreprocessRegExp(RegExpCompileData* data, JSRegExp::Flags flags,
RegExpNode* PreprocessRegExp(RegExpCompileData* data, RegExpFlags flags,
bool is_one_byte);
// If the regexp matching starts within a surrogate pair, step back to the
// lead surrogate and start matching from there.
RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpNode* on_success,
JSRegExp::Flags flags);
RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpNode* on_success);
inline void AddWork(RegExpNode* node) {
if (!node->on_work_list() && !node->label()->is_bound()) {
@ -551,6 +527,8 @@ class RegExpCompiler {
inline void IncrementRecursionDepth() { recursion_depth_++; }
inline void DecrementRecursionDepth() { recursion_depth_--; }
RegExpFlags flags() const { return flags_; }
void SetRegExpTooBig() { reg_exp_too_big_ = true; }
inline bool one_byte() { return one_byte_; }
@ -569,6 +547,18 @@ class RegExpCompiler {
current_expansion_factor_ = value;
}
// The recursive nature of ToNode node generation means we may run into stack
// overflow issues. We introduce periodic checks to detect these, and the
// tick counter helps limit overhead of these checks.
// TODO(jgruber): This is super hacky and should be replaced by an abort
// mechanism or iterative node generation.
void ToNodeMaybeCheckForStackOverflow() {
  // Every call advances the tick counter; only ticks that are a multiple of
  // 16 (including the very first, tick 0) run the actual stack check.
  const int tick = to_node_overflow_check_ticks_++;
  if (tick % 16 != 0) return;
  ToNodeCheckForStackOverflow();
}
void ToNodeCheckForStackOverflow();
Isolate* isolate() const { return isolate_; }
Zone* zone() const { return zone_; }
@ -581,10 +571,12 @@ class RegExpCompiler {
int unicode_lookaround_position_register_;
ZoneVector<RegExpNode*>* work_list_;
int recursion_depth_;
const RegExpFlags flags_;
RegExpMacroAssembler* macro_assembler_;
bool one_byte_;
bool reg_exp_too_big_;
bool limiting_recursion_;
int to_node_overflow_check_ticks_ = 0;
bool optimize_;
bool read_backward_;
int current_expansion_factor_;

Просмотреть файл

@ -127,9 +127,9 @@ void DotPrinterImpl::VisitText(TextNode* that) {
TextElement elm = that->elements()->at(i);
switch (elm.text_type()) {
case TextElement::ATOM: {
Vector<const uc16> data = elm.atom()->data();
for (int i = 0; i < data.length(); i++) {
os_ << static_cast<char>(data[i]);
base::Vector<const base::uc16> data = elm.atom()->data();
for (int j = 0; j < data.length(); j++) {
os_ << static_cast<char>(data[j]);
}
break;
}

Просмотреть файл

@ -52,6 +52,11 @@ enum class RegExpError : uint32_t {
V8_EXPORT_PRIVATE const char* RegExpErrorString(RegExpError error);
// Returns true for the two stack-overflow error codes
// (RegExpError::kStackOverflow and RegExpError::kAnalysisStackOverflow) and
// false for every other RegExpError value.
inline constexpr bool RegExpErrorIsStackOverflow(RegExpError error) {
  const bool is_stack_overflow = error == RegExpError::kStackOverflow;
  const bool is_analysis_stack_overflow =
      error == RegExpError::kAnalysisStackOverflow;
  return is_stack_overflow || is_analysis_stack_overflow;
}
} // namespace internal
} // namespace v8

Просмотреть файл

@ -28,12 +28,13 @@ namespace internal {
namespace {
bool BackRefMatchesNoCase(Isolate* isolate, int from, int current, int len,
Vector<const uc16> subject, bool unicode) {
base::Vector<const base::uc16> subject,
bool unicode) {
Address offset_a =
reinterpret_cast<Address>(const_cast<uc16*>(&subject.at(from)));
reinterpret_cast<Address>(const_cast<base::uc16*>(&subject.at(from)));
Address offset_b =
reinterpret_cast<Address>(const_cast<uc16*>(&subject.at(current)));
size_t length = len * kUC16Size;
reinterpret_cast<Address>(const_cast<base::uc16*>(&subject.at(current)));
size_t length = len * base::kUC16Size;
bool result = unicode
? RegExpMacroAssembler::CaseInsensitiveCompareUnicode(
@ -44,7 +45,7 @@ bool BackRefMatchesNoCase(Isolate* isolate, int from, int current, int len,
}
bool BackRefMatchesNoCase(Isolate* isolate, int from, int current, int len,
Vector<const uint8_t> subject, bool unicode) {
base::Vector<const uint8_t> subject, bool unicode) {
// For Latin1 characters the unicode flag makes no difference.
for (int i = 0; i < len; i++) {
unsigned int old_char = subject[from++];
@ -170,7 +171,7 @@ class InterpreterRegisters {
output_register_count_(output_register_count) {
// TODO(jgruber): Use int32_t consistently for registers. Currently, CSA
// uses int32_t while runtime uses int.
STATIC_ASSERT(sizeof(int) == sizeof(int32_t));
static_assert(sizeof(int) == sizeof(int32_t));
DCHECK_GE(output_register_count, 2); // At least 2 for the match itself.
DCHECK_GE(total_register_count, output_register_count);
DCHECK_LE(total_register_count, RegExpMacroAssembler::kMaxRegisterCount);
@ -222,7 +223,7 @@ void UpdateCodeAndSubjectReferences(
Isolate* isolate, Handle<ByteArray> code_array,
Handle<String> subject_string, ByteArray* code_array_out,
const byte** code_base_out, const byte** pc_out, String* subject_string_out,
Vector<const Char>* subject_string_vector_out) {
base::Vector<const Char>* subject_string_vector_out) {
DisallowGarbageCollection no_gc;
if (*code_base_out != code_array->GetDataStartAddress()) {
@ -244,7 +245,7 @@ template <typename Char>
IrregexpInterpreter::Result HandleInterrupts(
Isolate* isolate, RegExp::CallOrigin call_origin, ByteArray* code_array_out,
String* subject_string_out, const byte** code_base_out,
Vector<const Char>* subject_string_vector_out, const byte** pc_out) {
base::Vector<const Char>* subject_string_vector_out, const byte** pc_out) {
DisallowGarbageCollection no_gc;
StackLimitCheck check(isolate);
@ -282,8 +283,8 @@ IrregexpInterpreter::Result HandleInterrupts(
return IrregexpInterpreter::EXCEPTION;
}
// If we changed between a LATIN1 and a UC16 string, we need to restart
// regexp matching with the appropriate template instantiation of
// If we changed between a LATIN1 and a UC16 string, we need to
// restart regexp matching with the appropriate template instantiation of
// RawMatch.
if (String::IsOneByteRepresentationUnderneath(*subject_handle) !=
was_one_byte) {
@ -373,7 +374,7 @@ bool IndexIsInBounds(int index, int length) {
template <typename Char>
IrregexpInterpreter::Result RawMatch(
Isolate* isolate, ByteArray code_array, String subject_string,
Vector<const Char> subject, int* output_registers,
base::Vector<const Char> subject, int* output_registers,
int output_register_count, int total_register_count, int current,
uint32_t current_char, RegExp::CallOrigin call_origin,
const uint32_t backtrack_limit) {
@ -414,8 +415,8 @@ IrregexpInterpreter::Result RawMatch(
base::bits::RoundUpToPowerOfTwo32(kRegExpBytecodeCount));
// Make sure every bytecode we get by using BYTECODE_MASK is well defined.
STATIC_ASSERT(kRegExpBytecodeCount <= kRegExpPaddedBytecodeCount);
STATIC_ASSERT(kRegExpBytecodeCount + kRegExpBytecodeFillerCount ==
static_assert(kRegExpBytecodeCount <= kRegExpPaddedBytecodeCount);
static_assert(kRegExpBytecodeCount + kRegExpBytecodeFillerCount ==
kRegExpPaddedBytecodeCount);
#define DECLARE_DISPATCH_TABLE_ENTRY(name, ...) &&BC_##name,
@ -512,7 +513,7 @@ IrregexpInterpreter::Result RawMatch(
DISPATCH();
}
BYTECODE(POP_BT) {
STATIC_ASSERT(JSRegExp::kNoBacktrackLimit == 0);
static_assert(JSRegExp::kNoBacktrackLimit == 0);
if (++backtrack_count == backtrack_limit) {
int return_code = LoadPacked24Signed(insn);
return static_cast<IrregexpInterpreter::Result>(return_code);
@ -1050,12 +1051,12 @@ IrregexpInterpreter::Result IrregexpInterpreter::Match(
if (FLAG_regexp_tier_up) regexp.TierUpTick();
bool is_one_byte = String::IsOneByteRepresentationUnderneath(subject_string);
ByteArray code_array = ByteArray::cast(regexp.Bytecode(is_one_byte));
int total_register_count = regexp.MaxRegisterCount();
ByteArray code_array = ByteArray::cast(regexp.bytecode(is_one_byte));
int total_register_count = regexp.max_register_count();
return MatchInternal(isolate, code_array, subject_string, output_registers,
output_register_count, total_register_count,
start_position, call_origin, regexp.BacktrackLimit());
start_position, call_origin, regexp.backtrack_limit());
}
IrregexpInterpreter::Result IrregexpInterpreter::MatchInternal(
@ -1065,6 +1066,9 @@ IrregexpInterpreter::Result IrregexpInterpreter::MatchInternal(
uint32_t backtrack_limit) {
DCHECK(subject_string.IsFlat());
// TODO(chromium:1262676): Remove this CHECK once fixed.
CHECK(code_array.IsByteArray());
// Note: Heap allocation *is* allowed in two situations if calling from
// Runtime:
// 1. When creating & throwing a stack overflow exception. The interpreter
@ -1073,10 +1077,15 @@ IrregexpInterpreter::Result IrregexpInterpreter::MatchInternal(
// after interrupts have run.
DisallowGarbageCollection no_gc;
uc16 previous_char = '\n';
base::uc16 previous_char = '\n';
String::FlatContent subject_content = subject_string.GetFlatContent(no_gc);
// Because interrupts can result in GC and string content relocation, the
// checksum verification in FlatContent may fail even though this code is
// safe. See (2) above.
subject_content.UnsafeDisableChecksumVerification();
if (subject_content.IsOneByte()) {
Vector<const uint8_t> subject_vector = subject_content.ToOneByteVector();
base::Vector<const uint8_t> subject_vector =
subject_content.ToOneByteVector();
if (start_position != 0) previous_char = subject_vector[start_position - 1];
return RawMatch(isolate, code_array, subject_string, subject_vector,
output_registers, output_register_count,
@ -1084,7 +1093,8 @@ IrregexpInterpreter::Result IrregexpInterpreter::MatchInternal(
call_origin, backtrack_limit);
} else {
DCHECK(subject_content.IsTwoByte());
Vector<const uc16> subject_vector = subject_content.ToUC16Vector();
base::Vector<const base::uc16> subject_vector =
subject_content.ToUC16Vector();
if (start_position != 0) previous_char = subject_vector[start_position - 1];
return RawMatch(isolate, code_array, subject_string, subject_vector,
output_registers, output_register_count,
@ -1099,7 +1109,7 @@ IrregexpInterpreter::Result IrregexpInterpreter::MatchInternal(
// builtin.
IrregexpInterpreter::Result IrregexpInterpreter::MatchForCallFromJs(
Address subject, int32_t start_position, Address, Address,
int* output_registers, int32_t output_register_count, Address,
int* output_registers, int32_t output_register_count,
RegExp::CallOrigin call_origin, Isolate* isolate, Address regexp) {
DCHECK_NOT_NULL(isolate);
DCHECK_NOT_NULL(output_registers);

Просмотреть файл

@ -12,6 +12,8 @@
namespace v8 {
namespace internal {
class ByteArray;
class V8_EXPORT_PRIVATE IrregexpInterpreter : public AllStatic {
public:
enum Result {
@ -34,9 +36,8 @@ class V8_EXPORT_PRIVATE IrregexpInterpreter : public AllStatic {
// RETRY is returned if a retry through the runtime is needed (e.g. when
// interrupts have been scheduled or the regexp is marked for tier-up).
//
// Arguments input_start, input_end and backtrack_stack are
// unused. They are only passed to match the signature of the native irregex
// code.
// Arguments input_start and input_end are unused. They are only passed to
// match the signature of the native irregexp code.
//
// Arguments output_registers and output_register_count describe the results
// array, which will contain register values of all captures if SUCCESS is
@ -45,7 +46,6 @@ class V8_EXPORT_PRIVATE IrregexpInterpreter : public AllStatic {
Address input_start, Address input_end,
int* output_registers,
int32_t output_register_count,
Address backtrack_stack,
RegExp::CallOrigin call_origin,
Isolate* isolate, Address regexp);

Просмотреть файл

@ -170,9 +170,11 @@ void RegExpMacroAssemblerTracer::LoadCurrentCharacterImpl(
characters, eats_at_least);
}
namespace {
class PrintablePrinter {
public:
explicit PrintablePrinter(uc16 character) : character_(character) { }
explicit PrintablePrinter(base::uc16 character) : character_(character) {}
const char* operator*() {
if (character_ >= ' ' && character_ <= '~') {
@ -187,12 +189,14 @@ class PrintablePrinter {
}
private:
uc16 character_;
base::uc16 character_;
char buffer_[4];
};
} // namespace
void RegExpMacroAssemblerTracer::CheckCharacterLT(uc16 limit, Label* on_less) {
void RegExpMacroAssemblerTracer::CheckCharacterLT(base::uc16 limit,
Label* on_less) {
PrintablePrinter printable(limit);
PrintF(" CheckCharacterLT(c=0x%04x%s, label[%08x]);\n",
limit,
@ -201,8 +205,7 @@ void RegExpMacroAssemblerTracer::CheckCharacterLT(uc16 limit, Label* on_less) {
assembler_->CheckCharacterLT(limit, on_less);
}
void RegExpMacroAssemblerTracer::CheckCharacterGT(uc16 limit,
void RegExpMacroAssemblerTracer::CheckCharacterGT(base::uc16 limit,
Label* on_greater) {
PrintablePrinter printable(limit);
PrintF(" CheckCharacterGT(c=0x%04x%s, label[%08x]);\n",
@ -212,7 +215,6 @@ void RegExpMacroAssemblerTracer::CheckCharacterGT(uc16 limit,
assembler_->CheckCharacterGT(limit, on_greater);
}
void RegExpMacroAssemblerTracer::CheckCharacter(unsigned c, Label* on_equal) {
PrintablePrinter printable(c);
PrintF(" CheckCharacter(c=0x%04x%s, label[%08x]);\n",
@ -275,12 +277,8 @@ void RegExpMacroAssemblerTracer::CheckNotCharacterAfterAnd(
assembler_->CheckNotCharacterAfterAnd(c, mask, on_not_equal);
}
void RegExpMacroAssemblerTracer::CheckNotCharacterAfterMinusAnd(
uc16 c,
uc16 minus,
uc16 mask,
Label* on_not_equal) {
base::uc16 c, base::uc16 minus, base::uc16 mask, Label* on_not_equal) {
PrintF(" CheckNotCharacterAfterMinusAnd(c=0x%04x, minus=%04x, mask=0x%04x, "
"label[%08x]);\n",
c,
@ -290,11 +288,9 @@ void RegExpMacroAssemblerTracer::CheckNotCharacterAfterMinusAnd(
assembler_->CheckNotCharacterAfterMinusAnd(c, minus, mask, on_not_equal);
}
void RegExpMacroAssemblerTracer::CheckCharacterInRange(
uc16 from,
uc16 to,
Label* on_not_in_range) {
void RegExpMacroAssemblerTracer::CheckCharacterInRange(base::uc16 from,
base::uc16 to,
Label* on_not_in_range) {
PrintablePrinter printable_from(from);
PrintablePrinter printable_to(to);
PrintF(" CheckCharacterInRange(from=0x%04x%s, to=0x%04x%s, label[%08x]);\n",
@ -306,11 +302,9 @@ void RegExpMacroAssemblerTracer::CheckCharacterInRange(
assembler_->CheckCharacterInRange(from, to, on_not_in_range);
}
void RegExpMacroAssemblerTracer::CheckCharacterNotInRange(
uc16 from,
uc16 to,
Label* on_in_range) {
void RegExpMacroAssemblerTracer::CheckCharacterNotInRange(base::uc16 from,
base::uc16 to,
Label* on_in_range) {
PrintablePrinter printable_from(from);
PrintablePrinter printable_to(to);
PrintF(
@ -323,6 +317,40 @@ void RegExpMacroAssemblerTracer::CheckCharacterNotInRange(
assembler_->CheckCharacterNotInRange(from, to, on_in_range);
}
namespace {
// Prints one trace line per entry of `ranges`, showing the `from` and `to`
// bounds in hexadecimal, each followed by PrintablePrinter's rendering of the
// character.
void PrintRangeArray(const ZoneList<CharacterRange>* ranges) {
  for (int i = 0; i < ranges->length(); i++) {
    // The bounds are held in 16-bit locals for display; a `to` value wider
    // than 16 bits (e.g. an end-of-range marker) is narrowed by the
    // assignment.
    base::uc16 from = ranges->at(i).from();
    base::uc16 to = ranges->at(i).to();
    PrintablePrinter printable_from(from);
    PrintablePrinter printable_to(to);
    // Both bounds carry the same "0x" prefix so the hex values are
    // unambiguous and consistent with the other range traces.
    PrintF(" [from=0x%04x%s, to=0x%04x%s],\n", from, *printable_from, to,
           *printable_to);
  }
}
} // namespace
// Traces a CheckCharacterInRangeArray call (target label first, then one line
// per range) and forwards it to the wrapped assembler. Returns the wrapped
// assembler's result.
bool RegExpMacroAssemblerTracer::CheckCharacterInRangeArray(
    const ZoneList<CharacterRange>* ranges, Label* on_in_range) {
  const auto target_label = LabelToInt(on_in_range);
  PrintF(
      " CheckCharacterInRangeArray(\n"
      " label[%08x]);\n",
      target_label);
  PrintRangeArray(ranges);

  const bool emitted =
      assembler_->CheckCharacterInRangeArray(ranges, on_in_range);
  return emitted;
}
// Traces a CheckCharacterNotInRangeArray call (target label first, then one
// line per range) and forwards it to the wrapped assembler. Returns the
// wrapped assembler's result.
bool RegExpMacroAssemblerTracer::CheckCharacterNotInRangeArray(
    const ZoneList<CharacterRange>* ranges, Label* on_not_in_range) {
  const auto target_label = LabelToInt(on_not_in_range);
  PrintF(
      " CheckCharacterNotInRangeArray(\n"
      " label[%08x]);\n",
      target_label);
  PrintRangeArray(ranges);

  const bool emitted =
      assembler_->CheckCharacterNotInRangeArray(ranges, on_not_in_range);
  return emitted;
}
void RegExpMacroAssemblerTracer::CheckBitInTable(
Handle<ByteArray> table, Label* on_bit_set) {
@ -362,20 +390,16 @@ void RegExpMacroAssemblerTracer::CheckPosition(int cp_offset,
assembler_->CheckPosition(cp_offset, on_outside_input);
}
bool RegExpMacroAssemblerTracer::CheckSpecialCharacterClass(
uc16 type,
Label* on_no_match) {
StandardCharacterSet type, Label* on_no_match) {
bool supported = assembler_->CheckSpecialCharacterClass(type,
on_no_match);
PrintF(" CheckSpecialCharacterClass(type='%c', label[%08x]): %s;\n",
type,
LabelToInt(on_no_match),
static_cast<char>(type), LabelToInt(on_no_match),
supported ? "true" : "false");
return supported;
}
void RegExpMacroAssemblerTracer::IfRegisterLT(int register_index,
int comparand, Label* if_lt) {
PrintF(" IfRegisterLT(register=%d, number=%d, label[%08x]);\n",

Просмотреть файл

@ -17,7 +17,9 @@ class RegExpMacroAssemblerTracer: public RegExpMacroAssembler {
~RegExpMacroAssemblerTracer() override;
void AbortedCodeGeneration() override;
int stack_limit_slack() override { return assembler_->stack_limit_slack(); }
bool CanReadUnaligned() override { return assembler_->CanReadUnaligned(); }
bool CanReadUnaligned() const override {
return assembler_->CanReadUnaligned();
}
void AdvanceCurrentPosition(int by) override; // Signed cp change.
void AdvanceRegister(int reg, int by) override; // r[reg] += by.
void Backtrack() override;
@ -25,8 +27,8 @@ class RegExpMacroAssemblerTracer: public RegExpMacroAssembler {
void CheckCharacter(unsigned c, Label* on_equal) override;
void CheckCharacterAfterAnd(unsigned c, unsigned and_with,
Label* on_equal) override;
void CheckCharacterGT(uc16 limit, Label* on_greater) override;
void CheckCharacterLT(uc16 limit, Label* on_less) override;
void CheckCharacterGT(base::uc16 limit, Label* on_greater) override;
void CheckCharacterLT(base::uc16 limit, Label* on_less) override;
void CheckGreedyLoop(Label* on_tos_equals_current_position) override;
void CheckAtStart(int cp_offset, Label* on_at_start) override;
void CheckNotAtStart(int cp_offset, Label* on_not_at_start) override;
@ -38,14 +40,21 @@ class RegExpMacroAssemblerTracer: public RegExpMacroAssembler {
void CheckNotCharacter(unsigned c, Label* on_not_equal) override;
void CheckNotCharacterAfterAnd(unsigned c, unsigned and_with,
Label* on_not_equal) override;
void CheckNotCharacterAfterMinusAnd(uc16 c, uc16 minus, uc16 and_with,
void CheckNotCharacterAfterMinusAnd(base::uc16 c, base::uc16 minus,
base::uc16 and_with,
Label* on_not_equal) override;
void CheckCharacterInRange(uc16 from, uc16 to, Label* on_in_range) override;
void CheckCharacterNotInRange(uc16 from, uc16 to,
void CheckCharacterInRange(base::uc16 from, base::uc16 to,
Label* on_in_range) override;
void CheckCharacterNotInRange(base::uc16 from, base::uc16 to,
Label* on_not_in_range) override;
bool CheckCharacterInRangeArray(const ZoneList<CharacterRange>* ranges,
Label* on_in_range) override;
bool CheckCharacterNotInRangeArray(const ZoneList<CharacterRange>* ranges,
Label* on_not_in_range) override;
void CheckBitInTable(Handle<ByteArray> table, Label* on_bit_set) override;
void CheckPosition(int cp_offset, Label* on_outside_input) override;
bool CheckSpecialCharacterClass(uc16 type, Label* on_no_match) override;
bool CheckSpecialCharacterClass(StandardCharacterSet type,
Label* on_no_match) override;
void Fail() override;
Handle<HeapObject> GetCode(Handle<String> source) override;
void GoTo(Label* label) override;

Просмотреть файл

@ -17,12 +17,16 @@ namespace internal {
// Constructs an assembler bound to `isolate` and `zone`. The initial state is:
// slow-safe compilation disabled, no backtrack limit
// (JSRegExp::kNoBacktrackLimit), and non-global matching (NOT_GLOBAL).
RegExpMacroAssembler::RegExpMacroAssembler(Isolate* isolate, Zone* zone)
: slow_safe_compiler_(false),
backtrack_limit_(JSRegExp::kNoBacktrackLimit),
global_mode_(NOT_GLOBAL),
isolate_(isolate),
zone_(zone) {}
RegExpMacroAssembler::~RegExpMacroAssembler() = default;
// Reports whether a backtrack limit is in effect, i.e. whether
// backtrack_limit_ differs from the JSRegExp::kNoBacktrackLimit sentinel.
bool RegExpMacroAssembler::has_backtrack_limit() const {
  const bool is_unlimited = backtrack_limit_ == JSRegExp::kNoBacktrackLimit;
  return !is_unlimited;
}
// static
int RegExpMacroAssembler::CaseInsensitiveCompareNonUnicode(Address byte_offset1,
Address byte_offset2,
size_t byte_length,
@ -34,8 +38,8 @@ int RegExpMacroAssembler::CaseInsensitiveCompareNonUnicode(Address byte_offset1,
DisallowGarbageCollection no_gc;
DCHECK_EQ(0, byte_length % 2);
size_t length = byte_length / 2;
uc16* substring1 = reinterpret_cast<uc16*>(byte_offset1);
uc16* substring2 = reinterpret_cast<uc16*>(byte_offset2);
base::uc16* substring1 = reinterpret_cast<base::uc16*>(byte_offset1);
base::uc16* substring2 = reinterpret_cast<base::uc16*>(byte_offset2);
for (size_t i = 0; i < length; i++) {
UChar32 c1 = RegExpCaseFolding::Canonicalize(substring1[i]);
@ -51,6 +55,7 @@ int RegExpMacroAssembler::CaseInsensitiveCompareNonUnicode(Address byte_offset1,
#endif
}
// static
int RegExpMacroAssembler::CaseInsensitiveCompareUnicode(Address byte_offset1,
Address byte_offset2,
size_t byte_length,
@ -68,8 +73,8 @@ int RegExpMacroAssembler::CaseInsensitiveCompareUnicode(Address byte_offset1,
return uni_str_1.caseCompare(reinterpret_cast<const char16_t*>(byte_offset2),
length, U_FOLD_CASE_DEFAULT) == 0;
#else
uc16* substring1 = reinterpret_cast<uc16*>(byte_offset1);
uc16* substring2 = reinterpret_cast<uc16*>(byte_offset2);
base::uc16* substring1 = reinterpret_cast<base::uc16*>(byte_offset1);
base::uc16* substring2 = reinterpret_cast<base::uc16*>(byte_offset2);
size_t length = byte_length >> 1;
DCHECK_NOT_NULL(isolate);
unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
@ -93,6 +98,130 @@ int RegExpMacroAssembler::CaseInsensitiveCompareUnicode(Address byte_offset1,
#endif // V8_INTL_SUPPORT
}
namespace {
// Computes a hash over the `from`/`to` bounds of every range in `ranges`,
// combining them in list order.
uint32_t Hash(const ZoneList<CharacterRange>* ranges) {
  size_t combined = 0;
  const int range_count = ranges->length();
  for (int index = 0; index < range_count; ++index) {
    const CharacterRange& range = ranges->at(index);
    combined = base::hash_combine(combined, range.from(), range.to());
  }
  // The accumulated value is narrowed to the 32-bit key type.
  return static_cast<uint32_t>(combined);
}
// Reduces `c` to its low 16 bits.
constexpr base::uc32 MaskEndOfRangeMarker(base::uc32 c) {
  // A CharacterRange may end at 0x10ffff as an end-of-range marker whether or
  // not the regexp IsUnicode; that marker is the only value above 16 bits
  // expected here, and masking translates it to 0xffff.
  DCHECK_IMPLIES(c > kMaxUInt16, c == String::kMaxCodePoint);
  constexpr base::uc32 kLow16BitsMask = 0xffff;
  return c & kLow16BitsMask;
}
int RangeArrayLengthFor(const ZoneList<CharacterRange>* ranges) {
const int ranges_length = ranges->length();
return MaskEndOfRangeMarker(ranges->at(ranges_length - 1).to()) == kMaxUInt16
? ranges_length * 2 - 1
: ranges_length * 2;
}
bool Equals(const ZoneList<CharacterRange>* lhs, const Handle<ByteArray>& rhs) {
DCHECK_EQ(rhs->length() % kUInt16Size, 0); // uc16 elements.
const int rhs_length = rhs->length() / kUInt16Size;
if (rhs_length != RangeArrayLengthFor(lhs)) return false;
for (int i = 0; i < lhs->length(); i++) {
const CharacterRange& r = lhs->at(i);
if (rhs->get_uint16(i * 2 + 0) != r.from()) return false;
if (i * 2 + 1 == rhs_length) break;
if (rhs->get_uint16(i * 2 + 1) != r.to() + 1) return false;
}
return true;
}
// Allocates a ByteArray and fills it with the uint16 encoding of `ranges`:
// for each range, `from` followed by `to + 1` (exclusive upper bound). If the
// last range ends at kMaxUInt16, its upper bound is left out.
Handle<ByteArray> MakeRangeArray(Isolate* isolate,
                                 const ZoneList<CharacterRange>* ranges) {
  const int range_count = ranges->length();
  const int element_count = RangeArrayLengthFor(ranges);
  Handle<ByteArray> encoded =
      isolate->factory()->NewByteArray(element_count * kUInt16Size);

  for (int range_index = 0; range_index < range_count; ++range_index) {
    const CharacterRange& range = ranges->at(range_index);
    const int from_slot = range_index * 2;
    DCHECK_LE(range.from(), kMaxUInt16);
    encoded->set_uint16(from_slot, range.from());

    const base::uc32 inclusive_to = MaskEndOfRangeMarker(range.to());
    const bool is_last_range = range_index == range_count - 1;
    if (is_last_range && inclusive_to == kMaxUInt16) {
      // `to + 1` would not fit in 16 bits, so the last range stays open-ended.
      DCHECK_EQ(element_count, range_count * 2 - 1);
      break;
    }
    DCHECK_LT(inclusive_to, kMaxUInt16);
    encoded->set_uint16(from_slot + 1, inclusive_to + 1);  // Exclusive.
  }
  return encoded;
}
} // namespace
// Returns the encoded range array for `ranges`. An array already stored in
// range_array_cache_ under the same hash is reused when its contents match;
// otherwise a new array is built and stored under that hash.
Handle<ByteArray> NativeRegExpMacroAssembler::GetOrAddRangeArray(
    const ZoneList<CharacterRange>* ranges) {
  const uint32_t key = Hash(ranges);

  const auto cached = range_array_cache_.find(key);
  // Equal hashes do not imply equal ranges, so the contents are compared too.
  if (cached != range_array_cache_.end() && Equals(ranges, cached->second)) {
    return cached->second;
  }

  // Cache miss or hash collision: the new array replaces any existing entry.
  Handle<ByteArray> fresh_array = MakeRangeArray(isolate(), ranges);
  range_array_cache_[key] = fresh_array;
  return fresh_array;
}
// static
// Tests whether `current_char` lies inside one of the ranges encoded in the
// ByteArray at `raw_byte_array`. Returns 1 if it does and 0 otherwise.
// `isolate` is not used by this function.
uint32_t RegExpMacroAssembler::IsCharacterInRangeArray(uint32_t current_char,
                                                       Address raw_byte_array,
                                                       Isolate* isolate) {
  // The result is a uint32_t rather than a bool to avoid complexity around
  // bool return types (which may be optimized to use only the least
  // significant byte).
  static constexpr uint32_t kTrue = 1;
  static constexpr uint32_t kFalse = 0;

  ByteArray ranges = ByteArray::cast(Object(raw_byte_array));
  DCHECK_EQ(ranges.length() % kUInt16Size, 0);  // uc16 elements.
  const int length = ranges.length() / kUInt16Size;
  DCHECK_GE(length, 1);

  // Fast paths for characters outside the span covered by the array.
  if (current_char < ranges.get_uint16(0)) return kFalse;
  if (current_char >= ranges.get_uint16(length - 1)) {
    // An odd element count means the last range is open-ended.
    const bool last_range_is_open_ended = (length % 2) != 0;
    return last_range_is_open_ended ? kTrue : kFalse;
  }

  // Binary search over the boundaries. `ranges` is encoded as
  //   [from0, to0, from1, to1, ..., fromN, toN], or
  //   [from0, to0, from1, to1, ..., fromN] (open-ended last interval).
  int lo = 0;
  int hi = length;
  int probe;
  for (;;) {
    probe = lo + (hi - lo) / 2;
    const base::uc16 boundary = ranges.get_uint16(probe);
    if (current_char == boundary) break;
    if (current_char < boundary) {
      hi = probe;
    } else {
      lo = probe + 1;
    }
    if (lo >= hi) break;
  }

  // Pick the closest boundary at or below `current_char`.
  int boundary_index = probe;
  if (current_char < ranges.get_uint16(probe)) boundary_index = probe - 1;

  // Ranges start at even indices and end at odd indices.
  return (boundary_index % 2) == 0 ? kTrue : kFalse;
}
void RegExpMacroAssembler::CheckNotInSurrogatePair(int cp_offset,
Label* on_failure) {
Label ok;
@ -124,17 +253,6 @@ void RegExpMacroAssembler::LoadCurrentCharacter(int cp_offset,
eats_at_least);
}
bool RegExpMacroAssembler::CheckSpecialCharacterClass(uc16 type,
Label* on_no_match) {
return false;
}
NativeRegExpMacroAssembler::NativeRegExpMacroAssembler(Isolate* isolate,
Zone* zone)
: RegExpMacroAssembler(isolate, zone) {}
NativeRegExpMacroAssembler::~NativeRegExpMacroAssembler() = default;
void NativeRegExpMacroAssembler::LoadCurrentCharacterImpl(
int cp_offset, Label* on_end_of_input, bool check_bounds, int characters,
int eats_at_least) {
@ -153,13 +271,14 @@ void NativeRegExpMacroAssembler::LoadCurrentCharacterImpl(
LoadCurrentCharacterUnchecked(cp_offset, characters);
}
bool NativeRegExpMacroAssembler::CanReadUnaligned() {
bool NativeRegExpMacroAssembler::CanReadUnaligned() const {
return FLAG_enable_regexp_unaligned_accesses && !slow_safe();
}
#ifndef COMPILING_IRREGEXP_FOR_EXTERNAL_EMBEDDER
// This method may only be called after an interrupt.
// static
int NativeRegExpMacroAssembler::CheckStackGuardState(
Isolate* isolate, int start_index, RegExp::CallOrigin call_origin,
Address* return_address, Code re_code, Address* subject,
@ -287,6 +406,15 @@ int NativeRegExpMacroAssembler::Match(Handle<JSRegExp> regexp,
offsets_vector_length, isolate, *regexp);
}
// static
// Test entry point: forwards all arguments unchanged to Execute() and returns
// its result.
int NativeRegExpMacroAssembler::ExecuteForTesting(
    String input, int start_offset, const byte* input_start,
    const byte* input_end, int* output, int output_size, Isolate* isolate,
    JSRegExp regexp) {
  const int result = Execute(input, start_offset, input_start, input_end,
                             output, output_size, isolate, regexp);
  return result;
}
// Returns a {Result} sentinel, or the number of successful matches.
// TODO(pthier): The JSRegExp object is passed to native irregexp code to match
// the signature of the interpreter. We should get rid of JS objects passed to
@ -295,23 +423,21 @@ int NativeRegExpMacroAssembler::Execute(
String input, // This needs to be the unpacked (sliced, cons) string.
int start_offset, const byte* input_start, const byte* input_end,
int* output, int output_size, Isolate* isolate, JSRegExp regexp) {
// Ensure that the minimum stack has been allocated.
RegExpStackScope stack_scope(isolate);
Address stack_base = stack_scope.stack()->stack_base();
bool is_one_byte = String::IsOneByteRepresentationUnderneath(input);
Code code = Code::cast(regexp.Code(is_one_byte));
Code code = FromCodeT(CodeT::cast(regexp.code(is_one_byte)));
RegExp::CallOrigin call_origin = RegExp::CallOrigin::kFromRuntime;
using RegexpMatcherSig = int(
Address input_string, int start_offset, const byte* input_start,
const byte* input_end, int* output, int output_size, Address stack_base,
int call_origin, Isolate* isolate, Address regexp);
using RegexpMatcherSig =
// NOLINTNEXTLINE(readability/casting)
int(Address input_string, int start_offset, const byte* input_start,
const byte* input_end, int* output, int output_size, int call_origin,
Isolate* isolate, Address regexp);
auto fn = GeneratedCode<RegexpMatcherSig>::FromCode(code);
int result =
fn.Call(input.ptr(), start_offset, input_start, input_end, output,
output_size, stack_base, call_origin, isolate, regexp.ptr());
int result = fn.Call(input.ptr(), start_offset, input_start, input_end,
output, output_size, call_origin, isolate, regexp.ptr());
DCHECK_GE(result, SMALLEST_REGEXP_RESULT);
if (result == EXCEPTION && !isolate->has_pending_exception()) {
@ -371,22 +497,24 @@ const byte NativeRegExpMacroAssembler::word_character_map[] = {
};
// clang-format on
Address NativeRegExpMacroAssembler::GrowStack(Address stack_pointer,
Address* stack_base,
Isolate* isolate) {
// static
Address NativeRegExpMacroAssembler::GrowStack(Isolate* isolate) {
DisallowGarbageCollection no_gc;
RegExpStack* regexp_stack = isolate->regexp_stack();
size_t size = regexp_stack->stack_capacity();
Address old_stack_base = regexp_stack->stack_base();
DCHECK(old_stack_base == *stack_base);
DCHECK(stack_pointer <= old_stack_base);
DCHECK(static_cast<size_t>(old_stack_base - stack_pointer) <= size);
Address new_stack_base = regexp_stack->EnsureCapacity(size * 2);
if (new_stack_base == kNullAddress) {
return kNullAddress;
}
*stack_base = new_stack_base;
intptr_t stack_content_size = old_stack_base - stack_pointer;
return new_stack_base - stack_content_size;
const size_t old_size = regexp_stack->memory_size();
#ifdef DEBUG
const Address old_stack_top = regexp_stack->memory_top();
const Address old_stack_pointer = regexp_stack->stack_pointer();
CHECK_LE(old_stack_pointer, old_stack_top);
CHECK_LE(static_cast<size_t>(old_stack_top - old_stack_pointer), old_size);
#endif // DEBUG
Address new_stack_base = regexp_stack->EnsureCapacity(old_size * 2);
if (new_stack_base == kNullAddress) return kNullAddress;
return regexp_stack->stack_pointer();
}
} // namespace internal

Просмотреть файл

@ -12,18 +12,17 @@
namespace v8 {
namespace internal {
static const uc32 kLeadSurrogateStart = 0xd800;
static const uc32 kLeadSurrogateEnd = 0xdbff;
static const uc32 kTrailSurrogateStart = 0xdc00;
static const uc32 kTrailSurrogateEnd = 0xdfff;
static const uc32 kNonBmpStart = 0x10000;
static const uc32 kNonBmpEnd = 0x10ffff;
struct DisjunctDecisionRow {
RegExpCharacterClass cc;
Label* on_match;
};
class ByteArray;
class JSRegExp;
class Label;
class String;
static const base::uc32 kLeadSurrogateStart = 0xd800;
static const base::uc32 kLeadSurrogateEnd = 0xdbff;
static const base::uc32 kTrailSurrogateStart = 0xdc00;
static const base::uc32 kTrailSurrogateEnd = 0xdfff;
static const base::uc32 kNonBmpStart = 0x10000;
static const base::uc32 kNonBmpEnd = 0x10ffff;
class RegExpMacroAssembler {
public:
@ -39,11 +38,134 @@ class RegExpMacroAssembler {
static constexpr int kUseCharactersValue = -1;
RegExpMacroAssembler(Isolate* isolate, Zone* zone);
virtual ~RegExpMacroAssembler() = default;
virtual Handle<HeapObject> GetCode(Handle<String> source) = 0;
// This function is called when code generation is aborted, so that
// the assembler could clean up internal data structures.
virtual void AbortedCodeGeneration() {}
// The maximal number of pushes between stack checks. Users must supply
// kCheckStackLimit flag to push operations (instead of kNoStackLimitCheck)
// at least once for every stack_limit_slack() pushes that are executed.
virtual int stack_limit_slack() = 0;
virtual bool CanReadUnaligned() const = 0;
virtual void AdvanceCurrentPosition(int by) = 0; // Signed cp change.
virtual void AdvanceRegister(int reg, int by) = 0; // r[reg] += by.
// Continues execution from the position pushed on the top of the backtrack
// stack by an earlier PushBacktrack(Label*).
virtual void Backtrack() = 0;
virtual void Bind(Label* label) = 0;
// Dispatch after looking the current character up in a 2-bits-per-entry
// map. The destinations vector has up to 4 labels.
virtual void CheckCharacter(unsigned c, Label* on_equal) = 0;
// Bitwise and the current character with the given constant and then
// check for a match with c.
virtual void CheckCharacterAfterAnd(unsigned c,
unsigned and_with,
Label* on_equal) = 0;
virtual void CheckCharacterGT(base::uc16 limit, Label* on_greater) = 0;
virtual void CheckCharacterLT(base::uc16 limit, Label* on_less) = 0;
virtual void CheckGreedyLoop(Label* on_tos_equals_current_position) = 0;
virtual void CheckAtStart(int cp_offset, Label* on_at_start) = 0;
virtual void CheckNotAtStart(int cp_offset, Label* on_not_at_start) = 0;
virtual void CheckNotBackReference(int start_reg, bool read_backward,
Label* on_no_match) = 0;
virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
bool read_backward, bool unicode,
Label* on_no_match) = 0;
// Check the current character for a match with a literal character. If we
// fail to match then goto the on_failure label. End of input always
// matches. If the label is nullptr then we should pop a backtrack address
// off the stack and go to that.
virtual void CheckNotCharacter(unsigned c, Label* on_not_equal) = 0;
virtual void CheckNotCharacterAfterAnd(unsigned c,
unsigned and_with,
Label* on_not_equal) = 0;
// Subtract a constant from the current character, then and with the given
// constant and then check for a match with c.
virtual void CheckNotCharacterAfterMinusAnd(base::uc16 c, base::uc16 minus,
base::uc16 and_with,
Label* on_not_equal) = 0;
virtual void CheckCharacterInRange(base::uc16 from,
base::uc16 to, // Both inclusive.
Label* on_in_range) = 0;
virtual void CheckCharacterNotInRange(base::uc16 from,
base::uc16 to, // Both inclusive.
Label* on_not_in_range) = 0;
// Returns true if the check was emitted, false otherwise.
virtual bool CheckCharacterInRangeArray(
const ZoneList<CharacterRange>* ranges, Label* on_in_range) = 0;
virtual bool CheckCharacterNotInRangeArray(
const ZoneList<CharacterRange>* ranges, Label* on_not_in_range) = 0;
// The current character (modulo kTableSize) is looked up in the byte
// array, and if the found byte is non-zero, we jump to the on_bit_set label.
virtual void CheckBitInTable(Handle<ByteArray> table, Label* on_bit_set) = 0;
// Checks whether the given offset from the current position is before
// the end of the string. May overwrite the current character.
virtual void CheckPosition(int cp_offset, Label* on_outside_input);
// Check whether a standard/default character class matches the current
// character. Returns false if the type of special character class does
// not have custom support.
// May clobber the current loaded character.
virtual bool CheckSpecialCharacterClass(StandardCharacterSet type,
Label* on_no_match) {
return false;
}
// Control-flow integrity:
// Define a jump target and bind a label.
virtual void BindJumpTarget(Label* label) { Bind(label); }
virtual void Fail() = 0;
virtual void GoTo(Label* label) = 0;
// Check whether a register is >= a given constant and go to a label if it
// is. Backtracks instead if the label is nullptr.
virtual void IfRegisterGE(int reg, int comparand, Label* if_ge) = 0;
// Check whether a register is < a given constant and go to a label if it is.
// Backtracks instead if the label is nullptr.
virtual void IfRegisterLT(int reg, int comparand, Label* if_lt) = 0;
// Check whether a register is == to the current position and go to a
// label if it is.
virtual void IfRegisterEqPos(int reg, Label* if_eq) = 0;
V8_EXPORT_PRIVATE void LoadCurrentCharacter(
int cp_offset, Label* on_end_of_input, bool check_bounds = true,
int characters = 1, int eats_at_least = kUseCharactersValue);
virtual void LoadCurrentCharacterImpl(int cp_offset, Label* on_end_of_input,
bool check_bounds, int characters,
int eats_at_least) = 0;
virtual void PopCurrentPosition() = 0;
virtual void PopRegister(int register_index) = 0;
// Pushes the label on the backtrack stack, so that a following Backtrack
// will go to this label. Always checks the backtrack stack limit.
virtual void PushBacktrack(Label* label) = 0;
virtual void PushCurrentPosition() = 0;
enum StackCheckFlag { kNoStackLimitCheck = false, kCheckStackLimit = true };
virtual void PushRegister(int register_index,
StackCheckFlag check_stack_limit) = 0;
virtual void ReadCurrentPositionFromRegister(int reg) = 0;
virtual void ReadStackPointerFromRegister(int reg) = 0;
virtual void SetCurrentPositionFromEnd(int by) = 0;
virtual void SetRegister(int register_index, int to) = 0;
// Return whether the matching (with a global regexp) will be restarted.
virtual bool Succeed() = 0;
virtual void WriteCurrentPositionToRegister(int reg, int cp_offset) = 0;
virtual void ClearRegisters(int reg_from, int reg_to) = 0;
virtual void WriteStackPointerToRegister(int reg) = 0;
// Check that we are not in the middle of a surrogate pair.
void CheckNotInSurrogatePair(int cp_offset, Label* on_failure);
#define IMPLEMENTATIONS_LIST(V) \
V(IA32) \
V(ARM) \
V(ARM64) \
V(MIPS) \
V(LOONG64) \
V(RISCV) \
V(S390) \
V(PPC) \
@ -65,123 +187,11 @@ class RegExpMacroAssembler {
return kNames[impl];
}
#undef IMPLEMENTATIONS_LIST
enum StackCheckFlag {
kNoStackLimitCheck = false,
kCheckStackLimit = true
};
RegExpMacroAssembler(Isolate* isolate, Zone* zone);
virtual ~RegExpMacroAssembler();
// This function is called when code generation is aborted, so that
// the assembler could clean up internal data structures.
virtual void AbortedCodeGeneration() {}
// The maximal number of pushes between stack checks. Users must supply
// kCheckStackLimit flag to push operations (instead of kNoStackLimitCheck)
// at least once for every stack_limit() pushes that are executed.
virtual int stack_limit_slack() = 0;
virtual bool CanReadUnaligned() = 0;
virtual void AdvanceCurrentPosition(int by) = 0; // Signed cp change.
virtual void AdvanceRegister(int reg, int by) = 0; // r[reg] += by.
// Continues execution from the position pushed on the top of the backtrack
// stack by an earlier PushBacktrack(Label*).
virtual void Backtrack() = 0;
virtual void Bind(Label* label) = 0;
// Dispatch after looking the current character up in a 2-bits-per-entry
// map. The destinations vector has up to 4 labels.
virtual void CheckCharacter(unsigned c, Label* on_equal) = 0;
// Bitwise and the current character with the given constant and then
// check for a match with c.
virtual void CheckCharacterAfterAnd(unsigned c,
unsigned and_with,
Label* on_equal) = 0;
virtual void CheckCharacterGT(uc16 limit, Label* on_greater) = 0;
virtual void CheckCharacterLT(uc16 limit, Label* on_less) = 0;
virtual void CheckGreedyLoop(Label* on_tos_equals_current_position) = 0;
virtual void CheckAtStart(int cp_offset, Label* on_at_start) = 0;
virtual void CheckNotAtStart(int cp_offset, Label* on_not_at_start) = 0;
virtual void CheckNotBackReference(int start_reg, bool read_backward,
Label* on_no_match) = 0;
virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
bool read_backward, bool unicode,
Label* on_no_match) = 0;
// Check the current character for a match with a literal character. If we
// fail to match then goto the on_failure label. End of input always
// matches. If the label is nullptr then we should pop a backtrack address
// off the stack and go to that.
virtual void CheckNotCharacter(unsigned c, Label* on_not_equal) = 0;
virtual void CheckNotCharacterAfterAnd(unsigned c,
unsigned and_with,
Label* on_not_equal) = 0;
// Subtract a constant from the current character, then and with the given
// constant and then check for a match with c.
virtual void CheckNotCharacterAfterMinusAnd(uc16 c,
uc16 minus,
uc16 and_with,
Label* on_not_equal) = 0;
virtual void CheckCharacterInRange(uc16 from,
uc16 to, // Both inclusive.
Label* on_in_range) = 0;
virtual void CheckCharacterNotInRange(uc16 from,
uc16 to, // Both inclusive.
Label* on_not_in_range) = 0;
// The current character (modulus the kTableSize) is looked up in the byte
// array, and if the found byte is non-zero, we jump to the on_bit_set label.
virtual void CheckBitInTable(Handle<ByteArray> table, Label* on_bit_set) = 0;
// Checks whether the given offset from the current position is before
// the end of the string. May overwrite the current character.
virtual void CheckPosition(int cp_offset, Label* on_outside_input);
// Check whether a standard/default character class matches the current
// character. Returns false if the type of special character class does
// not have custom support.
// May clobber the current loaded character.
virtual bool CheckSpecialCharacterClass(uc16 type, Label* on_no_match);
// Control-flow integrity:
// Define a jump target and bind a label.
virtual void BindJumpTarget(Label* label) { Bind(label); }
virtual void Fail() = 0;
virtual Handle<HeapObject> GetCode(Handle<String> source) = 0;
virtual void GoTo(Label* label) = 0;
// Check whether a register is >= a given constant and go to a label if it
// is. Backtracks instead if the label is nullptr.
virtual void IfRegisterGE(int reg, int comparand, Label* if_ge) = 0;
// Check whether a register is < a given constant and go to a label if it is.
// Backtracks instead if the label is nullptr.
virtual void IfRegisterLT(int reg, int comparand, Label* if_lt) = 0;
// Check whether a register is == to the current position and go to a
// label if it is.
virtual void IfRegisterEqPos(int reg, Label* if_eq) = 0;
virtual IrregexpImplementation Implementation() = 0;
V8_EXPORT_PRIVATE void LoadCurrentCharacter(
int cp_offset, Label* on_end_of_input, bool check_bounds = true,
int characters = 1, int eats_at_least = kUseCharactersValue);
virtual void LoadCurrentCharacterImpl(int cp_offset, Label* on_end_of_input,
bool check_bounds, int characters,
int eats_at_least) = 0;
virtual void PopCurrentPosition() = 0;
virtual void PopRegister(int register_index) = 0;
// Pushes the label on the backtrack stack, so that a following Backtrack
// will go to this label. Always checks the backtrack stack limit.
virtual void PushBacktrack(Label* label) = 0;
virtual void PushCurrentPosition() = 0;
virtual void PushRegister(int register_index,
StackCheckFlag check_stack_limit) = 0;
virtual void ReadCurrentPositionFromRegister(int reg) = 0;
virtual void ReadStackPointerFromRegister(int reg) = 0;
virtual void SetCurrentPositionFromEnd(int by) = 0;
virtual void SetRegister(int register_index, int to) = 0;
// Return whether the matching (with a global regexp) will be restarted.
virtual bool Succeed() = 0;
virtual void WriteCurrentPositionToRegister(int reg, int cp_offset) = 0;
virtual void ClearRegisters(int reg_from, int reg_to) = 0;
virtual void WriteStackPointerToRegister(int reg) = 0;
// Compare two-byte strings case insensitively.
// Called from generated RegExp code.
//
// Called from generated code.
static int CaseInsensitiveCompareNonUnicode(Address byte_offset1,
Address byte_offset2,
size_t byte_length,
@ -191,12 +201,23 @@ class RegExpMacroAssembler {
size_t byte_length,
Isolate* isolate);
// Check that we are not in the middle of a surrogate pair.
void CheckNotInSurrogatePair(int cp_offset, Label* on_failure);
// `raw_byte_array` is a ByteArray containing a set of character ranges,
// where ranges are encoded as uint16_t elements:
//
// [from0, to0, from1, to1, ..., fromN, toN], or
// [from0, to0, from1, to1, ..., fromN] (open-ended last interval).
//
// fromN is inclusive, toN is exclusive. Returns zero if not in a range,
// non-zero otherwise.
//
// Called from generated code.
static uint32_t IsCharacterInRangeArray(uint32_t current_char,
Address raw_byte_array,
Isolate* isolate);
// Controls the generation of large inlined constants in the code.
void set_slow_safe(bool ssc) { slow_safe_compiler_ = ssc; }
bool slow_safe() { return slow_safe_compiler_; }
bool slow_safe() const { return slow_safe_compiler_; }
// Controls after how many backtracks irregexp should abort execution. If it
// can fall back to the experimental engine (see `set_can_fallback`), it will
@ -220,30 +241,28 @@ class RegExpMacroAssembler {
// Set whether the regular expression has the global flag. Exiting due to
// a failure in a global regexp may still mean success overall.
inline void set_global_mode(GlobalMode mode) { global_mode_ = mode; }
inline bool global() { return global_mode_ != NOT_GLOBAL; }
inline bool global_with_zero_length_check() {
inline bool global() const { return global_mode_ != NOT_GLOBAL; }
inline bool global_with_zero_length_check() const {
return global_mode_ == GLOBAL || global_mode_ == GLOBAL_UNICODE;
}
inline bool global_unicode() { return global_mode_ == GLOBAL_UNICODE; }
inline bool global_unicode() const { return global_mode_ == GLOBAL_UNICODE; }
Isolate* isolate() const { return isolate_; }
Zone* zone() const { return zone_; }
protected:
bool has_backtrack_limit() const {
return backtrack_limit_ != JSRegExp::kNoBacktrackLimit;
}
bool has_backtrack_limit() const;
uint32_t backtrack_limit() const { return backtrack_limit_; }
bool can_fallback() const { return can_fallback_; }
private:
bool slow_safe_compiler_;
uint32_t backtrack_limit_ = JSRegExp::kNoBacktrackLimit;
uint32_t backtrack_limit_;
bool can_fallback_ = false;
GlobalMode global_mode_;
Isolate* isolate_;
Zone* zone_;
Isolate* const isolate_;
Zone* const zone_;
};
class NativeRegExpMacroAssembler: public RegExpMacroAssembler {
@ -271,44 +290,24 @@ class NativeRegExpMacroAssembler: public RegExpMacroAssembler {
SMALLEST_REGEXP_RESULT = RegExp::kInternalRegExpSmallestResult,
};
NativeRegExpMacroAssembler(Isolate* isolate, Zone* zone);
~NativeRegExpMacroAssembler() override;
bool CanReadUnaligned() override;
NativeRegExpMacroAssembler(Isolate* isolate, Zone* zone)
: RegExpMacroAssembler(isolate, zone), range_array_cache_(zone) {}
~NativeRegExpMacroAssembler() override = default;
// Returns a {Result} sentinel, or the number of successful matches.
static int Match(Handle<JSRegExp> regexp, Handle<String> subject,
int* offsets_vector, int offsets_vector_length,
int previous_index, Isolate* isolate);
// Called from RegExp if the backtrack stack limit is hit.
// Tries to expand the stack. Returns the new stack-pointer if
// successful, and updates the stack_top address, or returns 0 if unable
// to grow the stack.
// This function must not trigger a garbage collection.
static Address GrowStack(Address stack_pointer, Address* stack_top,
Isolate* isolate);
V8_EXPORT_PRIVATE static int ExecuteForTesting(String input, int start_offset,
const byte* input_start,
const byte* input_end,
int* output, int output_size,
Isolate* isolate,
JSRegExp regexp);
static int CheckStackGuardState(Isolate* isolate, int start_index,
RegExp::CallOrigin call_origin,
Address* return_address, Code re_code,
Address* subject, const byte** input_start,
const byte** input_end);
bool CanReadUnaligned() const override;
// Byte map of one byte characters with a 0xff if the character is a word
// character (digit, letter or underscore) and 0x00 otherwise.
// Used by generated RegExp code.
static const byte word_character_map[256];
static Address word_character_map_address() {
return reinterpret_cast<Address>(&word_character_map[0]);
}
// Returns a {Result} sentinel, or the number of successful matches.
V8_EXPORT_PRIVATE static int Execute(String input, int start_offset,
const byte* input_start,
const byte* input_end, int* output,
int output_size, Isolate* isolate,
JSRegExp regexp);
void LoadCurrentCharacterImpl(int cp_offset, Label* on_end_of_input,
bool check_bounds, int characters,
int eats_at_least) override;
@ -316,6 +315,41 @@ class NativeRegExpMacroAssembler: public RegExpMacroAssembler {
// current position, into the current-character register.
virtual void LoadCurrentCharacterUnchecked(int cp_offset,
int character_count) = 0;
// Called from RegExp if the backtrack stack limit is hit. Tries to expand
// the stack. Returns the new stack-pointer if successful, or returns 0 if
// unable to grow the stack.
// This function must not trigger a garbage collection.
//
// Called from generated code.
static Address GrowStack(Isolate* isolate);
// Called from generated code.
static int CheckStackGuardState(Isolate* isolate, int start_index,
RegExp::CallOrigin call_origin,
Address* return_address, Code re_code,
Address* subject, const byte** input_start,
const byte** input_end);
static Address word_character_map_address() {
return reinterpret_cast<Address>(&word_character_map[0]);
}
protected:
// Byte map of one byte characters with a 0xff if the character is a word
// character (digit, letter or underscore) and 0x00 otherwise.
// Used by generated RegExp code.
static const byte word_character_map[256];
Handle<ByteArray> GetOrAddRangeArray(const ZoneList<CharacterRange>* ranges);
private:
// Returns a {Result} sentinel, or the number of successful matches.
static int Execute(String input, int start_offset, const byte* input_start,
const byte* input_end, int* output, int output_size,
Isolate* isolate, JSRegExp regexp);
ZoneUnorderedMap<uint32_t, Handle<ByteArray>> range_array_cache_;
};
} // namespace internal

Просмотреть файл

@ -13,7 +13,6 @@ namespace internal {
class AlternativeGenerationList;
class BoyerMooreLookahead;
class GreedyLoopState;
class Label;
class NodeVisitor;
class QuickCheckDetails;
class RegExpCompiler;
@ -204,7 +203,9 @@ class RegExpNode : public ZoneObject {
// If we know that the input is one-byte then there are some nodes that can
// never match. This method returns a node that can be substituted for
// itself, or nullptr if the node can never match.
virtual RegExpNode* FilterOneByte(int depth) { return this; }
virtual RegExpNode* FilterOneByte(int depth, RegExpFlags flags) {
return this;
}
// Helper for FilterOneByte.
RegExpNode* replacement() {
DCHECK(info()->replacement_calculated);
@ -293,7 +294,7 @@ class SeqRegExpNode : public RegExpNode {
: RegExpNode(on_success->zone()), on_success_(on_success) {}
RegExpNode* on_success() { return on_success_; }
void set_on_success(RegExpNode* node) { on_success_ = node; }
RegExpNode* FilterOneByte(int depth) override;
RegExpNode* FilterOneByte(int depth, RegExpFlags flags) override;
void FillInBMInfo(Isolate* isolate, int offset, int budget,
BoyerMooreLookahead* bm, bool not_at_start) override {
on_success_->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
@ -301,7 +302,7 @@ class SeqRegExpNode : public RegExpNode {
}
protected:
RegExpNode* FilterSuccessor(int depth);
RegExpNode* FilterSuccessor(int depth, RegExpFlags flags);
private:
RegExpNode* on_success_;
@ -405,15 +406,17 @@ class TextNode : public SeqRegExpNode {
static TextNode* CreateForCharacterRanges(Zone* zone,
ZoneList<CharacterRange>* ranges,
bool read_backward,
RegExpNode* on_success,
JSRegExp::Flags flags);
// Create TextNode for a surrogate pair with a range given for the
// lead and the trail surrogate each.
static TextNode* CreateForSurrogatePair(Zone* zone, CharacterRange lead,
RegExpNode* on_success);
// Create TextNode for a surrogate pair (i.e. match a sequence of two uc16
// code unit ranges).
static TextNode* CreateForSurrogatePair(
Zone* zone, CharacterRange lead, ZoneList<CharacterRange>* trail_ranges,
bool read_backward, RegExpNode* on_success);
static TextNode* CreateForSurrogatePair(Zone* zone,
ZoneList<CharacterRange>* lead_ranges,
CharacterRange trail,
bool read_backward,
RegExpNode* on_success,
JSRegExp::Flags flags);
RegExpNode* on_success);
void Accept(NodeVisitor* visitor) override;
void Emit(RegExpCompiler* compiler, Trace* trace) override;
void GetQuickCheckDetails(QuickCheckDetails* details,
@ -421,14 +424,15 @@ class TextNode : public SeqRegExpNode {
bool not_at_start) override;
ZoneList<TextElement>* elements() { return elms_; }
bool read_backward() { return read_backward_; }
void MakeCaseIndependent(Isolate* isolate, bool is_one_byte);
void MakeCaseIndependent(Isolate* isolate, bool is_one_byte,
RegExpFlags flags);
int GreedyLoopTextLength() override;
RegExpNode* GetSuccessorOfOmnivorousTextNode(
RegExpCompiler* compiler) override;
void FillInBMInfo(Isolate* isolate, int offset, int budget,
BoyerMooreLookahead* bm, bool not_at_start) override;
void CalculateOffsets();
RegExpNode* FilterOneByte(int depth) override;
RegExpNode* FilterOneByte(int depth, RegExpFlags flags) override;
int Length();
private:
@ -496,7 +500,7 @@ class AssertionNode : public SeqRegExpNode {
class BackReferenceNode : public SeqRegExpNode {
public:
BackReferenceNode(int start_reg, int end_reg, JSRegExp::Flags flags,
BackReferenceNode(int start_reg, int end_reg, RegExpFlags flags,
bool read_backward, RegExpNode* on_success)
: SeqRegExpNode(on_success),
start_reg_(start_reg),
@ -519,7 +523,7 @@ class BackReferenceNode : public SeqRegExpNode {
private:
int start_reg_;
int end_reg_;
JSRegExp::Flags flags_;
RegExpFlags flags_;
bool read_backward_;
};
@ -621,7 +625,7 @@ class ChoiceNode : public RegExpNode {
virtual bool try_to_emit_quick_check_for_alternative(bool is_first) {
return true;
}
RegExpNode* FilterOneByte(int depth) override;
RegExpNode* FilterOneByte(int depth, RegExpFlags flags) override;
virtual bool read_backward() { return false; }
protected:
@ -693,7 +697,7 @@ class NegativeLookaroundChoiceNode : public ChoiceNode {
return !is_first;
}
void Accept(NodeVisitor* visitor) override;
RegExpNode* FilterOneByte(int depth) override;
RegExpNode* FilterOneByte(int depth, RegExpFlags flags) override;
};
class LoopChoiceNode : public ChoiceNode {
@ -726,7 +730,7 @@ class LoopChoiceNode : public ChoiceNode {
int min_loop_iterations() const { return min_loop_iterations_; }
bool read_backward() override { return read_backward_; }
void Accept(NodeVisitor* visitor) override;
RegExpNode* FilterOneByte(int depth) override;
RegExpNode* FilterOneByte(int depth, RegExpFlags flags) override;
private:
// AddAlternative is made private for loop nodes because alternatives

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -5,363 +5,27 @@
#ifndef V8_REGEXP_REGEXP_PARSER_H_
#define V8_REGEXP_REGEXP_PARSER_H_
#include "irregexp/imported/regexp-ast.h"
#include "irregexp/imported/regexp-error.h"
#include "irregexp/RegExpShim.h"
namespace v8 {
namespace internal {
class String;
class Zone;
struct RegExpCompileData;
// A BufferedZoneList is an automatically growing list, just like (and backed
// by) a ZoneList, that is optimized for the case of adding and removing
// a single element. The last element added is stored outside the backing list,
// and if no more than one element is ever added, the ZoneList isn't even
// allocated.
// Elements must not be nullptr pointers.
template <typename T, int initial_size>
class BufferedZoneList {
class V8_EXPORT_PRIVATE RegExpParser : public AllStatic {
public:
BufferedZoneList() : list_(nullptr), last_(nullptr) {}
static bool ParseRegExpFromHeapString(Isolate* isolate, Zone* zone,
Handle<String> input, RegExpFlags flags,
RegExpCompileData* result);
// Appends |value| to the end of the list. The newest element is kept in a
// one-slot buffer (readable via last(), removable via RemoveLast) and is only
// moved into the backing ZoneList when another element is added or GetList
// is called.
void Add(T* value, Zone* zone) {
  if (T* const previous = last_) {
    // Something is already buffered: spill it into the backing list, creating
    // that list lazily on first use.
    if (list_ == nullptr) list_ = zone->New<ZoneList<T*>>(initial_size, zone);
    list_->Add(previous, zone);
  }
  last_ = value;
}
// Returns the buffered (most recently added) element without removing it.
// Debug-checks that an element is currently buffered.
T* last() {
DCHECK(last_ != nullptr);
return last_;
}
// Removes and returns the buffered element. The buffer is then refilled with
// the tail of the backing list if that list exists and is non-empty;
// otherwise the buffer becomes empty.
T* RemoveLast() {
  DCHECK(last_ != nullptr);
  T* const removed = last_;
  const bool can_refill = (list_ != nullptr) && (list_->length() > 0);
  last_ = can_refill ? list_->RemoveLast() : nullptr;
  return removed;
}
// Returns the element at index |i|, where indices cover the backing list
// followed by the buffered element.
T* Get(int i) {
  DCHECK((0 <= i) && (i < length()));
  // Without a backing list the buffered element is the only one.
  if (list_ == nullptr) {
    DCHECK_EQ(0, i);
    return last_;
  }
  // Indices below the backing list's length resolve inside it.
  if (i != list_->length()) return list_->at(i);
  // The index one past the backing list denotes the buffered element.
  DCHECK(last_ != nullptr);
  return last_;
}
// Forgets all elements by dropping both the buffered element and the pointer
// to the backing list. Nothing is freed here.
void Clear() {
  last_ = nullptr;
  list_ = nullptr;
}
// Total number of elements: those in the backing list (if any) plus one for
// the buffered element (if any).
int length() {
  int count = 0;
  if (list_ != nullptr) count = list_->length();
  if (last_ != nullptr) ++count;
  return count;
}
// Returns the backing list holding every element. Allocates the list if it
// does not exist yet and flushes the buffered element into it, leaving the
// buffer empty.
ZoneList<T*>* GetList(Zone* zone) {
  ZoneList<T*>* backing = list_;
  if (backing == nullptr) {
    backing = zone->New<ZoneList<T*>>(initial_size, zone);
    list_ = backing;
  }
  if (last_ == nullptr) return backing;
  backing->Add(last_, zone);
  last_ = nullptr;
  return backing;
}
private:
ZoneList<T*>* list_;
T* last_;
};
// Accumulates RegExp atoms and assertions into lists of terms and alternatives.
class RegExpBuilder : public ZoneObject {
public:
RegExpBuilder(Zone* zone, JSRegExp::Flags flags);
void AddCharacter(uc16 character);
void AddUnicodeCharacter(uc32 character);
void AddEscapedUnicodeCharacter(uc32 character);
// "Adds" an empty expression. Does nothing except consume a
// following quantifier
void AddEmpty();
void AddCharacterClass(RegExpCharacterClass* cc);
void AddCharacterClassForDesugaring(uc32 c);
void AddAtom(RegExpTree* tree);
void AddTerm(RegExpTree* tree);
void AddAssertion(RegExpTree* tree);
void NewAlternative(); // '|'
bool AddQuantifierToAtom(int min, int max,
RegExpQuantifier::QuantifierType type);
void FlushText();
RegExpTree* ToRegExp();
// Current flag set; may be replaced through set_flags().
JSRegExp::Flags flags() const { return flags_; }
void set_flags(JSRegExp::Flags flags) { flags_ = flags; }
// Convenience predicates, each testing a single bit of flags_.
bool ignore_case() const { return (flags_ & JSRegExp::kIgnoreCase) != 0; }
bool multiline() const { return (flags_ & JSRegExp::kMultiline) != 0; }
bool dotall() const { return (flags_ & JSRegExp::kDotAll) != 0; }
private:
// NOTE(review): presumably the value of pending_surrogate_ when no lead
// surrogate is waiting for its trail -- confirm against the implementation.
static const uc16 kNoPendingSurrogate = 0;
void AddLeadSurrogate(uc16 lead_surrogate);
void AddTrailSurrogate(uc16 trail_surrogate);
void FlushPendingSurrogate();
void FlushCharacters();
void FlushTerms();
bool NeedsDesugaringForUnicode(RegExpCharacterClass* cc);
bool NeedsDesugaringForIgnoreCase(uc32 c);
Zone* zone() const { return zone_; }
bool unicode() const { return (flags_ & JSRegExp::kUnicode) != 0; }
Zone* zone_;
bool pending_empty_;
JSRegExp::Flags flags_;
ZoneList<uc16>* characters_;
uc16 pending_surrogate_;
BufferedZoneList<RegExpTree, 2> terms_;
BufferedZoneList<RegExpTree, 2> text_;
BufferedZoneList<RegExpTree, 2> alternatives_;
// In DEBUG builds LAST(x) stores x in last_added_; in other builds the
// member does not exist and LAST(x) expands to nothing.
#ifdef DEBUG
enum { ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM } last_added_;
#define LAST(x) last_added_ = x;
#else
#define LAST(x)
#endif
};
class V8_EXPORT_PRIVATE RegExpParser {
public:
RegExpParser(FlatStringReader* in, JSRegExp::Flags flags, Isolate* isolate,
Zone* zone);
static bool ParseRegExp(Isolate* isolate, Zone* zone, FlatStringReader* input,
JSRegExp::Flags flags, RegExpCompileData* result);
// Used by the SpiderMonkey embedding of irregexp.
static bool VerifyRegExpSyntax(Isolate* isolate, Zone* zone,
FlatStringReader* input, JSRegExp::Flags flags,
RegExpCompileData* result,
const DisallowGarbageCollection& nogc);
private:
bool Parse(RegExpCompileData* result, const DisallowGarbageCollection&);
RegExpTree* ParsePattern();
RegExpTree* ParseDisjunction();
RegExpTree* ParseGroup();
// Parses a {...,...} quantifier and stores the range in the given
// out parameters.
bool ParseIntervalQuantifier(int* min_out, int* max_out);
// Parses and returns a single escaped character. The character
// must not be 'b' or 'B' since they are usually handled specially.
uc32 ParseClassCharacterEscape();
// Checks whether the following is a length-digit hexadecimal number,
// and sets the value if it is.
bool ParseHexEscape(int length, uc32* value);
bool ParseUnicodeEscape(uc32* value);
bool ParseUnlimitedLengthHexNumber(int max_value, uc32* value);
bool ParsePropertyClassName(ZoneVector<char>* name_1,
ZoneVector<char>* name_2);
bool AddPropertyClassRange(ZoneList<CharacterRange>* add_to, bool negate,
const ZoneVector<char>& name_1,
const ZoneVector<char>& name_2);
RegExpTree* GetPropertySequence(const ZoneVector<char>& name_1);
RegExpTree* ParseCharacterClass(const RegExpBuilder* state);
uc32 ParseOctalLiteral();
// Tries to parse the input as a back reference. If successful it
// stores the result in the output parameter and returns true. If
// it fails it will push back the characters read so the same characters
// can be reparsed.
bool ParseBackReferenceIndex(int* index_out);
// Parse inside a class. Either add escaped class to the range, or return
// false and pass parsed single character through |char_out|.
void ParseClassEscape(ZoneList<CharacterRange>* ranges, Zone* zone,
bool add_unicode_case_equivalents, uc32* char_out,
bool* is_class_escape);
char ParseClassEscape();
RegExpTree* ReportError(RegExpError error);
void Advance();
void Advance(int dist);
void Reset(int pos);
// Reports whether the pattern might be used as a literal search string.
// Only use if the result of the parse is a single atom node.
bool simple();
bool contains_anchor() { return contains_anchor_; }
void set_contains_anchor() { contains_anchor_ = true; }
int captures_started() { return captures_started_; }
int position() { return next_pos_ - 1; }
bool failed() { return failed_; }
// The Unicode flag can't be changed using in-regexp syntax, so it's OK to
// just read the initial flag value here.
bool unicode() const { return (top_level_flags_ & JSRegExp::kUnicode) != 0; }
static bool IsSyntaxCharacterOrSlash(uc32 c);
static const uc32 kEndMarker = (1 << 21);
private:
enum SubexpressionType {
INITIAL,
CAPTURE, // All positive values represent captures.
POSITIVE_LOOKAROUND,
NEGATIVE_LOOKAROUND,
GROUPING
};
// One entry of the parser's stack of nested (sub)expressions. Entries form a
// singly linked list through previous_state(); each entry owns a freshly
// zone-allocated RegExpBuilder for the expression it represents. All fields
// are fixed at construction time.
class RegExpParserState : public ZoneObject {
 public:
  // Push a state on the stack: |previous_state| is the enclosing expression's
  // state (nullptr for the outermost one); |flags| and |zone| are used to
  // create this state's RegExpBuilder.
  RegExpParserState(RegExpParserState* previous_state,
                    SubexpressionType group_type,
                    RegExpLookaround::Type lookaround_type,
                    int disjunction_capture_index,
                    const ZoneVector<uc16>* capture_name,
                    JSRegExp::Flags flags, Zone* zone)
      : previous_state_(previous_state),
        builder_(zone->New<RegExpBuilder>(zone, flags)),
        group_type_(group_type),
        lookaround_type_(lookaround_type),
        disjunction_capture_index_(disjunction_capture_index),
        capture_name_(capture_name) {}
  // Parser state of containing expression, if any.
  RegExpParserState* previous_state() const { return previous_state_; }
  // True when this state has an enclosing state. Const like the other
  // accessors, since it only reads previous_state_.
  bool IsSubexpression() const { return previous_state_ != nullptr; }
  // RegExpBuilder building this regexp's AST.
  RegExpBuilder* builder() const { return builder_; }
  // Type of regexp being parsed (parenthesized group or entire regexp).
  SubexpressionType group_type() const { return group_type_; }
  // Lookahead or Lookbehind.
  RegExpLookaround::Type lookaround_type() const { return lookaround_type_; }
  // Index in captures array of first capture in this sub-expression, if any.
  // Also the capture index of this sub-expression itself, if group_type
  // is CAPTURE.
  int capture_index() const { return disjunction_capture_index_; }
  // The name of the current sub-expression, if group_type is CAPTURE. Only
  // used for named captures.
  const ZoneVector<uc16>* capture_name() const { return capture_name_; }
  // True when a capture name was supplied at construction.
  bool IsNamedCapture() const { return capture_name_ != nullptr; }
  // Check whether the parser is inside a capture group with the given index.
  bool IsInsideCaptureGroup(int index);
  // Check whether the parser is inside a capture group with the given name.
  bool IsInsideCaptureGroup(const ZoneVector<uc16>* name);

 private:
  // Linked list implementation of stack of states.
  RegExpParserState* const previous_state_;
  // Builder for the stored disjunction.
  RegExpBuilder* const builder_;
  // Stored disjunction type (capture, look-ahead or grouping), if any.
  const SubexpressionType group_type_;
  // Stored read direction.
  const RegExpLookaround::Type lookaround_type_;
  // Stored disjunction's capture index (if any).
  const int disjunction_capture_index_;
  // Stored capture name (if any).
  const ZoneVector<uc16>* const capture_name_;
};
// Return the 1-indexed RegExpCapture object, allocate if necessary.
RegExpCapture* GetCapture(int index);
// Creates a new named capture at the specified index. Must be called exactly
// once for each named capture. Fails if a capture with the same name is
// encountered.
bool CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name, int index);
// Parses the name of a capture group (?<name>pattern). The name must adhere
// to IdentifierName in the ECMAScript standard.
const ZoneVector<uc16>* ParseCaptureGroupName();
bool ParseNamedBackReference(RegExpBuilder* builder,
RegExpParserState* state);
RegExpParserState* ParseOpenParenthesis(RegExpParserState* state);
// After the initial parsing pass, patch corresponding RegExpCapture objects
// into all RegExpBackReferences. This is done after initial parsing in order
// to avoid complicating cases in which references comes before the capture.
void PatchNamedBackReferences();
Handle<FixedArray> CreateCaptureNameMap();
// Returns true iff the pattern contains named captures. May call
// ScanForCaptures to look ahead at the remaining pattern.
bool HasNamedCaptures();
Isolate* isolate() { return isolate_; }
Zone* zone() const { return zone_; }
uc32 current() { return current_; }
bool has_more() { return has_more_; }
bool has_next() { return next_pos_ < in()->length(); }
uc32 Next();
template <bool update_position>
uc32 ReadNext();
FlatStringReader* in() { return in_; }
void ScanForCaptures();
// Ordering functor for RegExpCapture pointers: compares the objects that the
// captures' name() accessors point to with operator<. Both pointers are
// debug-checked to be non-null.
struct RegExpCaptureNameLess {
  bool operator()(const RegExpCapture* lhs, const RegExpCapture* rhs) const {
    DCHECK_NOT_NULL(lhs);
    DCHECK_NOT_NULL(rhs);
    const auto& left_name = *lhs->name();
    const auto& right_name = *rhs->name();
    return left_name < right_name;
  }
};
Isolate* isolate_;
Zone* zone_;
RegExpError error_ = RegExpError::kNone;
int error_pos_ = 0;
ZoneList<RegExpCapture*>* captures_;
ZoneSet<RegExpCapture*, RegExpCaptureNameLess>* named_captures_;
ZoneList<RegExpBackReference*>* named_back_references_;
FlatStringReader* in_;
uc32 current_;
// These are the flags specified outside the regexp syntax ie after the
// terminating '/' or in the second argument to the constructor. The current
// flags are stored on the RegExpBuilder.
JSRegExp::Flags top_level_flags_;
int next_pos_;
int captures_started_;
int capture_count_; // Only valid after we have scanned for captures.
bool has_more_;
bool simple_;
bool contains_anchor_;
bool is_scanned_for_captures_;
bool has_named_captures_; // Only valid after we have scanned for captures.
bool failed_;
template <class CharT>
static bool VerifyRegExpSyntax(Zone* zone, uintptr_t stack_limit,
const CharT* input, int input_length,
RegExpFlags flags, RegExpCompileData* result,
const DisallowGarbageCollection& no_gc);
};
} // namespace internal

Просмотреть файл

@ -9,23 +9,17 @@ namespace v8 {
namespace internal {
RegExpStackScope::RegExpStackScope(Isolate* isolate)
: regexp_stack_(isolate->regexp_stack()) {
: regexp_stack_(isolate->regexp_stack()),
old_sp_top_delta_(regexp_stack_->sp_top_delta()) {
DCHECK(regexp_stack_->IsValid());
// Irregexp is not reentrant in several ways; in particular, the
// RegExpStackScope is not reentrant since the destructor frees allocated
// memory. Protect against reentrancy here.
CHECK(!regexp_stack_->is_in_use());
regexp_stack_->set_is_in_use(true);
}
RegExpStackScope::~RegExpStackScope() {
// Reset the buffer if it has grown.
regexp_stack_->Reset();
DCHECK(!regexp_stack_->is_in_use());
CHECK_EQ(old_sp_top_delta_, regexp_stack_->sp_top_delta());
regexp_stack_->ResetIfEmpty();
}
RegExpStack::RegExpStack() : thread_local_(this), isolate_(nullptr) {}
RegExpStack::RegExpStack() : thread_local_(this) {}
RegExpStack::~RegExpStack() { thread_local_.FreeAndInvalidate(); }
@ -50,18 +44,16 @@ char* RegExpStack::RestoreStack(char* from) {
return from + kThreadLocalSize;
}
void RegExpStack::Reset() { thread_local_.ResetToStaticStack(this); }
// Points this ThreadLocal back at |regexp_stack|'s built-in static buffer,
// first releasing the current buffer if this ThreadLocal owns it.
void RegExpStack::ThreadLocal::ResetToStaticStack(RegExpStack* regexp_stack) {
// Only a buffer flagged as owned is freed here.
if (owns_memory_) DeleteArray(memory_);
memory_ = regexp_stack->static_stack_;
memory_top_ = regexp_stack->static_stack_ + kStaticStackSize;
memory_size_ = kStaticStackSize;
// The stack pointer starts out at the top of the buffer.
stack_pointer_ = memory_top_;
// The limit lies kStackLimitSlack pointer-sized slots above the buffer start.
limit_ = reinterpret_cast<Address>(regexp_stack->static_stack_) +
kStackLimitSlack * kSystemPointerSize;
// The static buffer belongs to the RegExpStack and must not be freed.
owns_memory_ = false;
is_in_use_ = false;
}
void RegExpStack::ThreadLocal::FreeAndInvalidate() {
@ -72,6 +64,7 @@ void RegExpStack::ThreadLocal::FreeAndInvalidate() {
memory_ = nullptr;
memory_top_ = nullptr;
memory_size_ = 0;
stack_pointer_ = nullptr;
limit_ = kMemoryTop;
}
@ -86,9 +79,11 @@ Address RegExpStack::EnsureCapacity(size_t size) {
thread_local_.memory_, thread_local_.memory_size_);
if (thread_local_.owns_memory_) DeleteArray(thread_local_.memory_);
}
ptrdiff_t delta = sp_top_delta();
thread_local_.memory_ = new_memory;
thread_local_.memory_top_ = new_memory + size;
thread_local_.memory_size_ = size;
thread_local_.stack_pointer_ = thread_local_.memory_top_ + delta;
thread_local_.limit_ = reinterpret_cast<Address>(new_memory) +
kStackLimitSlack * kSystemPointerSize;
thread_local_.owns_memory_ = true;

Просмотреть файл

@ -14,10 +14,7 @@ class RegExpStack;
// Maintains a per-v8thread stack area that can be used by irregexp
// implementation for its backtracking stack.
// Since there is only one stack area, the Irregexp implementation is not
// re-entrant. I.e., no regular expressions may be executed in the same thread
// during a preempted Irregexp execution.
class V8_NODISCARD RegExpStackScope {
class V8_NODISCARD RegExpStackScope final {
public:
// Create and delete an instance to control the life-time of a growing stack.
@ -30,46 +27,45 @@ class V8_NODISCARD RegExpStackScope {
RegExpStack* stack() const { return regexp_stack_; }
private:
RegExpStack* regexp_stack_;
RegExpStack* const regexp_stack_;
const ptrdiff_t old_sp_top_delta_;
};
class RegExpStack {
class RegExpStack final {
public:
RegExpStack();
~RegExpStack();
RegExpStack(const RegExpStack&) = delete;
RegExpStack& operator=(const RegExpStack&) = delete;
// Number of allocated locations on the stack below the limit.
// No sequence of pushes must be longer that this without doing a stack-limit
// check.
// Number of allocated locations on the stack below the limit. No sequence of
// pushes must be longer than this without doing a stack-limit check.
static constexpr int kStackLimitSlack = 32;
// Gives the top of the memory used as stack.
Address stack_base() {
Address memory_top() const {
DCHECK_NE(0, thread_local_.memory_size_);
DCHECK_EQ(thread_local_.memory_top_,
thread_local_.memory_ + thread_local_.memory_size_);
return reinterpret_cast<Address>(thread_local_.memory_top_);
}
// The total size of the memory allocated for the stack.
size_t stack_capacity() { return thread_local_.memory_size_; }
Address stack_pointer() const {
return reinterpret_cast<Address>(thread_local_.stack_pointer_);
}
size_t memory_size() const { return thread_local_.memory_size_; }
// If the stack pointer gets below the limit, we should react and
// either grow the stack or report an out-of-stack exception.
// There is only a limited number of locations below the stack limit,
// so users of the stack should check the stack limit during any
// sequence of pushes longer than this.
Address* limit_address_address() { return &(thread_local_.limit_); }
Address* limit_address_address() { return &thread_local_.limit_; }
// Ensures that there is a memory area with at least the specified size.
// If passing zero, the default/minimum size buffer is allocated.
Address EnsureCapacity(size_t size);
bool is_in_use() const { return thread_local_.is_in_use_; }
void set_is_in_use(bool v) { thread_local_.is_in_use_ = v; }
// Thread local archiving.
static constexpr int ArchiveSpacePerThread() {
return static_cast<int>(kThreadLocalSize);
@ -99,46 +95,61 @@ class RegExpStack {
2 * kStackLimitSlack * kSystemPointerSize;
byte static_stack_[kStaticStackSize] = {0};
STATIC_ASSERT(kStaticStackSize <= kMaximumStackSize);
static_assert(kStaticStackSize <= kMaximumStackSize);
// Structure holding the allocated memory, size and limit.
// Structure holding the allocated memory, size and limit. Thread switching
// archives and restores this struct.
struct ThreadLocal {
explicit ThreadLocal(RegExpStack* regexp_stack) {
ResetToStaticStack(regexp_stack);
}
// If memory_size_ > 0 then memory_ and memory_top_ must be non-nullptr
// and memory_top_ = memory_ + memory_size_
// If memory_size_ > 0 then
// - memory_, memory_top_, stack_pointer_ must be non-nullptr
// - memory_top_ = memory_ + memory_size_
// - memory_ <= stack_pointer_ <= memory_top_
byte* memory_ = nullptr;
byte* memory_top_ = nullptr;
size_t memory_size_ = 0;
byte* stack_pointer_ = nullptr;
Address limit_ = kNullAddress;
bool owns_memory_ = false; // Whether memory_ is owned and must be freed.
bool is_in_use_ = false; // To guard against reentrancy.
void ResetToStaticStack(RegExpStack* regexp_stack);
// Falls back to the static stack, but only when the stack pointer is at the
// top of the current buffer; otherwise does nothing.
void ResetToStaticStackIfEmpty(RegExpStack* regexp_stack) {
  const bool is_empty = (stack_pointer_ == memory_top_);
  if (!is_empty) return;
  ResetToStaticStack(regexp_stack);
}
void FreeAndInvalidate();
};
static constexpr size_t kThreadLocalSize = sizeof(ThreadLocal);
// Address of top of memory used as stack.
Address memory_top_address_address() {
return reinterpret_cast<Address>(&thread_local_.memory_top_);
}
// Resets the buffer if it has grown beyond the default/minimum size.
// After this, the buffer is either the default size, or it is empty, so
// you have to call EnsureCapacity before using it again.
void Reset();
Address stack_pointer_address() {
return reinterpret_cast<Address>(&thread_local_.stack_pointer_);
}
// A position-independent representation of the stack pointer: its offset
// relative to the top of the stack memory. Debug-checked to be <= 0.
ptrdiff_t sp_top_delta() const {
  const intptr_t sp = reinterpret_cast<intptr_t>(thread_local_.stack_pointer_);
  const intptr_t top = reinterpret_cast<intptr_t>(thread_local_.memory_top_);
  const ptrdiff_t delta = sp - top;
  DCHECK_LE(delta, 0);
  return delta;
}
// Resets the buffer if it has grown beyond the default/minimum size and is
// empty.
void ResetIfEmpty() { thread_local_.ResetToStaticStackIfEmpty(this); }
// Whether the ThreadLocal storage has been invalidated.
bool IsValid() const { return thread_local_.memory_ != nullptr; }
ThreadLocal thread_local_;
Isolate* isolate_;
friend class ExternalReference;
friend class Isolate;
friend class RegExpStackScope;
};

Просмотреть файл

@ -11,6 +11,9 @@
namespace v8 {
namespace internal {
class JSRegExp;
class RegExpCapture;
class RegExpMatchInfo;
class RegExpNode;
class RegExpTree;
@ -37,9 +40,9 @@ struct RegExpCompileData {
// True, iff the pattern is anchored at the start of the string with '^'.
bool contains_anchor = false;
// Only use if the pattern contains named captures. If so, this contains a
// mapping of capture names to capture indices.
Handle<FixedArray> capture_name_map;
// Only set if the pattern contains named captures.
// Note: the lifetime equals that of the parse/compile zone.
ZoneVector<RegExpCapture*>* named_captures = nullptr;
// The error message. Only used if an error occurred during parsing or
// compilation.
@ -62,9 +65,15 @@ struct RegExpCompileData {
class RegExp final : public AllStatic {
public:
// Whether the irregexp engine generates interpreter bytecode.
static bool CanGenerateBytecode() {
return FLAG_regexp_interpret_all || FLAG_regexp_tier_up;
}
static bool CanGenerateBytecode();
// Verify the given pattern, i.e. check that parsing succeeds. If
// verification fails, `regexp_error_out` is set.
template <class CharT>
static bool VerifySyntax(Zone* zone, uintptr_t stack_limit,
const CharT* input, int input_length,
RegExpFlags flags, RegExpError* regexp_error_out,
const DisallowGarbageCollection& no_gc);
// Parses the RegExp pattern and prepares the JSRegExp object with
// generic data and choice of implementation - as well as what
@ -72,7 +81,7 @@ class RegExp final : public AllStatic {
// Returns false if compilation fails.
V8_WARN_UNUSED_RESULT static MaybeHandle<Object> Compile(
Isolate* isolate, Handle<JSRegExp> re, Handle<String> pattern,
JSRegExp::Flags flags, uint32_t backtrack_limit);
RegExpFlags flags, uint32_t backtrack_limit);
// Ensures that a regexp is fully compiled and ready to be executed on a
// subject string. Returns true on success. Return false on failure, and
@ -131,12 +140,9 @@ class RegExp final : public AllStatic {
Isolate* isolate, Handle<RegExpMatchInfo> last_match_info,
Handle<String> subject, int capture_count, int32_t* match);
V8_EXPORT_PRIVATE static bool CompileForTesting(Isolate* isolate, Zone* zone,
RegExpCompileData* input,
JSRegExp::Flags flags,
Handle<String> pattern,
Handle<String> sample_subject,
bool is_one_byte);
V8_EXPORT_PRIVATE static bool CompileForTesting(
Isolate* isolate, Zone* zone, RegExpCompileData* input, RegExpFlags flags,
Handle<String> pattern, Handle<String> sample_subject, bool is_one_byte);
V8_EXPORT_PRIVATE static void DotPrintForTesting(const char* label,
RegExpNode* node);
@ -152,6 +158,9 @@ class RegExp final : public AllStatic {
RegExpError error_text);
static bool IsUnmodifiedRegExp(Isolate* isolate, Handle<JSRegExp> regexp);
static Handle<FixedArray> CreateCaptureNameMap(
Isolate* isolate, ZoneVector<RegExpCapture*>* named_captures);
};
// Uses a special global mode of irregexp-generated code to perform a global