зеркало из https://github.com/mozilla/gecko-dev.git
Bug 1627838: Simple string matches r=mgaudet
The old version only supported matching the exact pattern atom. This version supports patterns with escape characters (like `/\u212a/`).
The Boyer-Moore heuristic code is taken from here:
V8 at revision 8e4a5e973e, file src/regexp/regexp.cc (lines L116-L136)
Differential Revision: https://phabricator.services.mozilla.com/D69896
--HG--
extra : moz-landing-system : lando
This commit is contained in:
Родитель
f13813200e
Коммит
666cd90093
|
@ -13,10 +13,13 @@
|
||||||
#include "mozilla/ArrayUtils.h"
|
#include "mozilla/ArrayUtils.h"
|
||||||
#include "mozilla/Casting.h"
|
#include "mozilla/Casting.h"
|
||||||
|
|
||||||
|
#include "new-regexp/regexp-compiler.h"
|
||||||
|
#include "new-regexp/regexp-macro-assembler-arch.h"
|
||||||
#include "new-regexp/regexp-parser.h"
|
#include "new-regexp/regexp-parser.h"
|
||||||
#include "new-regexp/regexp-shim.h"
|
#include "new-regexp/regexp-shim.h"
|
||||||
#include "new-regexp/regexp.h"
|
#include "new-regexp/regexp.h"
|
||||||
#include "util/StringBuffer.h"
|
#include "util/StringBuffer.h"
|
||||||
|
#include "vm/RegExpShared.h"
|
||||||
|
|
||||||
namespace js {
|
namespace js {
|
||||||
namespace irregexp {
|
namespace irregexp {
|
||||||
|
@ -30,9 +33,12 @@ using frontend::TokenStreamAnyChars;
|
||||||
using v8::internal::FlatStringReader;
|
using v8::internal::FlatStringReader;
|
||||||
using v8::internal::RegExpCompileData;
|
using v8::internal::RegExpCompileData;
|
||||||
using v8::internal::RegExpError;
|
using v8::internal::RegExpError;
|
||||||
|
using v8::internal::RegExpNode;
|
||||||
using v8::internal::RegExpParser;
|
using v8::internal::RegExpParser;
|
||||||
using v8::internal::Zone;
|
using v8::internal::Zone;
|
||||||
|
|
||||||
|
using namespace v8::internal::regexp_compiler_constants;
|
||||||
|
|
||||||
static uint32_t ErrorNumber(RegExpError err) {
|
static uint32_t ErrorNumber(RegExpError err) {
|
||||||
switch (err) {
|
switch (err) {
|
||||||
case RegExpError::kNone:
|
case RegExpError::kNone:
|
||||||
|
@ -234,5 +240,93 @@ bool CheckPatternSyntax(JSContext* cx, TokenStreamAnyChars& ts,
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// A regexp is a good candidate for Boyer-Moore if it has at least 3
|
||||||
|
// times as many characters as it has unique characters. Note that
|
||||||
|
// table lookups in irregexp are done modulo tableSize (128).
|
||||||
|
template <typename CharT>
|
||||||
|
static bool HasFewDifferentCharacters(const CharT* chars, size_t length) {
|
||||||
|
const uint32_t tableSize =
|
||||||
|
v8::internal::NativeRegExpMacroAssembler::kTableSize;
|
||||||
|
bool character_found[tableSize];
|
||||||
|
uint32_t different = 0;
|
||||||
|
memset(&character_found[0], 0, sizeof(character_found));
|
||||||
|
for (uint32_t i = 0; i < length; i++) {
|
||||||
|
uint32_t ch = chars[i] % tableSize;
|
||||||
|
if (!character_found[ch]) {
|
||||||
|
character_found[ch] = true;
|
||||||
|
different++;
|
||||||
|
// We declare a regexp low-alphabet if it has at least 3 times as many
|
||||||
|
// characters as it has different characters.
|
||||||
|
if (different * 3 > length) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Identifies the sort of pattern where Boyer-Moore is faster than string search
|
||||||
|
static bool UseBoyerMoore(HandleAtom pattern, JS::AutoAssertNoGC& nogc) {
|
||||||
|
size_t length =
|
||||||
|
std::min(size_t(kMaxLookaheadForBoyerMoore), pattern->length());
|
||||||
|
if (length <= kPatternTooShortForBoyerMoore) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (pattern->hasLatin1Chars()) {
|
||||||
|
return HasFewDifferentCharacters(pattern->latin1Chars(nogc), length);
|
||||||
|
}
|
||||||
|
MOZ_ASSERT(pattern->hasTwoByteChars());
|
||||||
|
return HasFewDifferentCharacters(pattern->twoByteChars(nogc), length);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Parses the source of |re| and, on its first compilation, switches it to
// simple atom string matching when the pattern qualifies. Returns false if
// parsing or atomization fails. Every other path currently ends in
// MOZ_CRASH("TODO").
bool CompilePattern(JSContext* cx, MutableHandleRegExpShared re,
                    HandleLinearString input) {
  RootedAtom pattern(cx, re->getSource());
  JS::RegExpFlags flags = re->getFlags();
  LifoAllocScope allocScope(&cx->tempLifoAlloc());
  Zone zone(allocScope.alloc());

  // Parse the pattern into |data|; the reader lives only for the parse.
  RegExpCompileData data;
  {
    FlatStringReader patternBytes(pattern);
    const bool parsed = RegExpParser::ParseRegExp(cx->isolate, &zone,
                                                  &patternBytes, flags, &data);
    if (!parsed) {
      // The only parse failure expected here is a stack overflow.
      MOZ_ASSERT(data.error == RegExpError::kStackOverflow);
      JS::CompileOptions options(cx);
      TokenStream dummyTokenStream(cx, options, nullptr, 0, nullptr);
      ReportSyntaxError(dummyTokenStream, data, pattern);
      return false;
    }
  }

  // The first time this regexp is compiled, check whether a simple string
  // search with an atom can be used. That is only considered when neither
  // the ignoreCase nor the sticky flag is set.
  const bool firstCompile = re->kind() == RegExpShared::Kind::Unparsed;
  if (firstCompile && !flags.ignoreCase() && !flags.sticky()) {
    RootedAtom searchAtom(cx);
    if (data.simple) {
      // The parse tree is a single atom identical to the pattern source.
      searchAtom = re->getSource();
    } else if (data.tree->IsAtom() && data.capture_count == 0) {
      // The parse tree is a single atom that differs from the pattern
      // source, so atomize the parsed characters instead.
      v8::internal::RegExpAtom* atomNode = data.tree->AsAtom();
      const char16_t* atomChars = atomNode->data().begin();
      searchAtom = AtomizeChars(cx, atomChars, atomNode->length());
      if (!searchAtom) {
        return false;
      }
    }

    JS::AutoAssertNoGC nogc(cx);
    if (searchAtom && !UseBoyerMoore(searchAtom, nogc)) {
      re->useAtomMatch(searchAtom);
      return true;
    }
  }

  MOZ_CRASH("TODO");
}
|
||||||
|
|
||||||
} // namespace irregexp
|
} // namespace irregexp
|
||||||
} // namespace js
|
} // namespace js
|
||||||
|
|
|
@ -24,6 +24,9 @@ bool CheckPatternSyntax(JSContext* cx, frontend::TokenStreamAnyChars& ts,
|
||||||
bool CheckPatternSyntax(JSContext* cx, frontend::TokenStreamAnyChars& ts,
|
bool CheckPatternSyntax(JSContext* cx, frontend::TokenStreamAnyChars& ts,
|
||||||
HandleAtom pattern, JS::RegExpFlags flags);
|
HandleAtom pattern, JS::RegExpFlags flags);
|
||||||
|
|
||||||
|
bool CompilePattern(JSContext* cx, MutableHandleRegExpShared re,
|
||||||
|
HandleLinearString input);
|
||||||
|
|
||||||
} // namespace irregexp
|
} // namespace irregexp
|
||||||
} // namespace js
|
} // namespace js
|
||||||
|
|
||||||
|
|
|
@ -946,10 +946,7 @@ bool js::StringHasRegExpMetaChars(JSLinearString* str) {
|
||||||
/* RegExpShared */
|
/* RegExpShared */
|
||||||
|
|
||||||
RegExpShared::RegExpShared(JSAtom* source, RegExpFlags flags)
|
RegExpShared::RegExpShared(JSAtom* source, RegExpFlags flags)
|
||||||
: headerAndSource(source),
|
: headerAndSource(source), parenCount(0), flags(flags) {}
|
||||||
parenCount(0),
|
|
||||||
flags(flags),
|
|
||||||
canStringMatch(false) {}
|
|
||||||
|
|
||||||
void RegExpShared::traceChildren(JSTracer* trc) {
|
void RegExpShared::traceChildren(JSTracer* trc) {
|
||||||
// Discard code to avoid holding onto ExecutablePools.
|
// Discard code to avoid holding onto ExecutablePools.
|
||||||
|
@ -958,9 +955,19 @@ void RegExpShared::traceChildren(JSTracer* trc) {
|
||||||
}
|
}
|
||||||
|
|
||||||
TraceNullableEdge(trc, &headerAndSource, "RegExpShared source");
|
TraceNullableEdge(trc, &headerAndSource, "RegExpShared source");
|
||||||
|
#ifdef ENABLE_NEW_REGEXP
|
||||||
|
if (kind() == RegExpShared::Kind::Atom) {
|
||||||
|
TraceNullableEdge(trc, &patternAtom_, "RegExpShared pattern atom");
|
||||||
|
} else {
|
||||||
|
for (auto& comp : compilationArray) {
|
||||||
|
TraceNullableEdge(trc, &comp.jitCode, "RegExpShared code");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#else
|
||||||
for (auto& comp : compilationArray) {
|
for (auto& comp : compilationArray) {
|
||||||
TraceNullableEdge(trc, &comp.jitCode, "RegExpShared code");
|
TraceNullableEdge(trc, &comp.jitCode, "RegExpShared code");
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
void RegExpShared::discardJitCode() {
|
void RegExpShared::discardJitCode() {
|
||||||
|
@ -1003,7 +1010,17 @@ bool RegExpShared::compileIfNecessary(JSContext* cx,
|
||||||
MutableHandleRegExpShared re,
|
MutableHandleRegExpShared re,
|
||||||
HandleLinearString input,
|
HandleLinearString input,
|
||||||
ForceByteCodeEnum force) {
|
ForceByteCodeEnum force) {
|
||||||
MOZ_CRASH("TODO");
|
bool needsCompile = false;
|
||||||
|
if (re->kind() == RegExpShared::Kind::Unparsed) {
|
||||||
|
needsCompile = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: tier-up from interpreter to generated code
|
||||||
|
|
||||||
|
if (needsCompile) {
|
||||||
|
return irregexp::CompilePattern(cx, re, input);
|
||||||
|
}
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* static */
|
/* static */
|
||||||
|
@ -1011,9 +1028,40 @@ RegExpRunStatus RegExpShared::execute(JSContext* cx,
|
||||||
MutableHandleRegExpShared re,
|
MutableHandleRegExpShared re,
|
||||||
HandleLinearString input, size_t start,
|
HandleLinearString input, size_t start,
|
||||||
VectorMatchPairs* matches) {
|
VectorMatchPairs* matches) {
|
||||||
|
MOZ_ASSERT(matches);
|
||||||
|
|
||||||
|
// TODO: Add tracelogger support
|
||||||
|
|
||||||
|
/* Compile the code at point-of-use. */
|
||||||
|
if (!compileIfNecessary(cx, re, input, DontForceByteCode)) {
|
||||||
|
return RegExpRunStatus_Error;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Ensure sufficient memory for output vector.
|
||||||
|
* No need to initialize it. The RegExp engine fills them in on a match.
|
||||||
|
*/
|
||||||
|
if (!matches->allocOrExpandArray(re->pairCount())) {
|
||||||
|
ReportOutOfMemory(cx);
|
||||||
|
return RegExpRunStatus_Error;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (re->kind() == RegExpShared::Kind::Atom) {
|
||||||
|
return RegExpShared::executeAtom(cx, re, input, start, matches);
|
||||||
|
}
|
||||||
|
|
||||||
MOZ_CRASH("TODO");
|
MOZ_CRASH("TODO");
|
||||||
}
|
}
|
||||||
#else
|
|
||||||
|
// Switches this regexp, which must still be unparsed, to simple string
// matching against |pattern|. The paren count is reset to zero.
void RegExpShared::useAtomMatch(HandleAtom pattern) {
  MOZ_ASSERT(kind_ == Kind::Unparsed);
  parenCount = 0;
  patternAtom_ = pattern;
  kind_ = Kind::Atom;
}
|
||||||
|
|
||||||
|
#else // !ENABLE_NEW_REGEXP
|
||||||
|
|
||||||
/* static */
|
/* static */
|
||||||
bool RegExpShared::compile(JSContext* cx, MutableHandleRegExpShared re,
|
bool RegExpShared::compile(JSContext* cx, MutableHandleRegExpShared re,
|
||||||
HandleAtom pattern, HandleLinearString input,
|
HandleAtom pattern, HandleLinearString input,
|
||||||
|
@ -1185,7 +1233,8 @@ RegExpRunStatus RegExpShared::execute(JSContext* cx,
|
||||||
}
|
}
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
#endif // ENABLE_NEW_REGEXP
|
#endif // !ENABLE_NEW_REGEXP
|
||||||
|
|
||||||
/* static */
|
/* static */
|
||||||
RegExpRunStatus RegExpShared::executeAtom(JSContext* cx,
|
RegExpRunStatus RegExpShared::executeAtom(JSContext* cx,
|
||||||
MutableHandleRegExpShared re,
|
MutableHandleRegExpShared re,
|
||||||
|
|
|
@ -72,6 +72,7 @@ struct RegExpByteCodeHeader {
|
||||||
class RegExpShared : public gc::TenuredCell {
|
class RegExpShared : public gc::TenuredCell {
|
||||||
public:
|
public:
|
||||||
enum ForceByteCodeEnum { DontForceByteCode, ForceByteCode };
|
enum ForceByteCodeEnum { DontForceByteCode, ForceByteCode };
|
||||||
|
enum class Kind { Unparsed, Atom, RegExp };
|
||||||
|
|
||||||
using JitCodeTable = UniquePtr<uint8_t[], JS::FreePolicy>;
|
using JitCodeTable = UniquePtr<uint8_t[], JS::FreePolicy>;
|
||||||
using JitCodeTables = Vector<JitCodeTable, 0, SystemAllocPolicy>;
|
using JitCodeTables = Vector<JitCodeTable, 0, SystemAllocPolicy>;
|
||||||
|
@ -103,7 +104,13 @@ class RegExpShared : public gc::TenuredCell {
|
||||||
|
|
||||||
uint32_t parenCount;
|
uint32_t parenCount;
|
||||||
JS::RegExpFlags flags;
|
JS::RegExpFlags flags;
|
||||||
bool canStringMatch;
|
|
||||||
|
#ifdef ENABLE_NEW_REGEXP
|
||||||
|
RegExpShared::Kind kind_ = Kind::Unparsed;
|
||||||
|
GCPtrAtom patternAtom_;
|
||||||
|
#else
|
||||||
|
bool canStringMatch = false;
|
||||||
|
#endif
|
||||||
|
|
||||||
static int CompilationIndex(bool latin1) { return latin1 ? 0 : 1; }
|
static int CompilationIndex(bool latin1) { return latin1 ? 0 : 1; }
|
||||||
|
|
||||||
|
@ -150,16 +157,31 @@ class RegExpShared : public gc::TenuredCell {
|
||||||
/* Accessors */
|
/* Accessors */
|
||||||
|
|
||||||
size_t getParenCount() const {
|
size_t getParenCount() const {
|
||||||
|
#ifdef ENABLE_NEW_REGEXP
|
||||||
|
MOZ_ASSERT(kind() != Kind::Unparsed);
|
||||||
|
#else
|
||||||
MOZ_ASSERT(isCompiled());
|
MOZ_ASSERT(isCompiled());
|
||||||
|
#endif
|
||||||
return parenCount;
|
return parenCount;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef ENABLE_NEW_REGEXP
|
||||||
|
RegExpShared::Kind kind() const { return kind_; }
|
||||||
|
|
||||||
|
// Use simple string matching for this regexp.
|
||||||
|
void useAtomMatch(HandleAtom pattern);
|
||||||
|
#endif
|
||||||
|
|
||||||
/* Accounts for the "0" (whole match) pair. */
|
/* Accounts for the "0" (whole match) pair. */
|
||||||
size_t pairCount() const { return getParenCount() + 1; }
|
size_t pairCount() const { return getParenCount() + 1; }
|
||||||
|
|
||||||
JSAtom* getSource() const { return headerAndSource.ptr(); }
|
JSAtom* getSource() const { return headerAndSource.ptr(); }
|
||||||
|
|
||||||
|
#ifdef ENABLE_NEW_REGEXP
|
||||||
|
JSAtom* patternAtom() const { return patternAtom_; }
|
||||||
|
#else
|
||||||
JSAtom* patternAtom() const { return getSource(); }
|
JSAtom* patternAtom() const { return getSource(); }
|
||||||
|
#endif
|
||||||
|
|
||||||
JS::RegExpFlags getFlags() const { return flags; }
|
JS::RegExpFlags getFlags() const { return flags; }
|
||||||
|
|
||||||
|
|
Загрузка…
Ссылка в новой задаче