зеркало из https://github.com/mozilla/gecko-dev.git
Bug 1627838: Simple string matches r=mgaudet
The old version only supported matching the exact pattern atom. This version supports patterns with escape characters (like `/\u212a/`).
The Boyer-Moore heuristic code is taken from V8 at revision 8e4a5e973e:
src/regexp/regexp.cc (L116-L136)
Differential Revision: https://phabricator.services.mozilla.com/D69896
--HG--
extra : moz-landing-system : lando
This commit is contained in:
Родитель
f13813200e
Коммит
666cd90093
|
@ -13,10 +13,13 @@
|
|||
#include "mozilla/ArrayUtils.h"
|
||||
#include "mozilla/Casting.h"
|
||||
|
||||
#include "new-regexp/regexp-compiler.h"
|
||||
#include "new-regexp/regexp-macro-assembler-arch.h"
|
||||
#include "new-regexp/regexp-parser.h"
|
||||
#include "new-regexp/regexp-shim.h"
|
||||
#include "new-regexp/regexp.h"
|
||||
#include "util/StringBuffer.h"
|
||||
#include "vm/RegExpShared.h"
|
||||
|
||||
namespace js {
|
||||
namespace irregexp {
|
||||
|
@ -30,9 +33,12 @@ using frontend::TokenStreamAnyChars;
|
|||
using v8::internal::FlatStringReader;
|
||||
using v8::internal::RegExpCompileData;
|
||||
using v8::internal::RegExpError;
|
||||
using v8::internal::RegExpNode;
|
||||
using v8::internal::RegExpParser;
|
||||
using v8::internal::Zone;
|
||||
|
||||
using namespace v8::internal::regexp_compiler_constants;
|
||||
|
||||
static uint32_t ErrorNumber(RegExpError err) {
|
||||
switch (err) {
|
||||
case RegExpError::kNone:
|
||||
|
@ -234,5 +240,93 @@ bool CheckPatternSyntax(JSContext* cx, TokenStreamAnyChars& ts,
|
|||
return true;
|
||||
}
|
||||
|
||||
// A regexp is a good candidate for Boyer-Moore if it has at least 3
|
||||
// times as many characters as it has unique characters. Note that
|
||||
// table lookups in irregexp are done modulo tableSize (128).
|
||||
template <typename CharT>
|
||||
static bool HasFewDifferentCharacters(const CharT* chars, size_t length) {
|
||||
const uint32_t tableSize =
|
||||
v8::internal::NativeRegExpMacroAssembler::kTableSize;
|
||||
bool character_found[tableSize];
|
||||
uint32_t different = 0;
|
||||
memset(&character_found[0], 0, sizeof(character_found));
|
||||
for (uint32_t i = 0; i < length; i++) {
|
||||
uint32_t ch = chars[i] % tableSize;
|
||||
if (!character_found[ch]) {
|
||||
character_found[ch] = true;
|
||||
different++;
|
||||
// We declare a regexp low-alphabet if it has at least 3 times as many
|
||||
// characters as it has different characters.
|
||||
if (different * 3 > length) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Identifies the sort of pattern where Boyer-Moore is faster than string search
|
||||
static bool UseBoyerMoore(HandleAtom pattern, JS::AutoAssertNoGC& nogc) {
|
||||
size_t length =
|
||||
std::min(size_t(kMaxLookaheadForBoyerMoore), pattern->length());
|
||||
if (length <= kPatternTooShortForBoyerMoore) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (pattern->hasLatin1Chars()) {
|
||||
return HasFewDifferentCharacters(pattern->latin1Chars(nogc), length);
|
||||
}
|
||||
MOZ_ASSERT(pattern->hasTwoByteChars());
|
||||
return HasFewDifferentCharacters(pattern->twoByteChars(nogc), length);
|
||||
}
|
||||
|
||||
// Compiles the regexp held by |re|.
//
// The source atom is parsed using the regexp's flags. If parsing fails, the
// error is reported through ReportSyntaxError and false is returned.
//
// If |re| is still Unparsed and neither the ignoreCase nor the sticky flag
// is set, a parse tree that is a single capture-free atom is considered for
// plain string matching: when UseBoyerMoore() rejects that atom, |re| is
// switched to atom matching and true is returned.
//
// Every other path currently ends in MOZ_CRASH("TODO").
//
// |input| is not used by the code in this function.
bool CompilePattern(JSContext* cx, MutableHandleRegExpShared re,
                    HandleLinearString input) {
  RootedAtom pattern(cx, re->getSource());
  JS::RegExpFlags flags = re->getFlags();
  LifoAllocScope allocScope(&cx->tempLifoAlloc());
  Zone zone(allocScope.alloc());

  RegExpCompileData data;
  {
    FlatStringReader patternBytes(pattern);
    const bool parsed = RegExpParser::ParseRegExp(cx->isolate, &zone,
                                                  &patternBytes, flags, &data);
    if (!parsed) {
      // The assertion records the expectation that a stack overflow is the
      // only way parsing can fail at this point.
      MOZ_ASSERT(data.error == RegExpError::kStackOverflow);
      JS::CompileOptions options(cx);
      TokenStream dummyTokenStream(cx, options, nullptr, 0, nullptr);
      ReportSyntaxError(dummyTokenStream, data, pattern);
      return false;
    }
  }

  // An Unparsed regexp has never been compiled before; that is the only
  // time the simple string search shortcut is considered.
  const bool firstCompilation = re->kind() == RegExpShared::Kind::Unparsed;
  const bool atomSearchAllowed = !flags.ignoreCase() && !flags.sticky();

  if (firstCompilation && atomSearchAllowed) {
    RootedAtom searchAtom(cx);
    if (data.simple) {
      // The whole parse tree is one atom identical to the pattern source.
      searchAtom = re->getSource();
    } else if (data.tree->IsAtom() && data.capture_count == 0) {
      // The whole parse tree is one atom, but its text differs from the
      // pattern source, so a separate atom is created from its characters.
      v8::internal::RegExpAtom* treeAtom = data.tree->AsAtom();
      const char16_t* atomChars = treeAtom->data().begin();
      searchAtom = AtomizeChars(cx, atomChars, treeAtom->length());
      if (!searchAtom) {
        return false;
      }
    }

    JS::AutoAssertNoGC nogc(cx);
    if (searchAtom && !UseBoyerMoore(searchAtom, nogc)) {
      re->useAtomMatch(searchAtom);
      return true;
    }
  }

  MOZ_CRASH("TODO");
}
|
||||
|
||||
} // namespace irregexp
|
||||
} // namespace js
|
||||
|
|
|
@ -24,6 +24,9 @@ bool CheckPatternSyntax(JSContext* cx, frontend::TokenStreamAnyChars& ts,
|
|||
bool CheckPatternSyntax(JSContext* cx, frontend::TokenStreamAnyChars& ts,
|
||||
HandleAtom pattern, JS::RegExpFlags flags);
|
||||
|
||||
bool CompilePattern(JSContext* cx, MutableHandleRegExpShared re,
|
||||
HandleLinearString input);
|
||||
|
||||
} // namespace irregexp
|
||||
} // namespace js
|
||||
|
||||
|
|
|
@ -946,10 +946,7 @@ bool js::StringHasRegExpMetaChars(JSLinearString* str) {
|
|||
/* RegExpShared */
|
||||
|
||||
RegExpShared::RegExpShared(JSAtom* source, RegExpFlags flags)
|
||||
: headerAndSource(source),
|
||||
parenCount(0),
|
||||
flags(flags),
|
||||
canStringMatch(false) {}
|
||||
: headerAndSource(source), parenCount(0), flags(flags) {}
|
||||
|
||||
void RegExpShared::traceChildren(JSTracer* trc) {
|
||||
// Discard code to avoid holding onto ExecutablePools.
|
||||
|
@ -958,9 +955,19 @@ void RegExpShared::traceChildren(JSTracer* trc) {
|
|||
}
|
||||
|
||||
TraceNullableEdge(trc, &headerAndSource, "RegExpShared source");
|
||||
#ifdef ENABLE_NEW_REGEXP
|
||||
if (kind() == RegExpShared::Kind::Atom) {
|
||||
TraceNullableEdge(trc, &patternAtom_, "RegExpShared pattern atom");
|
||||
} else {
|
||||
for (auto& comp : compilationArray) {
|
||||
TraceNullableEdge(trc, &comp.jitCode, "RegExpShared code");
|
||||
}
|
||||
}
|
||||
#else
|
||||
for (auto& comp : compilationArray) {
|
||||
TraceNullableEdge(trc, &comp.jitCode, "RegExpShared code");
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
void RegExpShared::discardJitCode() {
|
||||
|
@ -1003,7 +1010,17 @@ bool RegExpShared::compileIfNecessary(JSContext* cx,
|
|||
MutableHandleRegExpShared re,
|
||||
HandleLinearString input,
|
||||
ForceByteCodeEnum force) {
|
||||
MOZ_CRASH("TODO");
|
||||
bool needsCompile = false;
|
||||
if (re->kind() == RegExpShared::Kind::Unparsed) {
|
||||
needsCompile = true;
|
||||
}
|
||||
|
||||
// TODO: tier-up from interpreter to generated code
|
||||
|
||||
if (needsCompile) {
|
||||
return irregexp::CompilePattern(cx, re, input);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/* static */
|
||||
|
@ -1011,9 +1028,40 @@ RegExpRunStatus RegExpShared::execute(JSContext* cx,
|
|||
MutableHandleRegExpShared re,
|
||||
HandleLinearString input, size_t start,
|
||||
VectorMatchPairs* matches) {
|
||||
MOZ_ASSERT(matches);
|
||||
|
||||
// TODO: Add tracelogger support
|
||||
|
||||
/* Compile the code at point-of-use. */
|
||||
if (!compileIfNecessary(cx, re, input, DontForceByteCode)) {
|
||||
return RegExpRunStatus_Error;
|
||||
}
|
||||
|
||||
/*
|
||||
* Ensure sufficient memory for output vector.
|
||||
* No need to initialize it. The RegExp engine fills them in on a match.
|
||||
*/
|
||||
if (!matches->allocOrExpandArray(re->pairCount())) {
|
||||
ReportOutOfMemory(cx);
|
||||
return RegExpRunStatus_Error;
|
||||
}
|
||||
|
||||
if (re->kind() == RegExpShared::Kind::Atom) {
|
||||
return RegExpShared::executeAtom(cx, re, input, start, matches);
|
||||
}
|
||||
|
||||
MOZ_CRASH("TODO");
|
||||
}
|
||||
#else
|
||||
|
||||
// Switches this regexp, which must still be Unparsed, to simple string
// matching: |pattern| is stored as the atom to search for, the paren count
// is set to zero, and the kind becomes Atom.
void RegExpShared::useAtomMatch(HandleAtom pattern) {
  MOZ_ASSERT(kind_ == Kind::Unparsed);

  parenCount = 0;
  patternAtom_ = pattern;
  kind_ = Kind::Atom;
}
|
||||
|
||||
#else // !ENABLE_NEW_REGEXP
|
||||
|
||||
/* static */
|
||||
bool RegExpShared::compile(JSContext* cx, MutableHandleRegExpShared re,
|
||||
HandleAtom pattern, HandleLinearString input,
|
||||
|
@ -1185,7 +1233,8 @@ RegExpRunStatus RegExpShared::execute(JSContext* cx,
|
|||
}
|
||||
return result;
|
||||
}
|
||||
#endif // ENABLE_NEW_REGEXP
|
||||
#endif // !ENABLE_NEW_REGEXP
|
||||
|
||||
/* static */
|
||||
RegExpRunStatus RegExpShared::executeAtom(JSContext* cx,
|
||||
MutableHandleRegExpShared re,
|
||||
|
|
|
@ -72,6 +72,7 @@ struct RegExpByteCodeHeader {
|
|||
class RegExpShared : public gc::TenuredCell {
|
||||
public:
|
||||
enum ForceByteCodeEnum { DontForceByteCode, ForceByteCode };
|
||||
enum class Kind { Unparsed, Atom, RegExp };
|
||||
|
||||
using JitCodeTable = UniquePtr<uint8_t[], JS::FreePolicy>;
|
||||
using JitCodeTables = Vector<JitCodeTable, 0, SystemAllocPolicy>;
|
||||
|
@ -103,7 +104,13 @@ class RegExpShared : public gc::TenuredCell {
|
|||
|
||||
uint32_t parenCount;
|
||||
JS::RegExpFlags flags;
|
||||
bool canStringMatch;
|
||||
|
||||
#ifdef ENABLE_NEW_REGEXP
|
||||
RegExpShared::Kind kind_ = Kind::Unparsed;
|
||||
GCPtrAtom patternAtom_;
|
||||
#else
|
||||
bool canStringMatch = false;
|
||||
#endif
|
||||
|
||||
static int CompilationIndex(bool latin1) { return latin1 ? 0 : 1; }
|
||||
|
||||
|
@ -150,16 +157,31 @@ class RegExpShared : public gc::TenuredCell {
|
|||
/* Accessors */
|
||||
|
||||
// Returns the stored parenCount. With ENABLE_NEW_REGEXP this asserts that
// the regexp is no longer in the Unparsed state; otherwise it asserts that
// the regexp is compiled.
size_t getParenCount() const {
|
||||
#ifdef ENABLE_NEW_REGEXP
|
||||
MOZ_ASSERT(kind() != Kind::Unparsed);
|
||||
#else
|
||||
MOZ_ASSERT(isCompiled());
|
||||
#endif
|
||||
return parenCount;
|
||||
}
|
||||
|
||||
#ifdef ENABLE_NEW_REGEXP
|
||||
RegExpShared::Kind kind() const { return kind_; }
|
||||
|
||||
// Use simple string matching for this regexp.
|
||||
void useAtomMatch(HandleAtom pattern);
|
||||
#endif
|
||||
|
||||
/* Accounts for the "0" (whole match) pair. */
|
||||
size_t pairCount() const { return getParenCount() + 1; }
|
||||
|
||||
JSAtom* getSource() const { return headerAndSource.ptr(); }
|
||||
|
||||
#ifdef ENABLE_NEW_REGEXP
|
||||
JSAtom* patternAtom() const { return patternAtom_; }
|
||||
#else
|
||||
JSAtom* patternAtom() const { return getSource(); }
|
||||
#endif
|
||||
|
||||
JS::RegExpFlags getFlags() const { return flags; }
|
||||
|
||||
|
|
Загрузка…
Ссылка в новой задаче