Bug 1627838: Simple string matches r=mgaudet

The old version only supported matching the exact pattern atom. This version supports patterns with escape characters (like `/\u212a/`).

The Boyer-Moore heuristic code is taken from V8's src/regexp/regexp.cc
(revision 8e4a5e973e, lines L116-L136).

Differential Revision: https://phabricator.services.mozilla.com/D69896

--HG--
extra : moz-landing-system : lando
This commit is contained in:
Iain Ireland 2020-04-07 22:18:53 +00:00
Родитель f13813200e
Коммит 666cd90093
4 изменённых файлов: 176 добавлений и 8 удалений

Просмотреть файл

@ -13,10 +13,13 @@
#include "mozilla/ArrayUtils.h"
#include "mozilla/Casting.h"
#include "new-regexp/regexp-compiler.h"
#include "new-regexp/regexp-macro-assembler-arch.h"
#include "new-regexp/regexp-parser.h"
#include "new-regexp/regexp-shim.h"
#include "new-regexp/regexp.h"
#include "util/StringBuffer.h"
#include "vm/RegExpShared.h"
namespace js {
namespace irregexp {
@ -30,9 +33,12 @@ using frontend::TokenStreamAnyChars;
using v8::internal::FlatStringReader;
using v8::internal::RegExpCompileData;
using v8::internal::RegExpError;
using v8::internal::RegExpNode;
using v8::internal::RegExpParser;
using v8::internal::Zone;
using namespace v8::internal::regexp_compiler_constants;
static uint32_t ErrorNumber(RegExpError err) {
switch (err) {
case RegExpError::kNone:
@ -234,5 +240,93 @@ bool CheckPatternSyntax(JSContext* cx, TokenStreamAnyChars& ts,
return true;
}
// A regexp is a good candidate for Boyer-Moore if it has at least 3
// times as many characters as it has unique characters. Note that
// table lookups in irregexp are done modulo tableSize (128).
//
// Returns true when the first |length| code units of |chars| contain few
// enough distinct values (counted modulo tableSize) to meet that ratio, and
// false as soon as three times the distinct count exceeds |length|.
template <typename CharT>
static bool HasFewDifferentCharacters(const CharT* chars, size_t length) {
  const uint32_t tableSize =
      v8::internal::NativeRegExpMacroAssembler::kTableSize;
  bool character_found[tableSize];
  memset(&character_found[0], 0, sizeof(character_found));
  uint32_t different = 0;
  // The index has the same width as |length|: a 32-bit counter compared
  // against a size_t bound would wrap instead of terminating for lengths
  // above UINT32_MAX.
  for (size_t i = 0; i < length; i++) {
    // Characters are bucketed modulo tableSize, matching the table lookups.
    uint32_t ch = chars[i] % tableSize;
    if (!character_found[ch]) {
      character_found[ch] = true;
      different++;
      // We declare a regexp low-alphabet if it has at least 3 times as many
      // characters as it has different characters.
      if (different * 3 > length) {
        return false;
      }
    }
  }
  return true;
}
// Decides whether |pattern| is the kind of string for which Boyer-Moore is
// expected to beat plain string search. Only the leading
// kMaxLookaheadForBoyerMoore characters (at most) are inspected.
static bool UseBoyerMoore(HandleAtom pattern, JS::AutoAssertNoGC& nogc) {
  const size_t lookaheadLimit = size_t(kMaxLookaheadForBoyerMoore);
  const size_t patternLength = pattern->length();
  const size_t inspected =
      patternLength < lookaheadLimit ? patternLength : lookaheadLimit;

  // Patterns at or below the minimum length are never candidates.
  if (inspected <= kPatternTooShortForBoyerMoore) {
    return false;
  }

  if (!pattern->hasLatin1Chars()) {
    MOZ_ASSERT(pattern->hasTwoByteChars());
    return HasFewDifferentCharacters(pattern->twoByteChars(nogc), inspected);
  }
  return HasFewDifferentCharacters(pattern->latin1Chars(nogc), inspected);
}
// Parses the source of |re| and, the first time the regexp is compiled,
// switches it to simple atom (string-search) matching when the parse result
// allows it. Returns false once a parse failure has been reported or when
// atomizing the search string fails. |input| is not consulted by this code.
bool CompilePattern(JSContext* cx, MutableHandleRegExpShared re,
                    HandleLinearString input) {
  RootedAtom pattern(cx, re->getSource());
  JS::RegExpFlags flags = re->getFlags();

  LifoAllocScope allocScope(&cx->tempLifoAlloc());
  Zone zone(allocScope.alloc());

  RegExpCompileData data;
  {
    FlatStringReader patternBytes(pattern);
    const bool parsedOk = RegExpParser::ParseRegExp(
        cx->isolate, &zone, &patternBytes, flags, &data);
    if (!parsedOk) {
      // Stack overflow is the only parse failure expected at this point.
      MOZ_ASSERT(data.error == RegExpError::kStackOverflow);
      JS::CompileOptions options(cx);
      TokenStream dummyTokenStream(cx, options, nullptr, 0, nullptr);
      ReportSyntaxError(dummyTokenStream, data, pattern);
      return false;
    }
  }

  // Simple string search is only considered on the first compilation, and
  // only for regexps that are neither case-insensitive nor sticky.
  const bool firstCompilation = re->kind() == RegExpShared::Kind::Unparsed;
  if (firstCompilation && !flags.ignoreCase() && !flags.sticky()) {
    RootedAtom searchAtom(cx);
    if (data.simple) {
      // The parse tree is one atom identical to the pattern text.
      searchAtom = re->getSource();
    } else if (data.tree->IsAtom() && data.capture_count == 0) {
      // The parse tree is one atom whose text differs from the pattern, so
      // atomize the atom's own characters.
      v8::internal::RegExpAtom* atomNode = data.tree->AsAtom();
      const char16_t* atomChars = atomNode->data().begin();
      searchAtom = AtomizeChars(cx, atomChars, atomNode->length());
      if (!searchAtom) {
        return false;
      }
    }

    JS::AutoAssertNoGC nogc(cx);
    if (searchAtom && !UseBoyerMoore(searchAtom, nogc)) {
      re->useAtomMatch(searchAtom);
      return true;
    }
  }

  // Everything other than atom matching is not implemented yet.
  MOZ_CRASH("TODO");
}
} // namespace irregexp
} // namespace js

Просмотреть файл

@ -24,6 +24,9 @@ bool CheckPatternSyntax(JSContext* cx, frontend::TokenStreamAnyChars& ts,
bool CheckPatternSyntax(JSContext* cx, frontend::TokenStreamAnyChars& ts,
HandleAtom pattern, JS::RegExpFlags flags);
// Compiles the regexp held by |re|. Returns false on failure.
// NOTE(review): the role of |input| is not visible from this declaration.
bool CompilePattern(JSContext* cx, MutableHandleRegExpShared re,
HandleLinearString input);
} // namespace irregexp
} // namespace js

Просмотреть файл

@ -946,10 +946,7 @@ bool js::StringHasRegExpMetaChars(JSLinearString* str) {
/* RegExpShared */
// Constructs a RegExpShared for |source| with the given |flags|; the paren
// count starts at zero.
// NOTE(review): two member-initializer lists follow. The first (which also
// sets canStringMatch) appears to be the pre-change text of this diff and the
// single-line list its replacement -- confirm that only one is present in the
// real file.
RegExpShared::RegExpShared(JSAtom* source, RegExpFlags flags)
: headerAndSource(source),
parenCount(0),
flags(flags),
canStringMatch(false) {}
: headerAndSource(source), parenCount(0), flags(flags) {}
void RegExpShared::traceChildren(JSTracer* trc) {
// Discard code to avoid holding onto ExecutablePools.
@ -958,9 +955,19 @@ void RegExpShared::traceChildren(JSTracer* trc) {
}
TraceNullableEdge(trc, &headerAndSource, "RegExpShared source");
#ifdef ENABLE_NEW_REGEXP
if (kind() == RegExpShared::Kind::Atom) {
TraceNullableEdge(trc, &patternAtom_, "RegExpShared pattern atom");
} else {
for (auto& comp : compilationArray) {
TraceNullableEdge(trc, &comp.jitCode, "RegExpShared code");
}
}
#else
for (auto& comp : compilationArray) {
TraceNullableEdge(trc, &comp.jitCode, "RegExpShared code");
}
#endif
}
void RegExpShared::discardJitCode() {
@ -1003,7 +1010,17 @@ bool RegExpShared::compileIfNecessary(JSContext* cx,
MutableHandleRegExpShared re,
HandleLinearString input,
ForceByteCodeEnum force) {
// Compiles |re| through irregexp::CompilePattern when it is still
// Kind::Unparsed and otherwise does nothing. Returns the compiler's result
// when compilation ran, or true when no compilation was required. |force| is
// not consulted in the lines shown here.
// NOTE(review): the MOZ_CRASH directly below looks like the line this change
// removes (diff residue) rather than live code -- confirm.
MOZ_CRASH("TODO");
bool needsCompile = false;
// A regexp that has never been parsed must be compiled before use.
if (re->kind() == RegExpShared::Kind::Unparsed) {
needsCompile = true;
}
// TODO: tier-up from interpreter to generated code
if (needsCompile) {
return irregexp::CompilePattern(cx, re, input);
}
return true;
}
/* static */
@ -1011,9 +1028,40 @@ RegExpRunStatus RegExpShared::execute(JSContext* cx,
MutableHandleRegExpShared re,
HandleLinearString input, size_t start,
VectorMatchPairs* matches) {
// Runs |re| against |input| beginning at |start|, writing results into
// |matches| (which must be non-null). Returns RegExpRunStatus_Error when
// compilation fails or the match vector cannot be sized; otherwise returns
// whatever the matcher reports.
MOZ_ASSERT(matches);
// TODO: Add tracelogger support
/* Compile the code at point-of-use. */
if (!compileIfNecessary(cx, re, input, DontForceByteCode)) {
return RegExpRunStatus_Error;
}
/*
* Ensure sufficient memory for output vector.
* No need to initialize it. The RegExp engine fills them in on a match.
*/
if (!matches->allocOrExpandArray(re->pairCount())) {
ReportOutOfMemory(cx);
return RegExpRunStatus_Error;
}
// Atom-kind regexps are handled by simple string matching in executeAtom.
if (re->kind() == RegExpShared::Kind::Atom) {
return RegExpShared::executeAtom(cx, re, input, start, matches);
}
// Every other kind is not implemented yet.
MOZ_CRASH("TODO");
}
#else
// Switches this regexp, which must still be unparsed, over to simple string
// matching against |pattern| and resets its paren count to zero.
void RegExpShared::useAtomMatch(HandleAtom pattern) {
  MOZ_ASSERT(kind() == RegExpShared::Kind::Unparsed);

  parenCount = 0;
  patternAtom_ = pattern;
  kind_ = RegExpShared::Kind::Atom;
}
#else // !ENABLE_NEW_REGEXP
/* static */
bool RegExpShared::compile(JSContext* cx, MutableHandleRegExpShared re,
HandleAtom pattern, HandleLinearString input,
@ -1185,7 +1233,8 @@ RegExpRunStatus RegExpShared::execute(JSContext* cx,
}
return result;
}
#endif // ENABLE_NEW_REGEXP
#endif // !ENABLE_NEW_REGEXP
/* static */
RegExpRunStatus RegExpShared::executeAtom(JSContext* cx,
MutableHandleRegExpShared re,

Просмотреть файл

@ -72,6 +72,7 @@ struct RegExpByteCodeHeader {
class RegExpShared : public gc::TenuredCell {
public:
enum ForceByteCodeEnum { DontForceByteCode, ForceByteCode };
// How this regexp is matched. A regexp starts out Unparsed and becomes Atom
// once useAtomMatch() selects simple string matching.
// NOTE(review): RegExp is presumably the fully compiled case; nothing in the
// visible code assigns it yet -- confirm.
enum class Kind { Unparsed, Atom, RegExp };
using JitCodeTable = UniquePtr<uint8_t[], JS::FreePolicy>;
using JitCodeTables = Vector<JitCodeTable, 0, SystemAllocPolicy>;
@ -103,7 +104,13 @@ class RegExpShared : public gc::TenuredCell {
uint32_t parenCount;
JS::RegExpFlags flags;
// NOTE(review): this unconditional canStringMatch declaration duplicates the
// one in the #else branch below; it looks like the pre-change line of this
// diff -- confirm that only one exists in the real file.
bool canStringMatch;
#ifdef ENABLE_NEW_REGEXP
// Matching strategy for this regexp; Unparsed until it is first compiled.
RegExpShared::Kind kind_ = Kind::Unparsed;
// Atom assigned by useAtomMatch() and traced when kind() is Kind::Atom.
GCPtrAtom patternAtom_;
#else
bool canStringMatch = false;
#endif
static int CompilationIndex(bool latin1) { return latin1 ? 0 : 1; }
@ -150,16 +157,31 @@ class RegExpShared : public gc::TenuredCell {
/* Accessors */
// Returns the stored paren count. Asserts that the regexp is already past
// the uncompiled state: kind() is no longer Unparsed with the new engine,
// isCompiled() holds otherwise.
size_t getParenCount() const {
#ifdef ENABLE_NEW_REGEXP
MOZ_ASSERT(kind() != Kind::Unparsed);
#else
MOZ_ASSERT(isCompiled());
#endif
return parenCount;
}
#ifdef ENABLE_NEW_REGEXP
// Returns the matching strategy currently recorded for this regexp.
RegExpShared::Kind kind() const { return kind_; }
// Use simple string matching for this regexp.
void useAtomMatch(HandleAtom pattern);
#endif
/* Accounts for the "0" (whole match) pair. */
size_t pairCount() const { return getParenCount() + 1; }
JSAtom* getSource() const { return headerAndSource.ptr(); }
// Returns the atom used for simple string matching: the dedicated
// patternAtom_ with the new engine, otherwise the regexp source itself.
#ifdef ENABLE_NEW_REGEXP
JSAtom* patternAtom() const { return patternAtom_; }
#else
JSAtom* patternAtom() const { return getSource(); }
#endif
JS::RegExpFlags getFlags() const { return flags; }