From 666cd9009388b792556ef8b34984b47dd20a1a08 Mon Sep 17 00:00:00 2001 From: Iain Ireland Date: Tue, 7 Apr 2020 22:18:53 +0000 Subject: [PATCH] Bug 1627838: Simple string matches r=mgaudet The old version only supported matching the exact pattern atom. This version supports patterns with escape characters (like `/\u212a/`). The Boyer-Moore heuristic code is taken from here: https://github.com/v8/v8/blob/8e4a5e973e076adfdc035cfe1d0115f047709160/src/regexp/regexp.cc#L116-L136 Differential Revision: https://phabricator.services.mozilla.com/D69896 --HG-- extra : moz-landing-system : lando --- js/src/new-regexp/RegExpAPI.cpp | 94 +++++++++++++++++++++++++++++++++ js/src/new-regexp/RegExpAPI.h | 3 ++ js/src/vm/RegExpObject.cpp | 63 +++++++++++++++++++--- js/src/vm/RegExpShared.h | 24 ++++++++- 4 files changed, 176 insertions(+), 8 deletions(-) diff --git a/js/src/new-regexp/RegExpAPI.cpp b/js/src/new-regexp/RegExpAPI.cpp index 4ac24168342f..04c8383ad939 100644 --- a/js/src/new-regexp/RegExpAPI.cpp +++ b/js/src/new-regexp/RegExpAPI.cpp @@ -13,10 +13,13 @@ #include "mozilla/ArrayUtils.h" #include "mozilla/Casting.h" +#include "new-regexp/regexp-compiler.h" +#include "new-regexp/regexp-macro-assembler-arch.h" #include "new-regexp/regexp-parser.h" #include "new-regexp/regexp-shim.h" #include "new-regexp/regexp.h" #include "util/StringBuffer.h" +#include "vm/RegExpShared.h" namespace js { namespace irregexp { @@ -30,9 +33,12 @@ using frontend::TokenStreamAnyChars; using v8::internal::FlatStringReader; using v8::internal::RegExpCompileData; using v8::internal::RegExpError; +using v8::internal::RegExpNode; using v8::internal::RegExpParser; using v8::internal::Zone; +using namespace v8::internal::regexp_compiler_constants; + static uint32_t ErrorNumber(RegExpError err) { switch (err) { case RegExpError::kNone: @@ -234,5 +240,93 @@ bool CheckPatternSyntax(JSContext* cx, TokenStreamAnyChars& ts, return true; } +// A regexp is a good candidate for Boyer-Moore if it has at least 3 +// times as many characters as it has unique characters. Note that +// table lookups in irregexp are done modulo tableSize (128). +template +static bool HasFewDifferentCharacters(const CharT* chars, size_t length) { + const uint32_t tableSize = + v8::internal::NativeRegExpMacroAssembler::kTableSize; + bool character_found[tableSize]; + uint32_t different = 0; + memset(&character_found[0], 0, sizeof(character_found)); + for (uint32_t i = 0; i < length; i++) { + uint32_t ch = chars[i] % tableSize; + if (!character_found[ch]) { + character_found[ch] = true; + different++; + // We declare a regexp low-alphabet if it has at least 3 times as many + // characters as it has different characters. + if (different * 3 > length) { + return false; + } + } + } + return true; +} + +// Identifies the sort of pattern where Boyer-Moore is faster than string search +static bool UseBoyerMoore(HandleAtom pattern, JS::AutoAssertNoGC& nogc) { + size_t length = + std::min(size_t(kMaxLookaheadForBoyerMoore), pattern->length()); + if (length <= kPatternTooShortForBoyerMoore) { + return false; + } + + if (pattern->hasLatin1Chars()) { + return HasFewDifferentCharacters(pattern->latin1Chars(nogc), length); + } + MOZ_ASSERT(pattern->hasTwoByteChars()); + return HasFewDifferentCharacters(pattern->twoByteChars(nogc), length); +} + +bool CompilePattern(JSContext* cx, MutableHandleRegExpShared re, + HandleLinearString input) { + RootedAtom pattern(cx, re->getSource()); + JS::RegExpFlags flags = re->getFlags(); + LifoAllocScope allocScope(&cx->tempLifoAlloc()); + Zone zone(allocScope.alloc()); + + RegExpCompileData data; + { + FlatStringReader patternBytes(pattern); + if (!RegExpParser::ParseRegExp(cx->isolate, &zone, &patternBytes, flags, + &data)) { + MOZ_ASSERT(data.error == RegExpError::kStackOverflow); + JS::CompileOptions options(cx); + TokenStream dummyTokenStream(cx, options, nullptr, 0, nullptr); + ReportSyntaxError(dummyTokenStream, data, pattern); + return false; + } + } + + if (re->kind() == RegExpShared::Kind::Unparsed) { + // This is the first time we have compiled this regexp. + // First, check to see if we should use simple string search + // with an atom. + if (!flags.ignoreCase() && !flags.sticky()) { + RootedAtom searchAtom(cx); + if (data.simple) { + // The parse-tree is a single atom that is equal to the pattern. + searchAtom = re->getSource(); + } else if (data.tree->IsAtom() && data.capture_count == 0) { + // The parse-tree is a single atom that is not equal to the pattern. + v8::internal::RegExpAtom* atom = data.tree->AsAtom(); + const char16_t* twoByteChars = atom->data().begin(); + searchAtom = AtomizeChars(cx, twoByteChars, atom->length()); + if (!searchAtom) { + return false; + } + } + JS::AutoAssertNoGC nogc(cx); + if (searchAtom && !UseBoyerMoore(searchAtom, nogc)) { + re->useAtomMatch(searchAtom); + return true; + } + } + } + MOZ_CRASH("TODO"); +} + } // namespace irregexp } // namespace js diff --git a/js/src/new-regexp/RegExpAPI.h b/js/src/new-regexp/RegExpAPI.h index fdb82d39e648..f8d356cc0bf5 100644 --- a/js/src/new-regexp/RegExpAPI.h +++ b/js/src/new-regexp/RegExpAPI.h @@ -24,6 +24,9 @@ bool CheckPatternSyntax(JSContext* cx, frontend::TokenStreamAnyChars& ts, bool CheckPatternSyntax(JSContext* cx, frontend::TokenStreamAnyChars& ts, HandleAtom pattern, JS::RegExpFlags flags); +bool CompilePattern(JSContext* cx, MutableHandleRegExpShared re, + HandleLinearString input); + } // namespace irregexp } // namespace js diff --git a/js/src/vm/RegExpObject.cpp b/js/src/vm/RegExpObject.cpp index d905012c594a..c362a538e432 100644 --- a/js/src/vm/RegExpObject.cpp +++ b/js/src/vm/RegExpObject.cpp @@ -946,10 +946,7 @@ bool js::StringHasRegExpMetaChars(JSLinearString* str) { /* RegExpShared */ RegExpShared::RegExpShared(JSAtom* source, RegExpFlags flags) - : headerAndSource(source), - parenCount(0), - flags(flags), - canStringMatch(false) {} + : headerAndSource(source), parenCount(0), flags(flags) {} void RegExpShared::traceChildren(JSTracer* trc) { // Discard code to avoid holding onto ExecutablePools. @@ -958,9 +955,19 @@ void RegExpShared::traceChildren(JSTracer* trc) { } TraceNullableEdge(trc, &headerAndSource, "RegExpShared source"); +#ifdef ENABLE_NEW_REGEXP + if (kind() == RegExpShared::Kind::Atom) { + TraceNullableEdge(trc, &patternAtom_, "RegExpShared pattern atom"); + } else { + for (auto& comp : compilationArray) { + TraceNullableEdge(trc, &comp.jitCode, "RegExpShared code"); + } + } +#else for (auto& comp : compilationArray) { TraceNullableEdge(trc, &comp.jitCode, "RegExpShared code"); } +#endif } void RegExpShared::discardJitCode() { @@ -1003,7 +1010,17 @@ bool RegExpShared::compileIfNecessary(JSContext* cx, MutableHandleRegExpShared re, HandleLinearString input, ForceByteCodeEnum force) { - MOZ_CRASH("TODO"); + bool needsCompile = false; + if (re->kind() == RegExpShared::Kind::Unparsed) { + needsCompile = true; + } + + // TODO: tier-up from interpreter to generated code + + if (needsCompile) { + return irregexp::CompilePattern(cx, re, input); + } + return true; } /* static */ @@ -1011,9 +1028,40 @@ RegExpRunStatus RegExpShared::execute(JSContext* cx, MutableHandleRegExpShared re, HandleLinearString input, size_t start, VectorMatchPairs* matches) { + MOZ_ASSERT(matches); + + // TODO: Add tracelogger support + + /* Compile the code at point-of-use. */ + if (!compileIfNecessary(cx, re, input, DontForceByteCode)) { + return RegExpRunStatus_Error; + } + + /* + * Ensure sufficient memory for output vector. + * No need to initialize it. The RegExp engine fills them in on a match. + */ + if (!matches->allocOrExpandArray(re->pairCount())) { + ReportOutOfMemory(cx); + return RegExpRunStatus_Error; + } + + if (re->kind() == RegExpShared::Kind::Atom) { + return RegExpShared::executeAtom(cx, re, input, start, matches); + } + MOZ_CRASH("TODO"); } -#else + +void RegExpShared::useAtomMatch(HandleAtom pattern) { + MOZ_ASSERT(kind() == RegExpShared::Kind::Unparsed); + kind_ = RegExpShared::Kind::Atom; + patternAtom_ = pattern; + parenCount = 0; +} + +#else // !ENABLE_NEW_REGEXP + /* static */ bool RegExpShared::compile(JSContext* cx, MutableHandleRegExpShared re, HandleAtom pattern, HandleLinearString input, @@ -1185,7 +1233,8 @@ RegExpRunStatus RegExpShared::execute(JSContext* cx, } return result; } -#endif // ENABLE_NEW_REGEXP +#endif // !ENABLE_NEW_REGEXP + /* static */ RegExpRunStatus RegExpShared::executeAtom(JSContext* cx, MutableHandleRegExpShared re, diff --git a/js/src/vm/RegExpShared.h b/js/src/vm/RegExpShared.h index 8658ef5bb485..3f711d5340fd 100644 --- a/js/src/vm/RegExpShared.h +++ b/js/src/vm/RegExpShared.h @@ -72,6 +72,7 @@ struct RegExpByteCodeHeader { class RegExpShared : public gc::TenuredCell { public: enum ForceByteCodeEnum { DontForceByteCode, ForceByteCode }; + enum class Kind { Unparsed, Atom, RegExp }; using JitCodeTable = UniquePtr; using JitCodeTables = Vector; @@ -103,7 +104,13 @@ class RegExpShared : public gc::TenuredCell { uint32_t parenCount; JS::RegExpFlags flags; - bool canStringMatch; + +#ifdef ENABLE_NEW_REGEXP + RegExpShared::Kind kind_ = Kind::Unparsed; + GCPtrAtom patternAtom_; +#else + bool canStringMatch = false; +#endif static int CompilationIndex(bool latin1) { return latin1 ? 0 : 1; } @@ -150,16 +157,31 @@ class RegExpShared : public gc::TenuredCell { /* Accessors */ size_t getParenCount() const { +#ifdef ENABLE_NEW_REGEXP + MOZ_ASSERT(kind() != Kind::Unparsed); +#else MOZ_ASSERT(isCompiled()); +#endif return parenCount; } +#ifdef ENABLE_NEW_REGEXP + RegExpShared::Kind kind() const { return kind_; } + + // Use simple string matching for this regexp. + void useAtomMatch(HandleAtom pattern); +#endif + /* Accounts for the "0" (whole match) pair. */ size_t pairCount() const { return getParenCount() + 1; } JSAtom* getSource() const { return headerAndSource.ptr(); } +#ifdef ENABLE_NEW_REGEXP + JSAtom* patternAtom() const { return patternAtom_; } +#else JSAtom* patternAtom() const { return getSource(); } +#endif JS::RegExpFlags getFlags() const { return flags; }