Bug 1627838: Simple string matches r=mgaudet

The old version only supported matching the exact pattern atom. This version supports patterns with escape characters (like `/\u212a/`). The Boyer-Moore heuristic code is taken from V8's src/regexp/regexp.cc at revision 8e4a5e973e (L116-L136). Differential Revision: https://phabricator.services.mozilla.com/D69896 --HG-- extra : moz-landing-system : lando
2020-04-07 22:18:53 +00:00 · 2020-04-07 22:18:53 +00:00 · 666cd90093
--- a/js/src/new-regexp/RegExpAPI.cpp
+++ b/js/src/new-regexp/RegExpAPI.cpp
@ -13,10 +13,13 @@
 #include "mozilla/ArrayUtils.h"
 #include "mozilla/Casting.h"

+#include "new-regexp/regexp-compiler.h"
+#include "new-regexp/regexp-macro-assembler-arch.h"
 #include "new-regexp/regexp-parser.h"
 #include "new-regexp/regexp-shim.h"
 #include "new-regexp/regexp.h"
 #include "util/StringBuffer.h"
+#include "vm/RegExpShared.h"

 namespace js {
 namespace irregexp {
@ -30,9 +33,12 @@ using frontend::TokenStreamAnyChars;
 using v8::internal::FlatStringReader;
 using v8::internal::RegExpCompileData;
 using v8::internal::RegExpError;
+using v8::internal::RegExpNode;
 using v8::internal::RegExpParser;
 using v8::internal::Zone;

+using namespace v8::internal::regexp_compiler_constants;
+
 static uint32_t ErrorNumber(RegExpError err) {
  switch (err) {
    case RegExpError::kNone:
@ -234,5 +240,93 @@ bool CheckPatternSyntax(JSContext* cx, TokenStreamAnyChars& ts,
  return true;
 }

+// Returns true if the |length| characters at |chars| contain at most length/3
+// distinct values, i.e. a low-alphabet input that suits Boyer-Moore. Values are
+// counted modulo tableSize (128) because irregexp's table lookups are too.
+template <typename CharT>
+static bool HasFewDifferentCharacters(const CharT* chars, size_t length) {
+  const uint32_t tableSize =
+      v8::internal::NativeRegExpMacroAssembler::kTableSize;
+  bool character_found[tableSize];
+  uint32_t different = 0;
+  memset(&character_found[0], 0, sizeof(character_found));
+  for (uint32_t i = 0; i < length; i++) {
+    uint32_t ch = chars[i] % tableSize;
+    if (!character_found[ch]) {
+      character_found[ch] = true;
+      different++;
+      // Stop early once the distinct count exceeds a third of |length|: the
+      // input can no longer have 3 times as many characters as distinct ones.
+      if (different * 3 > length) {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+// True if |pattern| is long and low-alphabet enough for Boyer-Moore to pay off.
+static bool UseBoyerMoore(HandleAtom pattern, JS::AutoAssertNoGC& nogc) {
+  size_t length =
+      std::min(size_t(kMaxLookaheadForBoyerMoore), pattern->length());
+  if (length <= kPatternTooShortForBoyerMoore) {
+    return false;
+  }
+
+  if (pattern->hasLatin1Chars()) {
+    return HasFewDifferentCharacters(pattern->latin1Chars(nogc), length);
+  }
+  MOZ_ASSERT(pattern->hasTwoByteChars());
+  return HasFewDifferentCharacters(pattern->twoByteChars(nogc), length);
+}
+
+bool CompilePattern(JSContext* cx, MutableHandleRegExpShared re,
+                    HandleLinearString input) {
+  RootedAtom pattern(cx, re->getSource());
+  JS::RegExpFlags flags = re->getFlags();
+  LifoAllocScope allocScope(&cx->tempLifoAlloc());
+  Zone zone(allocScope.alloc());
+
+  RegExpCompileData data;  // Populated by ParseRegExp below.
+  {
+    FlatStringReader patternBytes(pattern);
+    if (!RegExpParser::ParseRegExp(cx->isolate, &zone, &patternBytes, flags,
+                                   &data)) {
+      MOZ_ASSERT(data.error == RegExpError::kStackOverflow);
+      JS::CompileOptions options(cx);
+      TokenStream dummyTokenStream(cx, options, nullptr, 0, nullptr);
+      ReportSyntaxError(dummyTokenStream, data, pattern);
+      return false;
+    }
+  }
+
+  if (re->kind() == RegExpShared::Kind::Unparsed) {
+    // Kind is still Unparsed, so this regexp has not been compiled before.
+    // Try the cheap path first: case-sensitive, non-sticky patterns that
+    // parse to a single atom may be matched with plain string search.
+    if (!flags.ignoreCase() && !flags.sticky()) {
+      RootedAtom searchAtom(cx);
+      if (data.simple) {
+        // The parse tree is one atom identical to the source; reuse the source.
+        searchAtom = re->getSource();
+      } else if (data.tree->IsAtom() && data.capture_count == 0) {
+        // Single atom differing from the source (e.g. escapes): atomize it.
+        v8::internal::RegExpAtom* atom = data.tree->AsAtom();
+        const char16_t* twoByteChars = atom->data().begin();
+        searchAtom = AtomizeChars(cx, twoByteChars, atom->length());
+        if (!searchAtom) {
+          return false;
+        }
+      }
+      JS::AutoAssertNoGC nogc(cx);
+      if (searchAtom && !UseBoyerMoore(searchAtom, nogc)) {
+        re->useAtomMatch(searchAtom);
+        return true;
+      }
+    }
+  }
+  MOZ_CRASH("TODO");  // All paths other than atom matching are unimplemented.
+}
+
 }  // namespace irregexp
 }  // namespace js
--- a/js/src/new-regexp/RegExpAPI.h
+++ b/js/src/new-regexp/RegExpAPI.h
@ -24,6 +24,9 @@ bool CheckPatternSyntax(JSContext* cx, frontend::TokenStreamAnyChars& ts,
 bool CheckPatternSyntax(JSContext* cx, frontend::TokenStreamAnyChars& ts,
                        HandleAtom pattern, JS::RegExpFlags flags);

+bool CompilePattern(JSContext* cx, MutableHandleRegExpShared re,
+                    HandleLinearString input);
+
 }  // namespace irregexp
 }  // namespace js

--- a/js/src/vm/RegExpObject.cpp
+++ b/js/src/vm/RegExpObject.cpp
@ -946,10 +946,7 @@ bool js::StringHasRegExpMetaChars(JSLinearString* str) {
 /* RegExpShared */

 RegExpShared::RegExpShared(JSAtom* source, RegExpFlags flags)
-    : headerAndSource(source),
-      parenCount(0),
-      flags(flags),
-      canStringMatch(false) {}
+    : headerAndSource(source), parenCount(0), flags(flags) {}

 void RegExpShared::traceChildren(JSTracer* trc) {
  // Discard code to avoid holding onto ExecutablePools.
@ -958,9 +955,19 @@ void RegExpShared::traceChildren(JSTracer* trc) {
  }

  TraceNullableEdge(trc, &headerAndSource, "RegExpShared source");
+#ifdef ENABLE_NEW_REGEXP
+  if (kind() == RegExpShared::Kind::Atom) {
+    TraceNullableEdge(trc, &patternAtom_, "RegExpShared pattern atom");
+  } else {
+    for (auto& comp : compilationArray) {
+      TraceNullableEdge(trc, &comp.jitCode, "RegExpShared code");
+    }
+  }
+#else
  for (auto& comp : compilationArray) {
    TraceNullableEdge(trc, &comp.jitCode, "RegExpShared code");
  }
+#endif
 }

 void RegExpShared::discardJitCode() {
@ -1003,7 +1010,17 @@ bool RegExpShared::compileIfNecessary(JSContext* cx,
                                      MutableHandleRegExpShared re,
                                      HandleLinearString input,
                                      ForceByteCodeEnum force) {
-  MOZ_CRASH("TODO");
+  bool needsCompile = false;
+  if (re->kind() == RegExpShared::Kind::Unparsed) {
+    needsCompile = true;
+  }
+
+  // TODO: tier-up from interpreter to generated code
+
+  if (needsCompile) {
+    return irregexp::CompilePattern(cx, re, input);
+  }
+  return true;
 }

 /* static */
@ -1011,9 +1028,40 @@ RegExpRunStatus RegExpShared::execute(JSContext* cx,
                                      MutableHandleRegExpShared re,
                                      HandleLinearString input, size_t start,
                                      VectorMatchPairs* matches) {
+  MOZ_ASSERT(matches);
+
+  // TODO: Add tracelogger support
+
+  /* Compile the code at point-of-use. */
+  if (!compileIfNecessary(cx, re, input, DontForceByteCode)) {
+    return RegExpRunStatus_Error;
+  }
+
+  /*
+   * Ensure sufficient memory for output vector.
+   * No need to initialize it. The RegExp engine fills them in on a match.
+   */
+  if (!matches->allocOrExpandArray(re->pairCount())) {
+    ReportOutOfMemory(cx);
+    return RegExpRunStatus_Error;
+  }
+
+  if (re->kind() == RegExpShared::Kind::Atom) {
+    return RegExpShared::executeAtom(cx, re, input, start, matches);
+  }
+
  MOZ_CRASH("TODO");
 }
-#else
+
+void RegExpShared::useAtomMatch(HandleAtom pattern) {
+  MOZ_ASSERT(kind() == RegExpShared::Kind::Unparsed);
+  kind_ = RegExpShared::Kind::Atom;  // Switch to simple string matching.
+  patternAtom_ = pattern;
+  parenCount = 0;  // pairCount() is then 1: just the whole-match pair.
+}
+
+#else   // !ENABLE_NEW_REGEXP
+
 /* static */
 bool RegExpShared::compile(JSContext* cx, MutableHandleRegExpShared re,
                           HandleAtom pattern, HandleLinearString input,
@ -1185,7 +1233,8 @@ RegExpRunStatus RegExpShared::execute(JSContext* cx,
  }
  return result;
 }
-#endif  // ENABLE_NEW_REGEXP
+#endif  // !ENABLE_NEW_REGEXP
+
 /* static */
 RegExpRunStatus RegExpShared::executeAtom(JSContext* cx,
                                          MutableHandleRegExpShared re,
--- a/js/src/vm/RegExpShared.h
+++ b/js/src/vm/RegExpShared.h
@ -72,6 +72,7 @@ struct RegExpByteCodeHeader {
 class RegExpShared : public gc::TenuredCell {
 public:
  enum ForceByteCodeEnum { DontForceByteCode, ForceByteCode };
+  enum class Kind { Unparsed, Atom, RegExp };

  using JitCodeTable = UniquePtr<uint8_t[], JS::FreePolicy>;
  using JitCodeTables = Vector<JitCodeTable, 0, SystemAllocPolicy>;
@ -103,7 +104,13 @@ class RegExpShared : public gc::TenuredCell {

  uint32_t parenCount;
  JS::RegExpFlags flags;
-  bool canStringMatch;
+
+#ifdef ENABLE_NEW_REGEXP
+  RegExpShared::Kind kind_ = Kind::Unparsed;
+  GCPtrAtom patternAtom_;
+#else
+  bool canStringMatch = false;
+#endif

  static int CompilationIndex(bool latin1) { return latin1 ? 0 : 1; }

@ -150,16 +157,31 @@ class RegExpShared : public gc::TenuredCell {
  /* Accessors */

  size_t getParenCount() const {
+#ifdef ENABLE_NEW_REGEXP
+    MOZ_ASSERT(kind() != Kind::Unparsed);
+#else
    MOZ_ASSERT(isCompiled());
+#endif
    return parenCount;
  }

+#ifdef ENABLE_NEW_REGEXP
+  RegExpShared::Kind kind() const { return kind_; }
+
+  // Use simple string matching for this regexp.
+  void useAtomMatch(HandleAtom pattern);
+#endif
+
  /* Accounts for the "0" (whole match) pair. */
  size_t pairCount() const { return getParenCount() + 1; }

  JSAtom* getSource() const { return headerAndSource.ptr(); }

+#ifdef ENABLE_NEW_REGEXP
+  JSAtom* patternAtom() const { return patternAtom_; }
+#else
  JSAtom* patternAtom() const { return getSource(); }
+#endif

  JS::RegExpFlags getFlags() const { return flags; }