From 666cd9009388b792556ef8b34984b47dd20a1a08 Mon Sep 17 00:00:00 2001
From: Iain Ireland <iireland@mozilla.com>
Date: Tue, 7 Apr 2020 22:18:53 +0000
Subject: [PATCH] Bug 1627838: Simple string matches r=mgaudet

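Previously, simple string matching was only used when the pattern source was exactly the atom to match. It is now also used when the parse tree is a single atom that differs from the source, as with patterns containing escape characters (like `/\u212a/`).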

The Boyer-Moore heuristic code is taken from here:
https://github.com/v8/v8/blob/8e4a5e973e076adfdc035cfe1d0115f047709160/src/regexp/regexp.cc#L116-L136

Differential Revision: https://phabricator.services.mozilla.com/D69896

--HG--
extra : moz-landing-system : lando
---
 js/src/new-regexp/RegExpAPI.cpp | 94 +++++++++++++++++++++++++++++++++
 js/src/new-regexp/RegExpAPI.h   |  3 ++
 js/src/vm/RegExpObject.cpp      | 63 +++++++++++++++++++---
 js/src/vm/RegExpShared.h        | 24 ++++++++-
 4 files changed, 176 insertions(+), 8 deletions(-)
diff --git a/js/src/new-regexp/RegExpAPI.cpp b/js/src/new-regexp/RegExpAPI.cpp
index 4ac24168342f..04c8383ad939 100644
--- a/js/src/new-regexp/RegExpAPI.cpp
+++ b/js/src/new-regexp/RegExpAPI.cpp
@@ -13,10 +13,13 @@
 #include "mozilla/ArrayUtils.h"
 #include "mozilla/Casting.h"
 
+#include "new-regexp/regexp-compiler.h"
+#include "new-regexp/regexp-macro-assembler-arch.h"
 #include "new-regexp/regexp-parser.h"
 #include "new-regexp/regexp-shim.h"
 #include "new-regexp/regexp.h"
 #include "util/StringBuffer.h"
+#include "vm/RegExpShared.h"
 
 namespace js {
 namespace irregexp {
@@ -30,9 +33,12 @@ using frontend::TokenStreamAnyChars;
 using v8::internal::FlatStringReader;
 using v8::internal::RegExpCompileData;
 using v8::internal::RegExpError;
+using v8::internal::RegExpNode;
 using v8::internal::RegExpParser;
 using v8::internal::Zone;
 
+using namespace v8::internal::regexp_compiler_constants;
+
 static uint32_t ErrorNumber(RegExpError err) {
   switch (err) {
     case RegExpError::kNone:
@@ -234,5 +240,93 @@ bool CheckPatternSyntax(JSContext* cx, TokenStreamAnyChars& ts,
   return true;
 }
 
+// A regexp is a good candidate for Boyer-Moore if the first |length| chars
+// have at least 3 times as many characters as distinct characters. As in
+// irregexp's table lookups, characters are compared modulo tableSize (128).
+template <typename CharT>
+static bool HasFewDifferentCharacters(const CharT* chars, size_t length) {
+  const uint32_t tableSize =
+      v8::internal::NativeRegExpMacroAssembler::kTableSize;
+  bool character_found[tableSize];  // Indexed by character % tableSize.
+  uint32_t different = 0;           // Number of distinct buckets seen so far.
+  memset(&character_found[0], 0, sizeof(character_found));
+  for (uint32_t i = 0; i < length; i++) {
+    uint32_t ch = chars[i] % tableSize;
+    if (!character_found[ch]) {
+      character_found[ch] = true;
+      different++;
+      // Bail out as soon as 3 * different exceeds length: the pattern can no
+      // longer have 3 times as many characters as distinct characters.
+      if (different * 3 > length) {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+// Returns true for patterns where Boyer-Moore is faster than string search.
+static bool UseBoyerMoore(HandleAtom pattern, JS::AutoAssertNoGC& nogc) {
+  size_t length =
+      std::min(size_t(kMaxLookaheadForBoyerMoore), pattern->length());
+  if (length <= kPatternTooShortForBoyerMoore) {
+    return false;  // Too short for Boyer-Moore to pay off.
+  }
+
+  if (pattern->hasLatin1Chars()) {
+    return HasFewDifferentCharacters(pattern->latin1Chars(nogc), length);
+  }
+  MOZ_ASSERT(pattern->hasTwoByteChars());
+  return HasFewDifferentCharacters(pattern->twoByteChars(nogc), length);
+}
+
+// Parse |re|'s source and switch |re| to atom matching when that suffices.
+bool CompilePattern(JSContext* cx, MutableHandleRegExpShared re,
+                    HandleLinearString input) {
+  RootedAtom pattern(cx, re->getSource());
+  JS::RegExpFlags flags = re->getFlags();
+  LifoAllocScope allocScope(&cx->tempLifoAlloc());
+  Zone zone(allocScope.alloc());
+
+  RegExpCompileData data;
+  {
+    FlatStringReader patternBytes(pattern);
+    if (!RegExpParser::ParseRegExp(cx->isolate, &zone, &patternBytes, flags,
+                                   &data)) {
+      MOZ_ASSERT(data.error == RegExpError::kStackOverflow);
+      JS::CompileOptions options(cx);
+      TokenStream dummyTokenStream(cx, options, nullptr, 0, nullptr);
+      ReportSyntaxError(dummyTokenStream, data, pattern);
+      return false;
+    }
+  }
+
+  if (re->kind() == RegExpShared::Kind::Unparsed) {
+    // First compilation of this regexp: check whether simple string search
+    // with an atom can be used instead of the regexp engine.
+    if (!flags.ignoreCase() && !flags.sticky()) {
+      RootedAtom searchAtom(cx);
+      if (data.simple) {
+        // The parse-tree is a single atom that is equal to the pattern.
+        searchAtom = re->getSource();
+      } else if (data.tree->IsAtom() && data.capture_count == 0) {
+        // The parse-tree is a single atom that is not equal to the pattern.
+        v8::internal::RegExpAtom* atom = data.tree->AsAtom();
+        const char16_t* twoByteChars = atom->data().begin();
+        searchAtom = AtomizeChars(cx, twoByteChars, atom->length());
+        if (!searchAtom) {
+          return false;
+        }
+      }
+      JS::AutoAssertNoGC nogc(cx);
+      if (searchAtom && !UseBoyerMoore(searchAtom, nogc)) {
+        re->useAtomMatch(searchAtom);
+        return true;
+      }
+    }
+  }
+  MOZ_CRASH("TODO");  // Non-atom compilation is not implemented yet.
+}
+
 }  // namespace irregexp
 }  // namespace js
diff --git a/js/src/new-regexp/RegExpAPI.h b/js/src/new-regexp/RegExpAPI.h
index fdb82d39e648..f8d356cc0bf5 100644
--- a/js/src/new-regexp/RegExpAPI.h
+++ b/js/src/new-regexp/RegExpAPI.h
@@ -24,6 +24,9 @@ bool CheckPatternSyntax(JSContext* cx, frontend::TokenStreamAnyChars& ts,
 bool CheckPatternSyntax(JSContext* cx, frontend::TokenStreamAnyChars& ts,
                         HandleAtom pattern, JS::RegExpFlags flags);
 
+bool CompilePattern(JSContext* cx, MutableHandleRegExpShared re,
+                    HandleLinearString input);
+
 }  // namespace irregexp
 }  // namespace js
 
diff --git a/js/src/vm/RegExpObject.cpp b/js/src/vm/RegExpObject.cpp
index d905012c594a..c362a538e432 100644
--- a/js/src/vm/RegExpObject.cpp
+++ b/js/src/vm/RegExpObject.cpp
@@ -946,10 +946,7 @@ bool js::StringHasRegExpMetaChars(JSLinearString* str) {
 /* RegExpShared */
 
 RegExpShared::RegExpShared(JSAtom* source, RegExpFlags flags)
-    : headerAndSource(source),
-      parenCount(0),
-      flags(flags),
-      canStringMatch(false) {}
+    : headerAndSource(source), parenCount(0), flags(flags) {}
 
 void RegExpShared::traceChildren(JSTracer* trc) {
   // Discard code to avoid holding onto ExecutablePools.
@@ -958,9 +955,19 @@ void RegExpShared::traceChildren(JSTracer* trc) {
   }
 
   TraceNullableEdge(trc, &headerAndSource, "RegExpShared source");
+#ifdef ENABLE_NEW_REGEXP
+  if (kind() == RegExpShared::Kind::Atom) {
+    TraceNullableEdge(trc, &patternAtom_, "RegExpShared pattern atom");
+  } else {
+    for (auto& comp : compilationArray) {
+      TraceNullableEdge(trc, &comp.jitCode, "RegExpShared code");
+    }
+  }
+#else
   for (auto& comp : compilationArray) {
     TraceNullableEdge(trc, &comp.jitCode, "RegExpShared code");
   }
+#endif
 }
 
 void RegExpShared::discardJitCode() {
@@ -1003,7 +1010,17 @@ bool RegExpShared::compileIfNecessary(JSContext* cx,
                                       MutableHandleRegExpShared re,
                                       HandleLinearString input,
                                       ForceByteCodeEnum force) {
-  MOZ_CRASH("TODO");
+  bool needsCompile = false;
+  if (re->kind() == RegExpShared::Kind::Unparsed) {
+    needsCompile = true;
+  }
+
+  // TODO: tier-up from interpreter to generated code
+
+  if (needsCompile) {
+    return irregexp::CompilePattern(cx, re, input);
+  }
+  return true;
 }
 
 /* static */
@@ -1011,9 +1028,40 @@ RegExpRunStatus RegExpShared::execute(JSContext* cx,
                                       MutableHandleRegExpShared re,
                                       HandleLinearString input, size_t start,
                                       VectorMatchPairs* matches) {
+  MOZ_ASSERT(matches);
+
+  // TODO: Add tracelogger support
+
+  /* Compile the code at point-of-use. */
+  if (!compileIfNecessary(cx, re, input, DontForceByteCode)) {
+    return RegExpRunStatus_Error;
+  }
+
+  /*
+   * Ensure sufficient memory for output vector.
+   * No need to initialize it. The RegExp engine fills it in on a match.
+   */
+  if (!matches->allocOrExpandArray(re->pairCount())) {
+    ReportOutOfMemory(cx);
+    return RegExpRunStatus_Error;
+  }
+
+  if (re->kind() == RegExpShared::Kind::Atom) {
+    return RegExpShared::executeAtom(cx, re, input, start, matches);
+  }
+
   MOZ_CRASH("TODO");
 }
-#else
+
+void RegExpShared::useAtomMatch(HandleAtom pattern) {
+  MOZ_ASSERT(kind() == RegExpShared::Kind::Unparsed);
+  kind_ = RegExpShared::Kind::Atom;  // execute() will call executeAtom().
+  patternAtom_ = pattern;
+  parenCount = 0;  // Whole-match pair only; no capture groups.
+}
+
+#else   // !ENABLE_NEW_REGEXP
+
 /* static */
 bool RegExpShared::compile(JSContext* cx, MutableHandleRegExpShared re,
                            HandleAtom pattern, HandleLinearString input,
@@ -1185,7 +1233,8 @@ RegExpRunStatus RegExpShared::execute(JSContext* cx,
   }
   return result;
 }
-#endif  // ENABLE_NEW_REGEXP
+#endif  // !ENABLE_NEW_REGEXP
+
 /* static */
 RegExpRunStatus RegExpShared::executeAtom(JSContext* cx,
                                           MutableHandleRegExpShared re,
diff --git a/js/src/vm/RegExpShared.h b/js/src/vm/RegExpShared.h
index 8658ef5bb485..3f711d5340fd 100644
--- a/js/src/vm/RegExpShared.h
+++ b/js/src/vm/RegExpShared.h
@@ -72,6 +72,7 @@ struct RegExpByteCodeHeader {
 class RegExpShared : public gc::TenuredCell {
  public:
   enum ForceByteCodeEnum { DontForceByteCode, ForceByteCode };
+  enum class Kind { Unparsed, Atom, RegExp };
 
   using JitCodeTable = UniquePtr<uint8_t[], JS::FreePolicy>;
   using JitCodeTables = Vector<JitCodeTable, 0, SystemAllocPolicy>;
@@ -103,7 +104,13 @@ class RegExpShared : public gc::TenuredCell {
 
   uint32_t parenCount;
   JS::RegExpFlags flags;
-  bool canStringMatch;
+
+#ifdef ENABLE_NEW_REGEXP
+  RegExpShared::Kind kind_ = Kind::Unparsed;
+  GCPtrAtom patternAtom_;
+#else
+  bool canStringMatch = false;
+#endif
 
   static int CompilationIndex(bool latin1) { return latin1 ? 0 : 1; }
 
@@ -150,16 +157,31 @@ class RegExpShared : public gc::TenuredCell {
   /* Accessors */
 
   size_t getParenCount() const {
+#ifdef ENABLE_NEW_REGEXP
+    MOZ_ASSERT(kind() != Kind::Unparsed);
+#else
     MOZ_ASSERT(isCompiled());
+#endif
     return parenCount;
   }
 
+#ifdef ENABLE_NEW_REGEXP
+  RegExpShared::Kind kind() const { return kind_; }
+
+  // Use simple string matching for this regexp.
+  void useAtomMatch(HandleAtom pattern);
+#endif
+
   /* Accounts for the "0" (whole match) pair. */
   size_t pairCount() const { return getParenCount() + 1; }
 
   JSAtom* getSource() const { return headerAndSource.ptr(); }
 
+#ifdef ENABLE_NEW_REGEXP
+  JSAtom* patternAtom() const { return patternAtom_; }
+#else
   JSAtom* patternAtom() const { return getSource(); }
+#endif
 
   JS::RegExpFlags getFlags() const { return flags; }