make a parameterized module out of the RegexpMatching implementation

2022-03-22 11:42:11 +01:00 · 2022-03-22 11:42:11 +01:00 · dbeae9aefb
--- a/config/identical-files.json
+++ b/config/identical-files.json
@ -501,6 +501,11 @@
    "ruby/ql/lib/codeql/ruby/security/performance/SuperlinearBackTracking.qll",
    "java/ql/lib/semmle/code/java/security/performance/SuperlinearBackTracking.qll"
  ],
+  "RegexpMatching Python/JS/Ruby": [
+    "javascript/ql/lib/semmle/javascript/security/performance/RegexpMatching.qll",
+    "python/ql/lib/semmle/python/security/performance/RegexpMatching.qll",
+    "ruby/ql/lib/codeql/ruby/security/performance/RegexpMatching.qll"
+  ],
  "BadTagFilterQuery Python/JS/Ruby": [
    "javascript/ql/lib/semmle/javascript/security/BadTagFilterQuery.qll",
    "python/ql/lib/semmle/python/security/BadTagFilterQuery.qll",
--- a/javascript/ql/lib/semmle/javascript/security/BadTagFilterQuery.qll
+++ b/javascript/ql/lib/semmle/javascript/security/BadTagFilterQuery.qll
@ -2,196 +2,25 @@
 * Provides precicates for reasoning about bad tag filter vulnerabilities.
 */

-import performance.ReDoSUtil
+import performance.RegexpMatching

 /**
- * A module for determining if a regexp matches a given string,
- * and reasoning about which capture groups are filled by a given string.
+ * Holds if the regexp `root` should be tested against `str`.
+ * Implements the `isRegexpMatchingCandidateSig` signature from `RegexpMatching`.
 */
-private module RegexpMatching {
-  /**
-   * A class to test whether a regular expression matches a string.
-   * Override this class and extend `test`/`testWithGroups` to configure which strings should be tested for acceptance by this regular expression.
-   * The result can afterwards be read from the `matches` predicate.
-   *
-   * Strings in the `testWithGroups` predicate are also tested for which capture groups are filled by the given string.
-   * The result is available in the `fillCaptureGroup` predicate.
-   */
-  abstract class MatchedRegExp extends RegExpTerm {
-    MatchedRegExp() { this.isRootTerm() }
-
-    /**
-     * Holds if it should be tested whether this regular expression matches `str`.
-     *
-     * If `ignorePrefix` is true, then a regexp without a start anchor will be treated as if it had a start anchor.
-     * E.g. a regular expression `/foo$/` will match any string that ends with "foo",
-     * but if `ignorePrefix` is true, it will only match "foo".
-     */
-    predicate test(string str, boolean ignorePrefix) {
-      none() // maybe overridden in subclasses
-    }
-
-    /**
-     * Same as `test(..)`, but where the `fillsCaptureGroup` afterwards tells which capture groups were filled by the given string.
-     */
-    predicate testWithGroups(string str, boolean ignorePrefix) {
-      none() // maybe overridden in subclasses
-    }
-
-    /**
-     * Holds if this RegExp matches `str`, where `str` is either in the `test` or `testWithGroups` predicate.
-     */
-    final predicate matches(string str) {
-      exists(State state | state = getAState(this, str.length() - 1, str, _) |
-        epsilonSucc*(state) = Accept(_)
-      )
-    }
-
-    /**
-     * Holds if matching `str` may fill capture group number `g`.
-     * Only holds if `str` is in the `testWithGroups` predicate.
-     */
-    final predicate fillsCaptureGroup(string str, int g) {
-      exists(State s |
-        s = getAStateThatReachesAccept(this, _, str, _) and
-        g = group(s.getRepr())
-      )
-    }
-  }
-
-  /**
-   * Gets a state the regular expression `reg` can be in after matching the `i`th char in `str`.
-   * The regular expression is modeled as a non-determistic finite automaton,
-   * the regular expression can therefore be in multiple states after matching a character.
-   *
-   * It's a forward search to all possible states, and there is thus no guarantee that the state is on a path to an accepting state.
-   */
-  private State getAState(MatchedRegExp reg, int i, string str, boolean ignorePrefix) {
-    // start state, the -1 position before any chars have been matched
-    i = -1 and
-    (
-      reg.test(str, ignorePrefix)
-      or
-      reg.testWithGroups(str, ignorePrefix)
-    ) and
-    result.getRepr().getRootTerm() = reg and
-    isStartState(result)
+private predicate isBadTagFilterCandidate(
+  RootTerm root, string str, boolean ignorePrefix, boolean testWithGroups
+) {
+  // the regexp must mention "<" and ">" explicitly.
+  forall(string angleBracket | angleBracket = ["<", ">"] |
+    any(RegExpConstant term | term.getValue().matches("%" + angleBracket + "%")).getRootTerm() =
+      root
+  ) and
+  ignorePrefix = true and
+  (
+    str = ["<!-- foo -->", "<!-- foo --!>", "<!- foo ->", "<foo>", "<script>"] and
+    testWithGroups = true
    or
-    // recursive case
-    result = getAStateAfterMatching(reg, _, str, i, _, ignorePrefix)
-  }
-
-  /**
-   * Gets the next state after the `prev` state from `reg`.
-   * `prev` is the state after matching `fromIndex` chars in `str`,
-   * and the result is the state after matching `toIndex` chars in `str`.
-   *
-   * This predicate is used as a step relation in the forwards search (`getAState`),
-   * and also as a step relation in the later backwards search (`getAStateThatReachesAccept`).
-   */
-  private State getAStateAfterMatching(
-    MatchedRegExp reg, State prev, string str, int toIndex, int fromIndex, boolean ignorePrefix
-  ) {
-    // the basic recursive case - outlined into a noopt helper to make performance work out.
-    result = getAStateAfterMatchingAux(reg, prev, str, toIndex, fromIndex, ignorePrefix)
-    or
-    // we can skip past word boundaries if the next char is a non-word char.
-    fromIndex = toIndex and
-    prev.getRepr() instanceof RegExpWordBoundary and
-    prev = getAState(reg, toIndex, str, ignorePrefix) and
-    after(prev.getRepr()) = result and
-    str.charAt(toIndex + 1).regexpMatch("\\W") // \W matches any non-word char.
-  }
-
-  pragma[noopt]
-  private State getAStateAfterMatchingAux(
-    MatchedRegExp reg, State prev, string str, int toIndex, int fromIndex, boolean ignorePrefix
-  ) {
-    prev = getAState(reg, fromIndex, str, ignorePrefix) and
-    fromIndex = toIndex - 1 and
-    exists(string char | char = str.charAt(toIndex) | specializedDeltaClosed(prev, char, result)) and
-    not discardedPrefixStep(prev, result, ignorePrefix)
-  }
-
-  /** Holds if a step from `prev` to `next` should be discarded when the `ignorePrefix` flag is set. */
-  private predicate discardedPrefixStep(State prev, State next, boolean ignorePrefix) {
-    prev = mkMatch(any(RegExpRoot r)) and
-    ignorePrefix = true and
-    next = prev
-  }
-
-  // The `deltaClosed` relation specialized to the chars that exists in strings tested by a `MatchedRegExp`.
-  private predicate specializedDeltaClosed(State prev, string char, State next) {
-    deltaClosed(prev, specializedGetAnInputSymbolMatching(char), next)
-  }
-
-  // The `getAnInputSymbolMatching` relation specialized to the chars that exists in strings tested by a `MatchedRegExp`.
-  pragma[noinline]
-  private InputSymbol specializedGetAnInputSymbolMatching(string char) {
-    exists(string s, MatchedRegExp r |
-      r.test(s, _)
-      or
-      r.testWithGroups(s, _)
-    |
-      char = s.charAt(_)
-    ) and
-    result = getAnInputSymbolMatching(char)
-  }
-
-  /**
-   * Gets the `i`th state on a path to the accepting state when `reg` matches `str`.
-   * Starts with an accepting state as found by `getAState` and searches backwards
-   * to the start state through the reachable states (as found by `getAState`).
-   *
-   * This predicate holds the invariant that the result state can be reached with `i` steps from a start state,
-   * and an accepting state can be found after (`str.length() - 1 - i`) steps from the result.
-   * The result state is therefore always on a valid path where `reg` accepts `str`.
-   *
-   * This predicate is only used to find which capture groups a regular expression has filled,
-   * and thus the search is only performed for the strings in the `testWithGroups(..)` predicate.
-   */
-  private State getAStateThatReachesAccept(
-    MatchedRegExp reg, int i, string str, boolean ignorePrefix
-  ) {
-    // base case, reaches an accepting state from the last state in `getAState(..)`
-    reg.testWithGroups(str, ignorePrefix) and
-    i = str.length() - 1 and
-    result = getAState(reg, i, str, ignorePrefix) and
-    epsilonSucc*(result) = Accept(_)
-    or
-    // recursive case. `next` is the next state to be matched after matching `prev`.
-    // this predicate is doing a backwards search, so `prev` is the result we are looking for.
-    exists(State next, State prev, int fromIndex, int toIndex |
-      next = getAStateThatReachesAccept(reg, toIndex, str, ignorePrefix) and
-      next = getAStateAfterMatching(reg, prev, str, toIndex, fromIndex, ignorePrefix) and
-      i = fromIndex and
-      result = prev
-    )
-  }
-
-  /** Gets the capture group number that `term` belongs to. */
-  private int group(RegExpTerm term) {
-    exists(RegExpGroup grp | grp.getNumber() = result | term.getParent*() = grp)
-  }
-}
-
-/** A class to test whether a regular expression matches certain HTML tags. */
-class HtmlMatchingRegExp extends RegexpMatching::MatchedRegExp {
-  HtmlMatchingRegExp() {
-    // the regexp must mention "<" and ">" explicitly.
-    forall(string angleBracket | angleBracket = ["<", ">"] |
-      any(RegExpConstant term | term.getValue().matches("%" + angleBracket + "%")).getRootTerm() =
-        this
-    )
-  }
-
-  override predicate testWithGroups(string str, boolean ignorePrefix) {
-    ignorePrefix = true and
-    str = ["<!-- foo -->", "<!-- foo --!>", "<!- foo ->", "<foo>", "<script>"]
-  }
-
-  override predicate test(string str, boolean ignorePrefix) {
-    ignorePrefix = true and
    str =
      [
        "<!-- foo -->", "<!- foo ->", "<!-- foo --!>", "<!-- foo\n -->", "<script>foo</script>",
@ -200,12 +29,25 @@ class HtmlMatchingRegExp extends RegexpMatching::MatchedRegExp {
        "<script src='foo'></script>", "<SCRIPT>foo</SCRIPT>", "<script\tsrc=\"foo\"/>",
        "<script\tsrc='foo'></script>", "<sCrIpT>foo</ScRiPt>", "<script src=\"foo\">foo</script >",
        "<script src=\"foo\">foo</script foo=\"bar\">", "<script src=\"foo\">foo</script\t\n bar>"
-      ]
-  }
+      ] and
+    testWithGroups = false
+  )
 }

-/** DEPRECATED: Alias for HtmlMatchingRegExp */
-deprecated class HTMLMatchingRegExp = HtmlMatchingRegExp;
+/**
+ * A regexp that matches some string from the `isBadTagFilterCandidate` predicate.
+ */
+class HtmlMatchingRegExp extends RootTerm {
+  HtmlMatchingRegExp() { RegexpMatching<isBadTagFilterCandidate/4>::matches(this, _) }
+
+  /** Holds if this regexp matched `str`, where `str` is one of the string from `isBadTagFilterCandidate`. */
+  predicate matches(string str) { RegexpMatching<isBadTagFilterCandidate/4>::matches(this, str) }
+
+  /** Holds if this regexp fills capture group `g' when matching `str', where `str` is one of the string from `isBadTagFilterCandidate`. */
+  predicate fillsCaptureGroup(string str, int g) {
+    RegexpMatching<isBadTagFilterCandidate/4>::fillsCaptureGroup(this, str, g)
+  }
+}

 /**
 * Holds if `regexp` matches some HTML tags, but misses some HTML tags that it should match.
--- a/javascript/ql/lib/semmle/javascript/security/performance/RegexpMatching.qll
+++ b/javascript/ql/lib/semmle/javascript/security/performance/RegexpMatching.qll
@ -0,0 +1,175 @@
+/**
+ * Provides precicates for reasoning about which strings are matched by a regular expression,
+ * and for testing which capture groups are filled when a particular regexp matches a string.
+ */
+
+import ReDoSUtil
+
+/** A root term */
+class RootTerm extends RegExpTerm {
+  RootTerm() { this.isRootTerm() }
+}
+
+/**
+ * Holds if it should be tested whether `reg` matches `str`.
+ *
+ * If `ignorePrefix` is true, then a regexp without a start anchor will be treated as if it had a start anchor.
+ * E.g. a regular expression `/foo$/` will match any string that ends with "foo",
+ * but if `ignorePrefix` is true, it will only match "foo".
+ *
+ * If `testWithGroups` is true, then the `RegexpMatching::fillsCaptureGroup` predicate can be used to determine which capture
+ * groups are filled by a string.
+ */
+signature predicate isRegexpMatchingCandidateSig(
+  RootTerm root, string str, boolean ignorePrefix, boolean testWithGroups
+);
+
+/**
+ * A module for determining if a regexp matches a given string,
+ * and reasoning about which capture groups are filled by a given string.
+ *
+ * The module parameter `isRegexpMatchingCandidateSig` determines which string should be tested,
+ * and the results can be read from the `matches` and `fillsCaptureGroup` predicates.
+ */
+module RegexpMatching<isRegexpMatchingCandidateSig/4 isCandidate> {
+  private predicate test(RootTerm reg, string str, boolean ignorePrefix) {
+    isCandidate(reg, str, ignorePrefix, false)
+  }
+
+  private predicate testWithGroups(RootTerm reg, string str, boolean ignorePrefix) {
+    isCandidate(reg, str, ignorePrefix, true)
+  }
+
+  /**
+   * Gets a state the regular expression `reg` can be in after matching the `i`th char in `str`.
+   * The regular expression is modeled as a non-determistic finite automaton,
+   * the regular expression can therefore be in multiple states after matching a character.
+   *
+   * It's a forward search to all possible states, and there is thus no guarantee that the state is on a path to an accepting state.
+   */
+  private State getAState(RootTerm reg, int i, string str, boolean ignorePrefix) {
+    // start state, the -1 position before any chars have been matched
+    i = -1 and
+    (
+      test(reg, str, ignorePrefix)
+      or
+      testWithGroups(reg, str, ignorePrefix)
+    ) and
+    result.getRepr().getRootTerm() = reg and
+    isStartState(result)
+    or
+    // recursive case
+    result = getAStateAfterMatching(reg, _, str, i, _, ignorePrefix)
+  }
+
+  /**
+   * Gets the next state after the `prev` state from `reg`.
+   * `prev` is the state after matching `fromIndex` chars in `str`,
+   * and the result is the state after matching `toIndex` chars in `str`.
+   *
+   * This predicate is used as a step relation in the forwards search (`getAState`),
+   * and also as a step relation in the later backwards search (`getAStateThatReachesAccept`).
+   */
+  private State getAStateAfterMatching(
+    RootTerm reg, State prev, string str, int toIndex, int fromIndex, boolean ignorePrefix
+  ) {
+    // the basic recursive case - outlined into a noopt helper to make performance work out.
+    result = getAStateAfterMatchingAux(reg, prev, str, toIndex, fromIndex, ignorePrefix)
+    or
+    // we can skip past word boundaries if the next char is a non-word char.
+    fromIndex = toIndex and
+    prev.getRepr() instanceof RegExpWordBoundary and
+    prev = getAState(reg, toIndex, str, ignorePrefix) and
+    after(prev.getRepr()) = result and
+    str.charAt(toIndex + 1).regexpMatch("\\W") // \W matches any non-word char.
+  }
+
+  pragma[noopt]
+  private State getAStateAfterMatchingAux(
+    RootTerm reg, State prev, string str, int toIndex, int fromIndex, boolean ignorePrefix
+  ) {
+    prev = getAState(reg, fromIndex, str, ignorePrefix) and
+    fromIndex = toIndex - 1 and
+    exists(string char | char = str.charAt(toIndex) | specializedDeltaClosed(prev, char, result)) and
+    not discardedPrefixStep(prev, result, ignorePrefix)
+  }
+
+  /** Holds if a step from `prev` to `next` should be discarded when the `ignorePrefix` flag is set. */
+  private predicate discardedPrefixStep(State prev, State next, boolean ignorePrefix) {
+    prev = mkMatch(any(RegExpRoot r)) and
+    ignorePrefix = true and
+    next = prev
+  }
+
+  // The `deltaClosed` relation specialized to the chars that exists in strings tested by a `MatchedRegExp`.
+  private predicate specializedDeltaClosed(State prev, string char, State next) {
+    deltaClosed(prev, specializedGetAnInputSymbolMatching(char), next)
+  }
+
+  // The `getAnInputSymbolMatching` relation specialized to the chars that exists in strings tested by a `MatchedRegExp`.
+  pragma[noinline]
+  private InputSymbol specializedGetAnInputSymbolMatching(string char) {
+    exists(string s, RootTerm r |
+      test(r, s, _)
+      or
+      testWithGroups(r, s, _)
+    |
+      char = s.charAt(_)
+    ) and
+    result = getAnInputSymbolMatching(char)
+  }
+
+  /**
+   * Gets the `i`th state on a path to the accepting state when `reg` matches `str`.
+   * Starts with an accepting state as found by `getAState` and searches backwards
+   * to the start state through the reachable states (as found by `getAState`).
+   *
+   * This predicate holds the invariant that the result state can be reached with `i` steps from a start state,
+   * and an accepting state can be found after (`str.length() - 1 - i`) steps from the result.
+   * The result state is therefore always on a valid path where `reg` accepts `str`.
+   *
+   * This predicate is only used to find which capture groups a regular expression has filled,
+   * and thus the search is only performed for the strings in the `testWithGroups(..)` predicate.
+   */
+  private State getAStateThatReachesAccept(RootTerm reg, int i, string str, boolean ignorePrefix) {
+    // base case, reaches an accepting state from the last state in `getAState(..)`
+    testWithGroups(reg, str, ignorePrefix) and
+    i = str.length() - 1 and
+    result = getAState(reg, i, str, ignorePrefix) and
+    epsilonSucc*(result) = Accept(_)
+    or
+    // recursive case. `next` is the next state to be matched after matching `prev`.
+    // this predicate is doing a backwards search, so `prev` is the result we are looking for.
+    exists(State next, State prev, int fromIndex, int toIndex |
+      next = getAStateThatReachesAccept(reg, toIndex, str, ignorePrefix) and
+      next = getAStateAfterMatching(reg, prev, str, toIndex, fromIndex, ignorePrefix) and
+      i = fromIndex and
+      result = prev
+    )
+  }
+
+  /** Gets the capture group number that `term` belongs to. */
+  private int group(RegExpTerm term) {
+    exists(RegExpGroup grp | grp.getNumber() = result | term.getParent*() = grp)
+  }
+
+  /**
+   * Holds if `reg` matches `str`, where `str` is either in the `test` or `testWithGroups` predicate.
+   */
+  predicate matches(RootTerm reg, string str) {
+    exists(State state | state = getAState(reg, str.length() - 1, str, _) |
+      epsilonSucc*(state) = Accept(_)
+    )
+  }
+
+  /**
+   * Holds if matching `str` against `reg` may fill capture group number `g`.
+   * Only holds if `str` is in the `testWithGroups` predicate.
+   */
+  predicate fillsCaptureGroup(RootTerm reg, string str, int g) {
+    exists(State s |
+      s = getAStateThatReachesAccept(reg, _, str, _) and
+      g = group(s.getRepr())
+    )
+  }
+}
--- a/python/ql/lib/semmle/python/security/BadTagFilterQuery.qll
+++ b/python/ql/lib/semmle/python/security/BadTagFilterQuery.qll
@ -2,196 +2,25 @@
 * Provides precicates for reasoning about bad tag filter vulnerabilities.
 */

-import performance.ReDoSUtil
+import performance.RegexpMatching

 /**
- * A module for determining if a regexp matches a given string,
- * and reasoning about which capture groups are filled by a given string.
+ * Holds if the regexp `root` should be tested against `str`.
+ * Implements the `isRegexpMatchingCandidateSig` signature from `RegexpMatching`.
 */
-private module RegexpMatching {
-  /**
-   * A class to test whether a regular expression matches a string.
-   * Override this class and extend `test`/`testWithGroups` to configure which strings should be tested for acceptance by this regular expression.
-   * The result can afterwards be read from the `matches` predicate.
-   *
-   * Strings in the `testWithGroups` predicate are also tested for which capture groups are filled by the given string.
-   * The result is available in the `fillCaptureGroup` predicate.
-   */
-  abstract class MatchedRegExp extends RegExpTerm {
-    MatchedRegExp() { this.isRootTerm() }
-
-    /**
-     * Holds if it should be tested whether this regular expression matches `str`.
-     *
-     * If `ignorePrefix` is true, then a regexp without a start anchor will be treated as if it had a start anchor.
-     * E.g. a regular expression `/foo$/` will match any string that ends with "foo",
-     * but if `ignorePrefix` is true, it will only match "foo".
-     */
-    predicate test(string str, boolean ignorePrefix) {
-      none() // maybe overridden in subclasses
-    }
-
-    /**
-     * Same as `test(..)`, but where the `fillsCaptureGroup` afterwards tells which capture groups were filled by the given string.
-     */
-    predicate testWithGroups(string str, boolean ignorePrefix) {
-      none() // maybe overridden in subclasses
-    }
-
-    /**
-     * Holds if this RegExp matches `str`, where `str` is either in the `test` or `testWithGroups` predicate.
-     */
-    final predicate matches(string str) {
-      exists(State state | state = getAState(this, str.length() - 1, str, _) |
-        epsilonSucc*(state) = Accept(_)
-      )
-    }
-
-    /**
-     * Holds if matching `str` may fill capture group number `g`.
-     * Only holds if `str` is in the `testWithGroups` predicate.
-     */
-    final predicate fillsCaptureGroup(string str, int g) {
-      exists(State s |
-        s = getAStateThatReachesAccept(this, _, str, _) and
-        g = group(s.getRepr())
-      )
-    }
-  }
-
-  /**
-   * Gets a state the regular expression `reg` can be in after matching the `i`th char in `str`.
-   * The regular expression is modeled as a non-determistic finite automaton,
-   * the regular expression can therefore be in multiple states after matching a character.
-   *
-   * It's a forward search to all possible states, and there is thus no guarantee that the state is on a path to an accepting state.
-   */
-  private State getAState(MatchedRegExp reg, int i, string str, boolean ignorePrefix) {
-    // start state, the -1 position before any chars have been matched
-    i = -1 and
-    (
-      reg.test(str, ignorePrefix)
-      or
-      reg.testWithGroups(str, ignorePrefix)
-    ) and
-    result.getRepr().getRootTerm() = reg and
-    isStartState(result)
+private predicate isBadTagFilterCandidate(
+  RootTerm root, string str, boolean ignorePrefix, boolean testWithGroups
+) {
+  // the regexp must mention "<" and ">" explicitly.
+  forall(string angleBracket | angleBracket = ["<", ">"] |
+    any(RegExpConstant term | term.getValue().matches("%" + angleBracket + "%")).getRootTerm() =
+      root
+  ) and
+  ignorePrefix = true and
+  (
+    str = ["<!-- foo -->", "<!-- foo --!>", "<!- foo ->", "<foo>", "<script>"] and
+    testWithGroups = true
    or
-    // recursive case
-    result = getAStateAfterMatching(reg, _, str, i, _, ignorePrefix)
-  }
-
-  /**
-   * Gets the next state after the `prev` state from `reg`.
-   * `prev` is the state after matching `fromIndex` chars in `str`,
-   * and the result is the state after matching `toIndex` chars in `str`.
-   *
-   * This predicate is used as a step relation in the forwards search (`getAState`),
-   * and also as a step relation in the later backwards search (`getAStateThatReachesAccept`).
-   */
-  private State getAStateAfterMatching(
-    MatchedRegExp reg, State prev, string str, int toIndex, int fromIndex, boolean ignorePrefix
-  ) {
-    // the basic recursive case - outlined into a noopt helper to make performance work out.
-    result = getAStateAfterMatchingAux(reg, prev, str, toIndex, fromIndex, ignorePrefix)
-    or
-    // we can skip past word boundaries if the next char is a non-word char.
-    fromIndex = toIndex and
-    prev.getRepr() instanceof RegExpWordBoundary and
-    prev = getAState(reg, toIndex, str, ignorePrefix) and
-    after(prev.getRepr()) = result and
-    str.charAt(toIndex + 1).regexpMatch("\\W") // \W matches any non-word char.
-  }
-
-  pragma[noopt]
-  private State getAStateAfterMatchingAux(
-    MatchedRegExp reg, State prev, string str, int toIndex, int fromIndex, boolean ignorePrefix
-  ) {
-    prev = getAState(reg, fromIndex, str, ignorePrefix) and
-    fromIndex = toIndex - 1 and
-    exists(string char | char = str.charAt(toIndex) | specializedDeltaClosed(prev, char, result)) and
-    not discardedPrefixStep(prev, result, ignorePrefix)
-  }
-
-  /** Holds if a step from `prev` to `next` should be discarded when the `ignorePrefix` flag is set. */
-  private predicate discardedPrefixStep(State prev, State next, boolean ignorePrefix) {
-    prev = mkMatch(any(RegExpRoot r)) and
-    ignorePrefix = true and
-    next = prev
-  }
-
-  // The `deltaClosed` relation specialized to the chars that exists in strings tested by a `MatchedRegExp`.
-  private predicate specializedDeltaClosed(State prev, string char, State next) {
-    deltaClosed(prev, specializedGetAnInputSymbolMatching(char), next)
-  }
-
-  // The `getAnInputSymbolMatching` relation specialized to the chars that exists in strings tested by a `MatchedRegExp`.
-  pragma[noinline]
-  private InputSymbol specializedGetAnInputSymbolMatching(string char) {
-    exists(string s, MatchedRegExp r |
-      r.test(s, _)
-      or
-      r.testWithGroups(s, _)
-    |
-      char = s.charAt(_)
-    ) and
-    result = getAnInputSymbolMatching(char)
-  }
-
-  /**
-   * Gets the `i`th state on a path to the accepting state when `reg` matches `str`.
-   * Starts with an accepting state as found by `getAState` and searches backwards
-   * to the start state through the reachable states (as found by `getAState`).
-   *
-   * This predicate holds the invariant that the result state can be reached with `i` steps from a start state,
-   * and an accepting state can be found after (`str.length() - 1 - i`) steps from the result.
-   * The result state is therefore always on a valid path where `reg` accepts `str`.
-   *
-   * This predicate is only used to find which capture groups a regular expression has filled,
-   * and thus the search is only performed for the strings in the `testWithGroups(..)` predicate.
-   */
-  private State getAStateThatReachesAccept(
-    MatchedRegExp reg, int i, string str, boolean ignorePrefix
-  ) {
-    // base case, reaches an accepting state from the last state in `getAState(..)`
-    reg.testWithGroups(str, ignorePrefix) and
-    i = str.length() - 1 and
-    result = getAState(reg, i, str, ignorePrefix) and
-    epsilonSucc*(result) = Accept(_)
-    or
-    // recursive case. `next` is the next state to be matched after matching `prev`.
-    // this predicate is doing a backwards search, so `prev` is the result we are looking for.
-    exists(State next, State prev, int fromIndex, int toIndex |
-      next = getAStateThatReachesAccept(reg, toIndex, str, ignorePrefix) and
-      next = getAStateAfterMatching(reg, prev, str, toIndex, fromIndex, ignorePrefix) and
-      i = fromIndex and
-      result = prev
-    )
-  }
-
-  /** Gets the capture group number that `term` belongs to. */
-  private int group(RegExpTerm term) {
-    exists(RegExpGroup grp | grp.getNumber() = result | term.getParent*() = grp)
-  }
-}
-
-/** A class to test whether a regular expression matches certain HTML tags. */
-class HtmlMatchingRegExp extends RegexpMatching::MatchedRegExp {
-  HtmlMatchingRegExp() {
-    // the regexp must mention "<" and ">" explicitly.
-    forall(string angleBracket | angleBracket = ["<", ">"] |
-      any(RegExpConstant term | term.getValue().matches("%" + angleBracket + "%")).getRootTerm() =
-        this
-    )
-  }
-
-  override predicate testWithGroups(string str, boolean ignorePrefix) {
-    ignorePrefix = true and
-    str = ["<!-- foo -->", "<!-- foo --!>", "<!- foo ->", "<foo>", "<script>"]
-  }
-
-  override predicate test(string str, boolean ignorePrefix) {
-    ignorePrefix = true and
    str =
      [
        "<!-- foo -->", "<!- foo ->", "<!-- foo --!>", "<!-- foo\n -->", "<script>foo</script>",
@ -200,12 +29,25 @@ class HtmlMatchingRegExp extends RegexpMatching::MatchedRegExp {
        "<script src='foo'></script>", "<SCRIPT>foo</SCRIPT>", "<script\tsrc=\"foo\"/>",
        "<script\tsrc='foo'></script>", "<sCrIpT>foo</ScRiPt>", "<script src=\"foo\">foo</script >",
        "<script src=\"foo\">foo</script foo=\"bar\">", "<script src=\"foo\">foo</script\t\n bar>"
-      ]
-  }
+      ] and
+    testWithGroups = false
+  )
 }

-/** DEPRECATED: Alias for HtmlMatchingRegExp */
-deprecated class HTMLMatchingRegExp = HtmlMatchingRegExp;
+/**
+ * A regexp that matches some string from the `isBadTagFilterCandidate` predicate.
+ */
+class HtmlMatchingRegExp extends RootTerm {
+  HtmlMatchingRegExp() { RegexpMatching<isBadTagFilterCandidate/4>::matches(this, _) }
+
+  /** Holds if this regexp matched `str`, where `str` is one of the string from `isBadTagFilterCandidate`. */
+  predicate matches(string str) { RegexpMatching<isBadTagFilterCandidate/4>::matches(this, str) }
+
+  /** Holds if this regexp fills capture group `g' when matching `str', where `str` is one of the string from `isBadTagFilterCandidate`. */
+  predicate fillsCaptureGroup(string str, int g) {
+    RegexpMatching<isBadTagFilterCandidate/4>::fillsCaptureGroup(this, str, g)
+  }
+}

 /**
 * Holds if `regexp` matches some HTML tags, but misses some HTML tags that it should match.
--- a/python/ql/lib/semmle/python/security/performance/RegexpMatching.qll
+++ b/python/ql/lib/semmle/python/security/performance/RegexpMatching.qll
@ -0,0 +1,175 @@
+/**
+ * Provides precicates for reasoning about which strings are matched by a regular expression,
+ * and for testing which capture groups are filled when a particular regexp matches a string.
+ */
+
+import ReDoSUtil
+
+/** A root term */
+class RootTerm extends RegExpTerm {
+  RootTerm() { this.isRootTerm() }
+}
+
+/**
+ * Holds if it should be tested whether `reg` matches `str`.
+ *
+ * If `ignorePrefix` is true, then a regexp without a start anchor will be treated as if it had a start anchor.
+ * E.g. a regular expression `/foo$/` will match any string that ends with "foo",
+ * but if `ignorePrefix` is true, it will only match "foo".
+ *
+ * If `testWithGroups` is true, then the `RegexpMatching::fillsCaptureGroup` predicate can be used to determine which capture
+ * groups are filled by a string.
+ */
+signature predicate isRegexpMatchingCandidateSig(
+  RootTerm root, string str, boolean ignorePrefix, boolean testWithGroups
+);
+
+/**
+ * A module for determining if a regexp matches a given string,
+ * and reasoning about which capture groups are filled by a given string.
+ *
+ * The module parameter `isRegexpMatchingCandidateSig` determines which string should be tested,
+ * and the results can be read from the `matches` and `fillsCaptureGroup` predicates.
+ */
+module RegexpMatching<isRegexpMatchingCandidateSig/4 isCandidate> {
+  private predicate test(RootTerm reg, string str, boolean ignorePrefix) {
+    isCandidate(reg, str, ignorePrefix, false)
+  }
+
+  private predicate testWithGroups(RootTerm reg, string str, boolean ignorePrefix) {
+    isCandidate(reg, str, ignorePrefix, true)
+  }
+
+  /**
+   * Gets a state the regular expression `reg` can be in after matching the `i`th char in `str`.
+   * The regular expression is modeled as a non-determistic finite automaton,
+   * the regular expression can therefore be in multiple states after matching a character.
+   *
+   * It's a forward search to all possible states, and there is thus no guarantee that the state is on a path to an accepting state.
+   */
+  private State getAState(RootTerm reg, int i, string str, boolean ignorePrefix) {
+    // start state, the -1 position before any chars have been matched
+    i = -1 and
+    (
+      test(reg, str, ignorePrefix)
+      or
+      testWithGroups(reg, str, ignorePrefix)
+    ) and
+    result.getRepr().getRootTerm() = reg and
+    isStartState(result)
+    or
+    // recursive case
+    result = getAStateAfterMatching(reg, _, str, i, _, ignorePrefix)
+  }
+
+  /**
+   * Gets the next state after the `prev` state from `reg`.
+   * `prev` is the state after matching `fromIndex` chars in `str`,
+   * and the result is the state after matching `toIndex` chars in `str`.
+   *
+   * This predicate is used as a step relation in the forwards search (`getAState`),
+   * and also as a step relation in the later backwards search (`getAStateThatReachesAccept`).
+   */
+  private State getAStateAfterMatching(
+    RootTerm reg, State prev, string str, int toIndex, int fromIndex, boolean ignorePrefix
+  ) {
+    // the basic recursive case - outlined into a noopt helper to make performance work out.
+    result = getAStateAfterMatchingAux(reg, prev, str, toIndex, fromIndex, ignorePrefix)
+    or
+    // we can skip past word boundaries if the next char is a non-word char.
+    fromIndex = toIndex and
+    prev.getRepr() instanceof RegExpWordBoundary and
+    prev = getAState(reg, toIndex, str, ignorePrefix) and
+    after(prev.getRepr()) = result and
+    str.charAt(toIndex + 1).regexpMatch("\\W") // \W matches any non-word char.
+  }
+
+  pragma[noopt]
+  private State getAStateAfterMatchingAux(
+    RootTerm reg, State prev, string str, int toIndex, int fromIndex, boolean ignorePrefix
+  ) {
+    prev = getAState(reg, fromIndex, str, ignorePrefix) and
+    fromIndex = toIndex - 1 and
+    exists(string char | char = str.charAt(toIndex) | specializedDeltaClosed(prev, char, result)) and
+    not discardedPrefixStep(prev, result, ignorePrefix)
+  }
+
+  /** Holds if a step from `prev` to `next` should be discarded when the `ignorePrefix` flag is set. */
+  private predicate discardedPrefixStep(State prev, State next, boolean ignorePrefix) {
+    prev = mkMatch(any(RegExpRoot r)) and
+    ignorePrefix = true and
+    next = prev
+  }
+
+  // The `deltaClosed` relation specialized to the chars that exists in strings tested by a `MatchedRegExp`.
+  private predicate specializedDeltaClosed(State prev, string char, State next) {
+    deltaClosed(prev, specializedGetAnInputSymbolMatching(char), next)
+  }
+
+  // The `getAnInputSymbolMatching` relation specialized to the chars that exists in strings tested by a `MatchedRegExp`.
+  pragma[noinline]
+  private InputSymbol specializedGetAnInputSymbolMatching(string char) {
+    exists(string s, RootTerm r |
+      test(r, s, _)
+      or
+      testWithGroups(r, s, _)
+    |
+      char = s.charAt(_)
+    ) and
+    result = getAnInputSymbolMatching(char)
+  }
+
+  /**
+   * Gets the `i`th state on a path to the accepting state when `reg` matches `str`.
+   * Starts with an accepting state as found by `getAState` and searches backwards
+   * to the start state through the reachable states (as found by `getAState`).
+   *
+   * This predicate holds the invariant that the result state can be reached with `i` steps from a start state,
+   * and an accepting state can be found after (`str.length() - 1 - i`) steps from the result.
+   * The result state is therefore always on a valid path where `reg` accepts `str`.
+   *
+   * This predicate is only used to find which capture groups a regular expression has filled,
+   * and thus the search is only performed for the strings in the `testWithGroups(..)` predicate.
+   */
+  private State getAStateThatReachesAccept(RootTerm reg, int i, string str, boolean ignorePrefix) {
+    // base case, reaches an accepting state from the last state in `getAState(..)`
+    testWithGroups(reg, str, ignorePrefix) and
+    i = str.length() - 1 and
+    result = getAState(reg, i, str, ignorePrefix) and
+    epsilonSucc*(result) = Accept(_)
+    or
+    // recursive case. `next` is the next state to be matched after matching `prev`.
+    // this predicate is doing a backwards search, so `prev` is the result we are looking for.
+    exists(State next, State prev, int fromIndex, int toIndex |
+      next = getAStateThatReachesAccept(reg, toIndex, str, ignorePrefix) and
+      next = getAStateAfterMatching(reg, prev, str, toIndex, fromIndex, ignorePrefix) and
+      i = fromIndex and
+      result = prev
+    )
+  }
+
+  /** Gets the capture group number that `term` belongs to. */
+  private int group(RegExpTerm term) {
+    exists(RegExpGroup grp | grp.getNumber() = result | term.getParent*() = grp)
+  }
+
+  /**
+   * Holds if `reg` matches `str`, where `str` is either in the `test` or `testWithGroups` predicate.
+   */
+  predicate matches(RootTerm reg, string str) {
+    exists(State state | state = getAState(reg, str.length() - 1, str, _) |
+      epsilonSucc*(state) = Accept(_)
+    )
+  }
+
+  /**
+   * Holds if matching `str` against `reg` may fill capture group number `g`.
+   * Only holds if `str` is in the `testWithGroups` predicate.
+   */
+  predicate fillsCaptureGroup(RootTerm reg, string str, int g) {
+    exists(State s |
+      s = getAStateThatReachesAccept(reg, _, str, _) and
+      g = group(s.getRepr())
+    )
+  }
+}
--- a/ruby/ql/lib/codeql/ruby/security/BadTagFilterQuery.qll
+++ b/ruby/ql/lib/codeql/ruby/security/BadTagFilterQuery.qll
@ -2,196 +2,25 @@
 * Provides precicates for reasoning about bad tag filter vulnerabilities.
 */

-import performance.ReDoSUtil
+import performance.RegexpMatching

 /**
- * A module for determining if a regexp matches a given string,
- * and reasoning about which capture groups are filled by a given string.
+ * Holds if the regexp `root` should be tested against `str`.
+ * Implements the `isRegexpMatchingCandidateSig` signature from `RegexpMatching`.
 */
-private module RegexpMatching {
-  /**
-   * A class to test whether a regular expression matches a string.
-   * Override this class and extend `test`/`testWithGroups` to configure which strings should be tested for acceptance by this regular expression.
-   * The result can afterwards be read from the `matches` predicate.
-   *
-   * Strings in the `testWithGroups` predicate are also tested for which capture groups are filled by the given string.
-   * The result is available in the `fillCaptureGroup` predicate.
-   */
-  abstract class MatchedRegExp extends RegExpTerm {
-    MatchedRegExp() { this.isRootTerm() }
-
-    /**
-     * Holds if it should be tested whether this regular expression matches `str`.
-     *
-     * If `ignorePrefix` is true, then a regexp without a start anchor will be treated as if it had a start anchor.
-     * E.g. a regular expression `/foo$/` will match any string that ends with "foo",
-     * but if `ignorePrefix` is true, it will only match "foo".
-     */
-    predicate test(string str, boolean ignorePrefix) {
-      none() // maybe overridden in subclasses
-    }
-
-    /**
-     * Same as `test(..)`, but where the `fillsCaptureGroup` afterwards tells which capture groups were filled by the given string.
-     */
-    predicate testWithGroups(string str, boolean ignorePrefix) {
-      none() // maybe overridden in subclasses
-    }
-
-    /**
-     * Holds if this RegExp matches `str`, where `str` is either in the `test` or `testWithGroups` predicate.
-     */
-    final predicate matches(string str) {
-      exists(State state | state = getAState(this, str.length() - 1, str, _) |
-        epsilonSucc*(state) = Accept(_)
-      )
-    }
-
-    /**
-     * Holds if matching `str` may fill capture group number `g`.
-     * Only holds if `str` is in the `testWithGroups` predicate.
-     */
-    final predicate fillsCaptureGroup(string str, int g) {
-      exists(State s |
-        s = getAStateThatReachesAccept(this, _, str, _) and
-        g = group(s.getRepr())
-      )
-    }
-  }
-
-  /**
-   * Gets a state the regular expression `reg` can be in after matching the `i`th char in `str`.
-   * The regular expression is modeled as a non-determistic finite automaton,
-   * the regular expression can therefore be in multiple states after matching a character.
-   *
-   * It's a forward search to all possible states, and there is thus no guarantee that the state is on a path to an accepting state.
-   */
-  private State getAState(MatchedRegExp reg, int i, string str, boolean ignorePrefix) {
-    // start state, the -1 position before any chars have been matched
-    i = -1 and
-    (
-      reg.test(str, ignorePrefix)
-      or
-      reg.testWithGroups(str, ignorePrefix)
-    ) and
-    result.getRepr().getRootTerm() = reg and
-    isStartState(result)
+private predicate isBadTagFilterCandidate(
+  RootTerm root, string str, boolean ignorePrefix, boolean testWithGroups
+) {
+  // the regexp must mention "<" and ">" explicitly.
+  forall(string angleBracket | angleBracket = ["<", ">"] |
+    any(RegExpConstant term | term.getValue().matches("%" + angleBracket + "%")).getRootTerm() =
+      root
+  ) and
+  ignorePrefix = true and
+  (
+    str = ["<!-- foo -->", "<!-- foo --!>", "<!- foo ->", "<foo>", "<script>"] and
+    testWithGroups = true
    or
-    // recursive case
-    result = getAStateAfterMatching(reg, _, str, i, _, ignorePrefix)
-  }
-
-  /**
-   * Gets the next state after the `prev` state from `reg`.
-   * `prev` is the state after matching `fromIndex` chars in `str`,
-   * and the result is the state after matching `toIndex` chars in `str`.
-   *
-   * This predicate is used as a step relation in the forwards search (`getAState`),
-   * and also as a step relation in the later backwards search (`getAStateThatReachesAccept`).
-   */
-  private State getAStateAfterMatching(
-    MatchedRegExp reg, State prev, string str, int toIndex, int fromIndex, boolean ignorePrefix
-  ) {
-    // the basic recursive case - outlined into a noopt helper to make performance work out.
-    result = getAStateAfterMatchingAux(reg, prev, str, toIndex, fromIndex, ignorePrefix)
-    or
-    // we can skip past word boundaries if the next char is a non-word char.
-    fromIndex = toIndex and
-    prev.getRepr() instanceof RegExpWordBoundary and
-    prev = getAState(reg, toIndex, str, ignorePrefix) and
-    after(prev.getRepr()) = result and
-    str.charAt(toIndex + 1).regexpMatch("\\W") // \W matches any non-word char.
-  }
-
-  pragma[noopt]
-  private State getAStateAfterMatchingAux(
-    MatchedRegExp reg, State prev, string str, int toIndex, int fromIndex, boolean ignorePrefix
-  ) {
-    prev = getAState(reg, fromIndex, str, ignorePrefix) and
-    fromIndex = toIndex - 1 and
-    exists(string char | char = str.charAt(toIndex) | specializedDeltaClosed(prev, char, result)) and
-    not discardedPrefixStep(prev, result, ignorePrefix)
-  }
-
-  /** Holds if a step from `prev` to `next` should be discarded when the `ignorePrefix` flag is set. */
-  private predicate discardedPrefixStep(State prev, State next, boolean ignorePrefix) {
-    prev = mkMatch(any(RegExpRoot r)) and
-    ignorePrefix = true and
-    next = prev
-  }
-
-  // The `deltaClosed` relation specialized to the chars that exists in strings tested by a `MatchedRegExp`.
-  private predicate specializedDeltaClosed(State prev, string char, State next) {
-    deltaClosed(prev, specializedGetAnInputSymbolMatching(char), next)
-  }
-
-  // The `getAnInputSymbolMatching` relation specialized to the chars that exists in strings tested by a `MatchedRegExp`.
-  pragma[noinline]
-  private InputSymbol specializedGetAnInputSymbolMatching(string char) {
-    exists(string s, MatchedRegExp r |
-      r.test(s, _)
-      or
-      r.testWithGroups(s, _)
-    |
-      char = s.charAt(_)
-    ) and
-    result = getAnInputSymbolMatching(char)
-  }
-
-  /**
-   * Gets the `i`th state on a path to the accepting state when `reg` matches `str`.
-   * Starts with an accepting state as found by `getAState` and searches backwards
-   * to the start state through the reachable states (as found by `getAState`).
-   *
-   * This predicate holds the invariant that the result state can be reached with `i` steps from a start state,
-   * and an accepting state can be found after (`str.length() - 1 - i`) steps from the result.
-   * The result state is therefore always on a valid path where `reg` accepts `str`.
-   *
-   * This predicate is only used to find which capture groups a regular expression has filled,
-   * and thus the search is only performed for the strings in the `testWithGroups(..)` predicate.
-   */
-  private State getAStateThatReachesAccept(
-    MatchedRegExp reg, int i, string str, boolean ignorePrefix
-  ) {
-    // base case, reaches an accepting state from the last state in `getAState(..)`
-    reg.testWithGroups(str, ignorePrefix) and
-    i = str.length() - 1 and
-    result = getAState(reg, i, str, ignorePrefix) and
-    epsilonSucc*(result) = Accept(_)
-    or
-    // recursive case. `next` is the next state to be matched after matching `prev`.
-    // this predicate is doing a backwards search, so `prev` is the result we are looking for.
-    exists(State next, State prev, int fromIndex, int toIndex |
-      next = getAStateThatReachesAccept(reg, toIndex, str, ignorePrefix) and
-      next = getAStateAfterMatching(reg, prev, str, toIndex, fromIndex, ignorePrefix) and
-      i = fromIndex and
-      result = prev
-    )
-  }
-
-  /** Gets the capture group number that `term` belongs to. */
-  private int group(RegExpTerm term) {
-    exists(RegExpGroup grp | grp.getNumber() = result | term.getParent*() = grp)
-  }
-}
-
-/** A class to test whether a regular expression matches certain HTML tags. */
-class HtmlMatchingRegExp extends RegexpMatching::MatchedRegExp {
-  HtmlMatchingRegExp() {
-    // the regexp must mention "<" and ">" explicitly.
-    forall(string angleBracket | angleBracket = ["<", ">"] |
-      any(RegExpConstant term | term.getValue().matches("%" + angleBracket + "%")).getRootTerm() =
-        this
-    )
-  }
-
-  override predicate testWithGroups(string str, boolean ignorePrefix) {
-    ignorePrefix = true and
-    str = ["<!-- foo -->", "<!-- foo --!>", "<!- foo ->", "<foo>", "<script>"]
-  }
-
-  override predicate test(string str, boolean ignorePrefix) {
-    ignorePrefix = true and
    str =
      [
        "<!-- foo -->", "<!- foo ->", "<!-- foo --!>", "<!-- foo\n -->", "<script>foo</script>",
@ -200,12 +29,25 @@ class HtmlMatchingRegExp extends RegexpMatching::MatchedRegExp {
        "<script src='foo'></script>", "<SCRIPT>foo</SCRIPT>", "<script\tsrc=\"foo\"/>",
        "<script\tsrc='foo'></script>", "<sCrIpT>foo</ScRiPt>", "<script src=\"foo\">foo</script >",
        "<script src=\"foo\">foo</script foo=\"bar\">", "<script src=\"foo\">foo</script\t\n bar>"
-      ]
-  }
+      ] and
+    testWithGroups = false
+  )
 }

-/** DEPRECATED: Alias for HtmlMatchingRegExp */
-deprecated class HTMLMatchingRegExp = HtmlMatchingRegExp;
+/**
+ * A regexp that matches some string from the `isBadTagFilterCandidate` predicate.
+ */
+class HtmlMatchingRegExp extends RootTerm {
+  HtmlMatchingRegExp() { RegexpMatching<isBadTagFilterCandidate/4>::matches(this, _) }
+
+  /** Holds if this regexp matched `str`, where `str` is one of the string from `isBadTagFilterCandidate`. */
+  predicate matches(string str) { RegexpMatching<isBadTagFilterCandidate/4>::matches(this, str) }
+
+  /** Holds if this regexp fills capture group `g' when matching `str', where `str` is one of the string from `isBadTagFilterCandidate`. */
+  predicate fillsCaptureGroup(string str, int g) {
+    RegexpMatching<isBadTagFilterCandidate/4>::fillsCaptureGroup(this, str, g)
+  }
+}

 /**
 * Holds if `regexp` matches some HTML tags, but misses some HTML tags that it should match.
--- a/ruby/ql/lib/codeql/ruby/security/performance/RegexpMatching.qll
+++ b/ruby/ql/lib/codeql/ruby/security/performance/RegexpMatching.qll
@ -0,0 +1,175 @@
+/**
+ * Provides precicates for reasoning about which strings are matched by a regular expression,
+ * and for testing which capture groups are filled when a particular regexp matches a string.
+ */
+
+import ReDoSUtil
+
+/** A root term */
+class RootTerm extends RegExpTerm {
+  RootTerm() { this.isRootTerm() }
+}
+
+/**
+ * Holds if it should be tested whether `reg` matches `str`.
+ *
+ * If `ignorePrefix` is true, then a regexp without a start anchor will be treated as if it had a start anchor.
+ * E.g. a regular expression `/foo$/` will match any string that ends with "foo",
+ * but if `ignorePrefix` is true, it will only match "foo".
+ *
+ * If `testWithGroups` is true, then the `RegexpMatching::fillsCaptureGroup` predicate can be used to determine which capture
+ * groups are filled by a string.
+ */
+signature predicate isRegexpMatchingCandidateSig(
+  RootTerm root, string str, boolean ignorePrefix, boolean testWithGroups
+);
+
+/**
+ * A module for determining if a regexp matches a given string,
+ * and reasoning about which capture groups are filled by a given string.
+ *
+ * The module parameter `isRegexpMatchingCandidateSig` determines which string should be tested,
+ * and the results can be read from the `matches` and `fillsCaptureGroup` predicates.
+ */
+module RegexpMatching<isRegexpMatchingCandidateSig/4 isCandidate> {
+  private predicate test(RootTerm reg, string str, boolean ignorePrefix) {
+    isCandidate(reg, str, ignorePrefix, false)
+  }
+
+  private predicate testWithGroups(RootTerm reg, string str, boolean ignorePrefix) {
+    isCandidate(reg, str, ignorePrefix, true)
+  }
+
+  /**
+   * Gets a state the regular expression `reg` can be in after matching the `i`th char in `str`.
+   * The regular expression is modeled as a non-determistic finite automaton,
+   * the regular expression can therefore be in multiple states after matching a character.
+   *
+   * It's a forward search to all possible states, and there is thus no guarantee that the state is on a path to an accepting state.
+   */
+  private State getAState(RootTerm reg, int i, string str, boolean ignorePrefix) {
+    // start state, the -1 position before any chars have been matched
+    i = -1 and
+    (
+      test(reg, str, ignorePrefix)
+      or
+      testWithGroups(reg, str, ignorePrefix)
+    ) and
+    result.getRepr().getRootTerm() = reg and
+    isStartState(result)
+    or
+    // recursive case
+    result = getAStateAfterMatching(reg, _, str, i, _, ignorePrefix)
+  }
+
+  /**
+   * Gets the next state after the `prev` state from `reg`.
+   * `prev` is the state after matching `fromIndex` chars in `str`,
+   * and the result is the state after matching `toIndex` chars in `str`.
+   *
+   * This predicate is used as a step relation in the forwards search (`getAState`),
+   * and also as a step relation in the later backwards search (`getAStateThatReachesAccept`).
+   */
+  private State getAStateAfterMatching(
+    RootTerm reg, State prev, string str, int toIndex, int fromIndex, boolean ignorePrefix
+  ) {
+    // the basic recursive case - outlined into a noopt helper to make performance work out.
+    result = getAStateAfterMatchingAux(reg, prev, str, toIndex, fromIndex, ignorePrefix)
+    or
+    // we can skip past word boundaries if the next char is a non-word char.
+    fromIndex = toIndex and
+    prev.getRepr() instanceof RegExpWordBoundary and
+    prev = getAState(reg, toIndex, str, ignorePrefix) and
+    after(prev.getRepr()) = result and
+    str.charAt(toIndex + 1).regexpMatch("\\W") // \W matches any non-word char.
+  }
+
+  pragma[noopt]
+  private State getAStateAfterMatchingAux(
+    RootTerm reg, State prev, string str, int toIndex, int fromIndex, boolean ignorePrefix
+  ) {
+    prev = getAState(reg, fromIndex, str, ignorePrefix) and
+    fromIndex = toIndex - 1 and
+    exists(string char | char = str.charAt(toIndex) | specializedDeltaClosed(prev, char, result)) and
+    not discardedPrefixStep(prev, result, ignorePrefix)
+  }
+
+  /** Holds if a step from `prev` to `next` should be discarded when the `ignorePrefix` flag is set. */
+  private predicate discardedPrefixStep(State prev, State next, boolean ignorePrefix) {
+    prev = mkMatch(any(RegExpRoot r)) and
+    ignorePrefix = true and
+    next = prev
+  }
+
+  // The `deltaClosed` relation specialized to the chars that exists in strings tested by a `MatchedRegExp`.
+  private predicate specializedDeltaClosed(State prev, string char, State next) {
+    deltaClosed(prev, specializedGetAnInputSymbolMatching(char), next)
+  }
+
+  // The `getAnInputSymbolMatching` relation specialized to the chars that exists in strings tested by a `MatchedRegExp`.
+  pragma[noinline]
+  private InputSymbol specializedGetAnInputSymbolMatching(string char) {
+    exists(string s, RootTerm r |
+      test(r, s, _)
+      or
+      testWithGroups(r, s, _)
+    |
+      char = s.charAt(_)
+    ) and
+    result = getAnInputSymbolMatching(char)
+  }
+
+  /**
+   * Gets the `i`th state on a path to the accepting state when `reg` matches `str`.
+   * Starts with an accepting state as found by `getAState` and searches backwards
+   * to the start state through the reachable states (as found by `getAState`).
+   *
+   * This predicate holds the invariant that the result state can be reached with `i` steps from a start state,
+   * and an accepting state can be found after (`str.length() - 1 - i`) steps from the result.
+   * The result state is therefore always on a valid path where `reg` accepts `str`.
+   *
+   * This predicate is only used to find which capture groups a regular expression has filled,
+   * and thus the search is only performed for the strings in the `testWithGroups(..)` predicate.
+   */
+  private State getAStateThatReachesAccept(RootTerm reg, int i, string str, boolean ignorePrefix) {
+    // base case, reaches an accepting state from the last state in `getAState(..)`
+    testWithGroups(reg, str, ignorePrefix) and
+    i = str.length() - 1 and
+    result = getAState(reg, i, str, ignorePrefix) and
+    epsilonSucc*(result) = Accept(_)
+    or
+    // recursive case. `next` is the next state to be matched after matching `prev`.
+    // this predicate is doing a backwards search, so `prev` is the result we are looking for.
+    exists(State next, State prev, int fromIndex, int toIndex |
+      next = getAStateThatReachesAccept(reg, toIndex, str, ignorePrefix) and
+      next = getAStateAfterMatching(reg, prev, str, toIndex, fromIndex, ignorePrefix) and
+      i = fromIndex and
+      result = prev
+    )
+  }
+
+  /** Gets the capture group number that `term` belongs to. */
+  private int group(RegExpTerm term) {
+    exists(RegExpGroup grp | grp.getNumber() = result | term.getParent*() = grp)
+  }
+
+  /**
+   * Holds if `reg` matches `str`, where `str` is either in the `test` or `testWithGroups` predicate.
+   */
+  predicate matches(RootTerm reg, string str) {
+    exists(State state | state = getAState(reg, str.length() - 1, str, _) |
+      epsilonSucc*(state) = Accept(_)
+    )
+  }
+
+  /**
+   * Holds if matching `str` against `reg` may fill capture group number `g`.
+   * Only holds if `str` is in the `testWithGroups` predicate.
+   */
+  predicate fillsCaptureGroup(RootTerm reg, string str, int g) {
+    exists(State s |
+      s = getAStateThatReachesAccept(reg, _, str, _) and
+      g = group(s.getRepr())
+    )
+  }
+}