Merge pull request #6561 from erik-krogh/htmlReg

JS/Py/Ruby: add a bad-tag-filter query
2021-11-18 09:39:13 +01:00 · 2021-11-18 09:39:13 +01:00 · 1cca377e7d
--- a/config/identical-files.json
+++ b/config/identical-files.json
@ -471,7 +471,12 @@
  "ReDoS Polynomial Python/JS": [
    "javascript/ql/lib/semmle/javascript/security/performance/SuperlinearBackTracking.qll",
    "python/ql/lib/semmle/python/security/performance/SuperlinearBackTracking.qll",
-    "ruby/ql/lib/codeql/ruby/regexp/SuperlinearBackTracking.qll"
+    "ruby/ql/lib/codeql/ruby/security/performance/SuperlinearBackTracking.qll"
+  ],
+  "BadTagFilterQuery Python/JS/Ruby": [
+    "javascript/ql/lib/semmle/javascript/security/BadTagFilterQuery.qll",
+    "python/ql/lib/semmle/python/security/BadTagFilterQuery.qll",
+    "ruby/ql/lib/codeql/ruby/security/BadTagFilterQuery.qll"
  ],
  "CFG": [
    "csharp/ql/lib/semmle/code/csharp/controlflow/internal/ControlFlowGraphImplShared.qll",
--- a/javascript/change-notes/2021-08-26-bad-tag-filter.md
+++ b/javascript/change-notes/2021-08-26-bad-tag-filter.md
@ -0,0 +1,4 @@
+lgtm,codescanning
+* A new query, `js/bad-tag-filter`, has been added to the query suite,
+  highlighting regular expressions that only match a subset of the HTML tags
+  they are supposed to match.
--- a/javascript/ql/lib/semmle/javascript/AST.qll
+++ b/javascript/ql/lib/semmle/javascript/AST.qll
@ -207,7 +207,9 @@ private predicate isAmbientTopLevel(TopLevel tl) {
 */
 class TopLevel extends @toplevel, StmtContainer {
  /** Holds if this toplevel is minified. */
+  cached
  predicate isMinified() {
+    Stages::Ast::ref() and
    // file name contains 'min' (not as part of a longer word)
    getFile().getBaseName().regexpMatch(".*[^-._]*[-._]min([-._].*)?\\.\\w+")
    or
--- a/javascript/ql/lib/semmle/javascript/Regexp.qll
+++ b/javascript/ql/lib/semmle/javascript/Regexp.qll
@ -7,6 +7,7 @@

 import javascript
 private import semmle.javascript.dataflow.InferredTypes
+private import semmle.javascript.internal.CachedStages

 /**
 * An element containing a regular expression term, that is, either
@ -955,7 +956,9 @@ private predicate isUsedAsNonMatchObject(DataFlow::MethodCallNode call) {
 /**
 * Holds if `source` may be interpreted as a regular expression.
 */
+cached
 predicate isInterpretedAsRegExp(DataFlow::Node source) {
+  Stages::Taint::ref() and
  source.analyze().getAType() = TTString() and
  (
    // The first argument to an invocation of `RegExp` (with or without `new`).
--- a/javascript/ql/lib/semmle/javascript/internal/CachedStages.qll
+++ b/javascript/ql/lib/semmle/javascript/internal/CachedStages.qll
@ -260,6 +260,8 @@ module Stages {
      exists(RemoteFlowSource r)
      or
      exists(Exports::getALibraryInputParameter())
+      or
+      any(RegExpTerm t).isUsedAsRegExp()
    }
  }
 }
--- a/javascript/ql/lib/semmle/javascript/security/BadTagFilterQuery.qll
+++ b/javascript/ql/lib/semmle/javascript/security/BadTagFilterQuery.qll
@ -0,0 +1,306 @@
+/**
+ * Provides predicates for reasoning about bad tag filter vulnerabilities.
+ */
+
+import performance.ReDoSUtil
+
+/**
+ * A module for determining if a regexp matches a given string,
+ * and reasoning about which capture groups are filled by a given string.
+ */
+private module RegexpMatching {
+  /**
+   * A class to test whether a regular expression matches a string.
+   * Extend this class and override `test`/`testWithGroups` to configure which strings should be tested for acceptance by this regular expression.
+   * The result can afterwards be read from the `matches` predicate.
+   *
+   * Strings in the `testWithGroups` predicate are also tested for which capture groups are filled by the given string.
+   * The result is available in the `fillsCaptureGroup` predicate.
+   */
+  abstract class MatchedRegExp extends RegExpTerm {
+    MatchedRegExp() { this.isRootTerm() }
+
+    /**
+     * Holds if it should be tested whether this regular expression matches `str`.
+     *
+     * If `ignorePrefix` is true, then a regexp without a start anchor will be treated as if it had a start anchor.
+     * E.g. a regular expression `/foo$/` will match any string that ends with "foo",
+     * but if `ignorePrefix` is true, it will only match "foo".
+     */
+    predicate test(string str, boolean ignorePrefix) {
+      none() // maybe overridden in subclasses
+    }
+
+    /**
+     * Same as `test(..)`, but where the `fillsCaptureGroup` afterwards tells which capture groups were filled by the given string.
+     */
+    predicate testWithGroups(string str, boolean ignorePrefix) {
+      none() // maybe overridden in subclasses
+    }
+
+    /**
+     * Holds if this RegExp matches `str`, where `str` is either in the `test` or `testWithGroups` predicate.
+     */
+    final predicate matches(string str) {
+      exists(State state | state = getAState(this, str.length() - 1, str, _) |
+        epsilonSucc*(state) = Accept(_)
+      )
+    }
+
+    /**
+     * Holds if matching `str` may fill capture group number `g`.
+     * Only holds if `str` is in the `testWithGroups` predicate.
+     */
+    final predicate fillsCaptureGroup(string str, int g) {
+      exists(State s |
+        s = getAStateThatReachesAccept(this, _, str, _) and
+        g = group(s.getRepr())
+      )
+    }
+  }
+
+  /**
+   * Gets a state the regular expression `reg` can be in after matching the `i`th char in `str`.
+   * The regular expression is modelled as a non-deterministic finite automaton,
+   * the regular expression can therefore be in multiple states after matching a character.
+   *
+   * It's a forward search to all possible states, and there is thus no guarantee that the state is on a path to an accepting state.
+   */
+  private State getAState(MatchedRegExp reg, int i, string str, boolean ignorePrefix) {
+    // start state, the -1 position before any chars have been matched
+    i = -1 and
+    (
+      reg.test(str, ignorePrefix)
+      or
+      reg.testWithGroups(str, ignorePrefix)
+    ) and
+    result.getRepr().getRootTerm() = reg and
+    isStartState(result)
+    or
+    // recursive case
+    result = getAStateAfterMatching(reg, _, str, i, _, ignorePrefix)
+  }
+
+  /**
+   * Gets the next state after the `prev` state from `reg`.
+   * `prev` is the state after matching `fromIndex` chars in `str`,
+   * and the result is the state after matching `toIndex` chars in `str`.
+   *
+   * This predicate is used as a step relation in the forwards search (`getAState`),
+   * and also as a step relation in the later backwards search (`getAStateThatReachesAccept`).
+   */
+  private State getAStateAfterMatching(
+    MatchedRegExp reg, State prev, string str, int toIndex, int fromIndex, boolean ignorePrefix
+  ) {
+    // the basic recursive case - outlined into a noopt helper to make performance work out.
+    result = getAStateAfterMatchingAux(reg, prev, str, toIndex, fromIndex, ignorePrefix)
+    or
+    // we can skip past word boundaries if the next char is a non-word char.
+    fromIndex = toIndex and
+    prev.getRepr() instanceof RegExpWordBoundary and
+    prev = getAState(reg, toIndex, str, ignorePrefix) and
+    after(prev.getRepr()) = result and
+    str.charAt(toIndex + 1).regexpMatch("\\W") // \W matches any non-word char.
+  }
+
+  pragma[noopt]
+  private State getAStateAfterMatchingAux(
+    MatchedRegExp reg, State prev, string str, int toIndex, int fromIndex, boolean ignorePrefix
+  ) {
+    prev = getAState(reg, fromIndex, str, ignorePrefix) and
+    fromIndex = toIndex - 1 and
+    exists(string char | char = str.charAt(toIndex) | specializedDeltaClosed(prev, char, result)) and
+    not discardedPrefixStep(prev, result, ignorePrefix)
+  }
+
+  /** Holds if a step from `prev` to `next` should be discarded when the `ignorePrefix` flag is set. */
+  private predicate discardedPrefixStep(State prev, State next, boolean ignorePrefix) {
+    prev = mkMatch(any(RegExpRoot r)) and
+    ignorePrefix = true and
+    next = prev
+  }
+
+  // The `deltaClosed` relation specialized to the chars that exist in strings tested by a `MatchedRegExp`.
+  private predicate specializedDeltaClosed(State prev, string char, State next) {
+    deltaClosed(prev, specializedGetAnInputSymbolMatching(char), next)
+  }
+
+  // The `getAnInputSymbolMatching` relation specialized to the chars that exist in strings tested by a `MatchedRegExp`.
+  pragma[noinline]
+  private InputSymbol specializedGetAnInputSymbolMatching(string char) {
+    exists(string s, MatchedRegExp r |
+      r.test(s, _)
+      or
+      r.testWithGroups(s, _)
+    |
+      char = s.charAt(_)
+    ) and
+    result = getAnInputSymbolMatching(char)
+  }
+
+  /**
+   * Gets the `i`th state on a path to the accepting state when `reg` matches `str`.
+   * Starts with an accepting state as found by `getAState` and searches backwards
+   * to the start state through the reachable states (as found by `getAState`).
+   *
+   * This predicate holds the invariant that the result state can be reached with `i` steps from a start state,
+   * and an accepting state can be found after (`str.length() - 1 - i`) steps from the result.
+   * The result state is therefore always on a valid path where `reg` accepts `str`.
+   *
+   * This predicate is only used to find which capture groups a regular expression has filled,
+   * and thus the search is only performed for the strings in the `testWithGroups(..)` predicate.
+   */
+  private State getAStateThatReachesAccept(
+    MatchedRegExp reg, int i, string str, boolean ignorePrefix
+  ) {
+    // base case, reaches an accepting state from the last state in `getAState(..)`
+    reg.testWithGroups(str, ignorePrefix) and
+    i = str.length() - 1 and
+    result = getAState(reg, i, str, ignorePrefix) and
+    epsilonSucc*(result) = Accept(_)
+    or
+    // recursive case. `next` is the next state to be matched after matching `prev`.
+    // this predicate is doing a backwards search, so `prev` is the result we are looking for.
+    exists(State next, State prev, int fromIndex, int toIndex |
+      next = getAStateThatReachesAccept(reg, toIndex, str, ignorePrefix) and
+      next = getAStateAfterMatching(reg, prev, str, toIndex, fromIndex, ignorePrefix) and
+      i = fromIndex and
+      result = prev
+    )
+  }
+
+  /** Gets the capture group number that `term` belongs to. */
+  private int group(RegExpTerm term) {
+    exists(RegExpGroup grp | grp.getNumber() = result | term.getParent*() = grp)
+  }
+}
+
+/** A class to test whether a regular expression matches certain HTML tags. */
+class HTMLMatchingRegExp extends RegexpMatching::MatchedRegExp {
+  HTMLMatchingRegExp() {
+    // the regexp must mention "<" and ">" explicitly.
+    forall(string angleBracket | angleBracket = ["<", ">"] |
+      any(RegExpConstant term | term.getValue().matches("%" + angleBracket + "%")).getRootTerm() =
+        this
+    )
+  }
+
+  override predicate testWithGroups(string str, boolean ignorePrefix) {
+    ignorePrefix = true and
+    str = ["<!-- foo -->", "<!-- foo --!>", "<!- foo ->", "<foo>", "<script>"]
+  }
+
+  override predicate test(string str, boolean ignorePrefix) {
+    ignorePrefix = true and
+    str =
+      [
+        "<!-- foo -->", "<!- foo ->", "<!-- foo --!>", "<!-- foo\n -->", "<script>foo</script>",
+        "<script \n>foo</script>", "<script >foo\n</script>", "<foo ></foo>", "<foo>",
+        "<foo src=\"foo\"></foo>", "<script>", "<script src=\"foo\"></script>",
+        "<script src='foo'></script>", "<SCRIPT>foo</SCRIPT>", "<script\tsrc=\"foo\"/>",
+        "<script\tsrc='foo'></script>", "<sCrIpT>foo</ScRiPt>", "<script src=\"foo\">foo</script >",
+        "<script src=\"foo\">foo</script foo=\"bar\">", "<script src=\"foo\">foo</script\t\n bar>"
+      ]
+  }
+}
+
+/**
+ * Holds if `regexp` matches some HTML tags, but misses some HTML tags that it should match.
+ *
+ * When adding a new case to this predicate, make sure the test string used in `matches(..)` calls are present in `HTMLMatchingRegExp::test` / `HTMLMatchingRegExp::testWithGroups`.
+ */
+predicate isBadRegexpFilter(HTMLMatchingRegExp regexp, string msg) {
+  // CVE-2021-33829 - matching both "<!-- foo -->" and "<!-- foo --!>", but in different capture groups
+  regexp.matches("<!-- foo -->") and
+  regexp.matches("<!-- foo --!>") and
+  exists(int a, int b | a != b |
+    regexp.fillsCaptureGroup("<!-- foo -->", a) and
+    // <!-- foo --> might be ambiguously parsed (matching both capture groups), and that is ok here.
+    regexp.fillsCaptureGroup("<!-- foo --!>", b) and
+    not regexp.fillsCaptureGroup("<!-- foo --!>", a) and
+    msg =
+      "Comments ending with --> are matched differently from comments ending with --!>. The first is matched with capture group "
+        + a + " and comments ending with --!> are matched with capture group " +
+        strictconcat(int i | regexp.fillsCaptureGroup("<!-- foo --!>", i) | i.toString(), ", ") +
+        "."
+  )
+  or
+  // CVE-2020-17480 - matching "<!-- foo -->" and other tags, but not "<!-- foo --!>".
+  exists(int group, int other |
+    group != other and
+    regexp.fillsCaptureGroup("<!-- foo -->", group) and
+    regexp.fillsCaptureGroup("<foo>", other) and
+    not regexp.matches("<!-- foo --!>") and
+    not regexp.fillsCaptureGroup("<!-- foo -->", any(int i | i != group)) and
+    not regexp.fillsCaptureGroup("<!- foo ->", group) and
+    not regexp.fillsCaptureGroup("<foo>", group) and
+    not regexp.fillsCaptureGroup("<script>", group) and
+    msg =
+      "This regular expression only parses --> (capture group " + group +
+        ") and not --!> as a HTML comment end tag."
+  )
+  or
+  regexp.matches("<!-- foo -->") and
+  not regexp.matches("<!-- foo\n -->") and
+  not regexp.matches("<!- foo ->") and
+  not regexp.matches("<foo>") and
+  not regexp.matches("<script>") and
+  msg = "This regular expression does not match comments containing newlines."
+  or
+  regexp.matches("<script>foo</script>") and
+  regexp.matches("<script src=\"foo\"></script>") and
+  not regexp.matches("<foo ></foo>") and
+  (
+    not regexp.matches("<script \n>foo</script>") and
+    msg = "This regular expression matches <script></script>, but not <script \\n></script>"
+    or
+    not regexp.matches("<script >foo\n</script>") and
+    msg = "This regular expression matches <script>...</script>, but not <script >...\\n</script>"
+  )
+  or
+  regexp.matches("<script>foo</script>") and
+  regexp.matches("<script src=\"foo\"></script>") and
+  not regexp.matches("<script src='foo'></script>") and
+  not regexp.matches("<foo>") and
+  msg = "This regular expression does not match script tags where the attribute uses single-quotes."
+  or
+  regexp.matches("<script>foo</script>") and
+  regexp.matches("<script src='foo'></script>") and
+  not regexp.matches("<script src=\"foo\"></script>") and
+  not regexp.matches("<foo>") and
+  msg = "This regular expression does not match script tags where the attribute uses double-quotes."
+  or
+  regexp.matches("<script>foo</script>") and
+  regexp.matches("<script src='foo'></script>") and
+  not regexp.matches("<script\tsrc='foo'></script>") and
+  not regexp.matches("<foo>") and
+  not regexp.matches("<foo src=\"foo\"></foo>") and
+  msg = "This regular expression does not match script tags where tabs are used between attributes."
+  or
+  regexp.matches("<script>foo</script>") and
+  not RegExpFlags::isIgnoreCase(regexp) and
+  not regexp.matches("<foo>") and
+  not regexp.matches("<foo ></foo>") and
+  (
+    not regexp.matches("<SCRIPT>foo</SCRIPT>") and
+    msg = "This regular expression does not match upper case <SCRIPT> tags."
+    or
+    not regexp.matches("<sCrIpT>foo</ScRiPt>") and
+    regexp.matches("<SCRIPT>foo</SCRIPT>") and
+    msg = "This regular expression does not match mixed case <sCrIpT> tags."
+  )
+  or
+  regexp.matches("<script src=\"foo\"></script>") and
+  not regexp.matches("<foo>") and
+  not regexp.matches("<foo ></foo>") and
+  (
+    not regexp.matches("<script src=\"foo\">foo</script >") and
+    msg = "This regular expression does not match script end tags like </script >."
+    or
+    not regexp.matches("<script src=\"foo\">foo</script foo=\"bar\">") and
+    msg = "This regular expression does not match script end tags like </script foo=\"bar\">."
+    or
+    not regexp.matches("<script src=\"foo\">foo</script\t\n bar>") and
+    msg = "This regular expression does not match script end tags like </script\\t\\n bar>."
+  )
+}
--- a/javascript/ql/lib/semmle/javascript/security/performance/ReDoSUtil.qll
+++ b/javascript/ql/lib/semmle/javascript/security/performance/ReDoSUtil.qll
@ -542,7 +542,7 @@ private State before(RegExpTerm t) { result = Match(t, 0) }
 /**
 * Gets a state the NFA may be in after matching `t`.
 */
-private State after(RegExpTerm t) {
+State after(RegExpTerm t) {
  exists(RegExpAlt alt | t = alt.getAChild() | result = after(alt))
  or
  exists(RegExpSequence seq, int i | t = seq.getChild(i) |
@ -671,7 +671,7 @@ RegExpRoot getRoot(RegExpTerm term) {
 /**
 * A state in the NFA.
 */
-private newtype TState =
+newtype TState =
  /**
   * A state representing that the NFA is about to match a term.
   * `i` is used to index into multi-char literals.
@ -801,29 +801,26 @@ InputSymbol getAnInputSymbolMatching(string char) {
  result = Any()
 }

+/**
+ * Holds if `state` is a start state.
+ */
+predicate isStartState(State state) {
+  state = mkMatch(any(RegExpRoot r))
+  or
+  exists(RegExpCaret car | state = after(car))
+}
+
 /**
 * Predicates for constructing a prefix string that leads to a given state.
 */
 private module PrefixConstruction {
-  /**
-   * Holds if `state` starts the string matched by the regular expression.
-   */
-  private predicate isStartState(State state) {
-    state instanceof StateInPumpableRegexp and
-    (
-      state = Match(any(RegExpRoot r), _)
-      or
-      exists(RegExpCaret car | state = after(car))
-    )
-  }
-
  /**
   * Holds if `state` is the textually last start state for the regular expression.
   */
  private predicate lastStartState(State state) {
    exists(RegExpRoot root |
      state =
-        max(State s, Location l |
+        max(StateInPumpableRegexp s, Location l |
          isStartState(s) and getRoot(s.getRepr()) = root and l = s.getRepr().getLocation()
        |
          s
--- a/javascript/ql/lib/semmle/javascript/security/performance/RegExpTreeView.qll
+++ b/javascript/ql/lib/semmle/javascript/security/performance/RegExpTreeView.qll
@ -20,12 +20,7 @@ module RegExpFlags {
  /**
   * Holds if `root` has the `i` flag for case-insensitive matching.
   */
-  predicate isIgnoreCase(RegExpTerm root) {
-    root.isRootTerm() and
-    exists(DataFlow::RegExpCreationNode node | node.getRoot() = root |
-      RegExp::isIgnoreCase(node.getFlags())
-    )
-  }
+  predicate isIgnoreCase(RegExpTerm root) { RegExp::isIgnoreCase(getFlags(root)) }

  /**
   * Gets the flags for `root`, or the empty string if `root` has no flags.
@ -38,15 +33,14 @@ module RegExpFlags {
      not exists(node.getFlags()) and
      result = ""
    )
+    or
+    exists(RegExpPatternSource source | source.getRegExpTerm() = root |
+      result = source.getARegExpObject().(DataFlow::RegExpCreationNode).getFlags()
+    )
  }

  /**
   * Holds if `root` has the `s` flag for multi-line matching.
   */
-  predicate isDotAll(RegExpTerm root) {
-    root.isRootTerm() and
-    exists(DataFlow::RegExpCreationNode node | node.getRoot() = root |
-      RegExp::isDotAll(node.getFlags())
-    )
-  }
+  predicate isDotAll(RegExpTerm root) { RegExp::isDotAll(getFlags(root)) }
 }
--- a/javascript/ql/src/Security/CWE-116/BadTagFilter.qhelp
+++ b/javascript/ql/src/Security/CWE-116/BadTagFilter.qhelp
@ -0,0 +1,54 @@
+<!DOCTYPE qhelp PUBLIC
+  "-//Semmle//qhelp//EN"
+  "qhelp.dtd">
+<qhelp>
+
+<overview>
+<p>
+It is possible to match some single HTML tags using regular expressions (parsing general HTML using 
+regular expressions is impossible). However, if the regular expression is not written well it might 
+be possible to circumvent it, which can lead to cross-site scripting or other security issues.
+</p>
+<p>
+Some of these mistakes are caused by browsers having very forgiving HTML parsers, which
+will often render invalid HTML containing syntax errors. 
+Regular expressions that attempt to match HTML should also recognize tags containing such syntax errors.
+</p>
+</overview>
+
+<recommendation>
+<p>
+Use a well-tested sanitization or parser library if at all possible. These libraries are much more
+likely to handle corner cases correctly than a custom implementation.
+</p>
+</recommendation>
+
+<example>
+<p>
+The following example attempts to filter out all <code>&lt;script&gt;</code> tags.
+</p>
+
+<sample src="examples/BadTagFilter.js" />
+
+<p>
+The above sanitizer does not filter out all <code>&lt;script&gt;</code> tags. 
+Browsers will not only accept <code>&lt;/script&gt;</code> as script end tags, but also tags such as <code>&lt;/script foo="bar"&gt;</code> even though it is a parser error.
+This means that an attack string such as <code>&lt;script&gt;alert(1)&lt;/script foo="bar"&gt;</code> will not be filtered by 
+the function, and <code>alert(1)</code> will be executed by a browser if the string is rendered as HTML.
+</p>
+
+<p>
+Other corner cases include that HTML comments can end with <code>--!&gt;</code>, 
+and that HTML tag names can contain upper case characters.
+</p>
+</example>
+
+<references>
+<li>Securitum: <a href="https://research.securitum.com/the-curious-case-of-copy-paste/">The Curious Case of Copy &amp; Paste</a>.</li>
+<li>stackoverflow.com: <a href="https://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags#answer-1732454">You can't parse [X]HTML with regex</a>.</li>
+<li>HTML Standard: <a href="https://html.spec.whatwg.org/multipage/parsing.html#comment-end-bang-state">Comment end bang state</a>.</li>
+<li>stackoverflow.com: <a href="https://stackoverflow.com/questions/25559999/why-arent-browsers-strict-about-html">Why aren't browsers strict about HTML?</a>.</li>
+</references>
+</qhelp>
+
+
--- a/javascript/ql/src/Security/CWE-116/BadTagFilter.ql
+++ b/javascript/ql/src/Security/CWE-116/BadTagFilter.ql
@ -0,0 +1,19 @@
+/**
+ * @name Bad HTML filtering regexp
+ * @description Matching HTML tags using regular expressions is hard to do right, and can easily lead to security issues.
+ * @kind problem
+ * @problem.severity warning
+ * @security-severity 7.8
+ * @precision high
+ * @id js/bad-tag-filter
+ * @tags correctness
+ *       security
+ *       external/cwe/cwe-116
+ *       external/cwe/cwe-020
+ */
+
+import semmle.javascript.security.BadTagFilterQuery
+
+from HTMLMatchingRegExp regexp, string msg
+where msg = min(string m | isBadRegexpFilter(regexp, m) | m order by m.length(), m) // there might be multiple, we arbitrarily pick the shortest one
+select regexp, msg
--- a/javascript/ql/src/Security/CWE-116/examples/BadTagFilter.js
+++ b/javascript/ql/src/Security/CWE-116/examples/BadTagFilter.js
@ -0,0 +1,8 @@
+function filterScript(html) {
+    var scriptRegex = /<script\b[^>]*>([\s\S]*?)<\/script>/gi;
+    var match;
+    while ((match = scriptRegex.exec(html)) !== null) {
+        html = html.replace(match[0], match[1]);
+    }
+    return html;
+}
--- a/javascript/ql/test/query-tests/Security/CWE-116/BadTagFilter/BadTagFilter.expected
+++ b/javascript/ql/test/query-tests/Security/CWE-116/BadTagFilter/BadTagFilter.expected
@ -0,0 +1,17 @@
+| tst.js:2:6:2:29 | <script.*?>.*?<\\/script> | This regular expression does not match script end tags like </script >. |
+| tst.js:3:6:3:29 | <script.*?>.*?<\\/script> | This regular expression does not match script end tags like </script >. |
+| tst.js:7:6:7:16 | <!--.*--!?> | This regular expression does not match comments containing newlines. |
+| tst.js:8:6:8:39 | <script.*?>(.\|\\s)*?<\\/script[^>]*> | This regular expression matches <script></script>, but not <script \\n></script> |
+| tst.js:9:6:9:37 | <script[^>]*?>.*?<\\/script[^>]*> | This regular expression matches <script>...</script>, but not <script >...\\n</script> |
+| tst.js:10:6:10:44 | <script(\\s\|\\w\|=\|")*?>.*?<\\/script[^>]*> | This regular expression does not match script tags where the attribute uses single-quotes. |
+| tst.js:11:6:11:44 | <script(\\s\|\\w\|=\|')*?>.*?<\\/script[^>]*> | This regular expression does not match script tags where the attribute uses double-quotes. |
+| tst.js:12:6:12:48 | <script( \|\\n\|\\w\|=\|'\|")*?>.*?<\\/script[^>]*> | This regular expression does not match script tags where tabs are used between attributes. |
+| tst.js:13:6:13:34 | <script.*?>.*?<\\/script[^>]*> | This regular expression does not match upper case <SCRIPT> tags. |
+| tst.js:14:6:14:52 | <(script\|SCRIPT).*?>.*?<\\/(script\|SCRIPT)[^>]*> | This regular expression does not match mixed case <sCrIpT> tags. |
+| tst.js:15:6:15:39 | <script[^>]*?>[\\s\\S]*?<\\/script.*> | This regular expression does not match script end tags like </script\\t\\n bar>. |
+| tst.js:17:6:17:40 | <script\\b[^>]*>([\\s\\S]*?)<\\/script> | This regular expression does not match script end tags like </script >. |
+| tst.js:18:6:18:48 | <(?:!--([\\S\|\\s]*?)-->)\|([^\\/\\s>]+)[\\S\\s]*?> | Comments ending with --> are matched differently from comments ending with --!>. The first is matched with capture group 1 and comments ending with --!> are matched with capture group 2. |
+| tst.js:19:6:19:147 | <(?:(?:\\/([^>]+)>)\|(?:!--([\\S\|\\s]*?)-->)\|(?:([^\\/\\s>]+)((?:\\s+[\\w\\-:.]+(?:\\s*=\\s*?(?:(?:"[^"]*")\|(?:'[^']*')\|[^\\s"'\\/>]+))?)*)[\\S\\s]*?(\\/?)>)) | Comments ending with --> are matched differently from comments ending with --!>. The first is matched with capture group 2 and comments ending with --!> are matched with capture group 3, 4. |
+| tst.js:20:3:20:57 | (<[a-z\\/!$]("[^"]*"\|'[^']*'\|[^'">])*>\|<!(--.*?--\\s*)+>) | Comments ending with --> are matched differently from comments ending with --!>. The first is matched with capture group 3 and comments ending with --!> are matched with capture group 1. |
+| tst.js:21:6:21:249 | <(?:(?:!--([\\w\\W]*?)-->)\|(?:!\\[CDATA\\[([\\w\\W]*?)\\]\\]>)\|(?:!DOCTYPE([\\w\\W]*?)>)\|(?:\\?([^\\s\\/<>]+) ?([\\w\\W]*?)[?/]>)\|(?:\\/([A-Za-z][A-Za-z0-9\\-_\\:\\.]*)>)\|(?:([A-Za-z][A-Za-z0-9\\-_\\:\\.]*)((?:\\s+[^"'>]+(?:(?:"[^"]*")\|(?:'[^']*')\|[^>]*))*\|\\/\|\\s+)>)) | This regular expression only parses --> (capture group 1) and not --!> as a HTML comment end tag. |
+| tst.js:22:6:22:33 | <!--([\\w\\W]*?)-->\|<([^>]*?)> | Comments ending with --> are matched differently from comments ending with --!>. The first is matched with capture group 1 and comments ending with --!> are matched with capture group 2. |
--- a/javascript/ql/test/query-tests/Security/CWE-116/BadTagFilter/BadTagFilter.qlref
+++ b/javascript/ql/test/query-tests/Security/CWE-116/BadTagFilter/BadTagFilter.qlref
@ -0,0 +1 @@
+Security/CWE-116/BadTagFilter.ql
--- a/javascript/ql/test/query-tests/Security/CWE-116/BadTagFilter/tst.js
+++ b/javascript/ql/test/query-tests/Security/CWE-116/BadTagFilter/tst.js
@ -0,0 +1,28 @@
+var filters = [
+    /<script.*?>.*?<\/script>/i, // NOT OK - doesn't match newlines or `</script >`
+    /<script.*?>.*?<\/script>/is, // NOT OK - doesn't match `</script >`
+    /<script.*?>.*?<\/script[^>]*>/is, // OK
+    /<!--.*-->/is, // OK - we don't care about regexps that only match comments
+    /<!--.*--!?>/is, // OK
+    /<!--.*--!?>/i, // NOT OK, does not match newlines
+    /<script.*?>(.|\s)*?<\/script[^>]*>/i, // NOT OK - doesn't match inside the script tag
+    /<script[^>]*?>.*?<\/script[^>]*>/i, // NOT OK - doesn't match newlines inside the content
+    /<script(\s|\w|=|")*?>.*?<\/script[^>]*>/is, // NOT OK - does not match single quotes for attribute values
+    /<script(\s|\w|=|')*?>.*?<\/script[^>]*>/is, // NOT OK - does not match double quotes for attribute values
+    /<script( |\n|\w|=|'|")*?>.*?<\/script[^>]*>/is, // NOT OK - does not match tabs between attributes
+    /<script.*?>.*?<\/script[^>]*>/s, // NOT OK - does not match uppercase SCRIPT tags
+    /<(script|SCRIPT).*?>.*?<\/(script|SCRIPT)[^>]*>/s, // NOT OK - does not match mixed case script tags
+    /<script[^>]*?>[\s\S]*?<\/script.*>/i, // NOT OK - doesn't match newlines in the end tag
+    /<script[^>]*?>[\s\S]*?<\/script[^>]*?>/i, // OK
+    /<script\b[^>]*>([\s\S]*?)<\/script>/gi, // NOT OK - too strict matching on the end tag
+    /<(?:!--([\S|\s]*?)-->)|([^\/\s>]+)[\S\s]*?>/, // NOT OK - doesn't match comments with the right capture groups
+    /<(?:(?:\/([^>]+)>)|(?:!--([\S|\s]*?)-->)|(?:([^\/\s>]+)((?:\s+[\w\-:.]+(?:\s*=\s*?(?:(?:"[^"]*")|(?:'[^']*')|[^\s"'\/>]+))?)*)[\S\s]*?(\/?)>))/, // NOT OK - capture groups
+	/(<[a-z\/!$]("[^"]*"|'[^']*'|[^'">])*>|<!(--.*?--\s*)+>)/gi, // NOT OK - capture groups
+    /<(?:(?:!--([\w\W]*?)-->)|(?:!\[CDATA\[([\w\W]*?)\]\]>)|(?:!DOCTYPE([\w\W]*?)>)|(?:\?([^\s\/<>]+) ?([\w\W]*?)[?/]>)|(?:\/([A-Za-z][A-Za-z0-9\-_\:\.]*)>)|(?:([A-Za-z][A-Za-z0-9\-_\:\.]*)((?:\s+[^"'>]+(?:(?:"[^"]*")|(?:'[^']*')|[^>]*))*|\/|\s+)>))/g, // NOT OK - capture groups
+    /<!--([\w\W]*?)-->|<([^>]*?)>/g, // NOT OK - capture groups
+]
+
+doFilters(filters)
+
+var strip = '<script([^>]*)>([\\S\\s]*?)<\/script([^>]*)>';  // OK - it's used with the ignorecase flag
+new RegExp(strip, 'gi');
--- a/python/change-notes/2021-08-26-bad-tag-filter.md
+++ b/python/change-notes/2021-08-26-bad-tag-filter.md
@ -0,0 +1,4 @@
+lgtm,codescanning
+* A new query, `py/bad-tag-filter`, has been added to the query suite,
+  highlighting regular expressions that only match a subset of the HTML tags
+  they are supposed to match.
--- a/python/ql/lib/semmle/python/RegexTreeView.qll
+++ b/python/ql/lib/semmle/python/RegexTreeView.qll
@ -467,9 +467,10 @@ class RegExpEscape extends RegExpNormalChar {
    or
    this.getUnescaped() = "t" and result = "\t"
    or
-    // TODO: Find a way to include a formfeed character
-    // this.getUnescaped() = "f" and result = ""
-    // or
+    this.getUnescaped() = "f" and result = 12.toUnicode()
+    or
+    this.getUnescaped() = "v" and result = 11.toUnicode()
+    or
    this.isUnicode() and
    result = this.getUnicode()
  }
@ -480,7 +481,7 @@ class RegExpEscape extends RegExpNormalChar {
  override string getPrimaryQLClass() { result = "RegExpEscape" }

  /** Gets the part of the term following the escape character. That is e.g. "w" if the term is "\w". */
-  private string getUnescaped() { result = this.getText().suffix(1) }
+  string getUnescaped() { result = this.getText().suffix(1) }

  /**
   * Gets the text for this escape. That is e.g. "\w".
@ -535,6 +536,13 @@ private int toHex(string hex) {
  result = 15 and hex = ["f", "F"]
 }

+/**
+ * A word boundary, that is, a regular expression term of the form `\b`.
+ */
+class RegExpWordBoundary extends RegExpEscape {
+  RegExpWordBoundary() { this.getUnescaped() = "b" }
+}
+
 /**
 * A character class escape in a regular expression.
 * That is, an escaped charachter that denotes multiple characters.
--- a/python/ql/lib/semmle/python/regex.qll
+++ b/python/ql/lib/semmle/python/regex.qll
@ -454,6 +454,7 @@ abstract class RegexString extends Expr {
  /** Gets the number of the group in start,end */
  int getGroupNumber(int start, int end) {
    this.group(start, end) and
+    not this.non_capturing_group_start(start, _) and
    result =
      count(int i | this.group(i, _) and i < start and not this.non_capturing_group_start(i, _)) + 1
  }
--- a/python/ql/lib/semmle/python/security/BadTagFilterQuery.qll
+++ b/python/ql/lib/semmle/python/security/BadTagFilterQuery.qll
@ -0,0 +1,306 @@
+/**
+ * Provides precicates for reasoning about bad tag filter vulnerabilities.
+ */
+
+import performance.ReDoSUtil
+
+/**
+ * A module for determining if a regexp matches a given string,
+ * and reasoning about which capture groups are filled by a given string.
+ */
+private module RegexpMatching {
+  /**
+   * A class to test whether a regular expression matches a string.
+   * Override this class and extend `test`/`testWithGroups` to configure which strings should be tested for acceptance by this regular expression.
+   * The result can afterwards be read from the `matches` predicate.
+   *
+   * Strings in the `testWithGroups` predicate are also tested for which capture groups are filled by the given string.
+   * The result is available in the `fillCaptureGroup` predicate.
+   */
+  abstract class MatchedRegExp extends RegExpTerm {
+    MatchedRegExp() { this.isRootTerm() }
+
+    /**
+     * Holds if it should be tested whether this regular expression matches `str`.
+     *
+     * If `ignorePrefix` is true, then a regexp without a start anchor will be treated as if it had a start anchor.
+     * E.g. a regular expression `/foo$/` will match any string that ends with "foo",
+     * but if `ignorePrefix` is true, it will only match "foo".
+     */
+    predicate test(string str, boolean ignorePrefix) {
+      none() // maybe overriden in subclasses
+    }
+
+    /**
+     * Same as `test(..)`, but where the `fillsCaptureGroup` afterwards tells which capture groups were filled by the given string.
+     */
+    predicate testWithGroups(string str, boolean ignorePrefix) {
+      none() // maybe overriden in subclasses
+    }
+
+    /**
+     * Holds if this RegExp matches `str`, where `str` is either in the `test` or `testWithGroups` predicate.
+     */
+    final predicate matches(string str) {
+      exists(State state | state = getAState(this, str.length() - 1, str, _) |
+        epsilonSucc*(state) = Accept(_)
+      )
+    }
+
+    /**
+     * Holds if matching `str` may fill capture group number `g`.
+     * Only holds if `str` is in the `testWithGroups` predicate.
+     */
+    final predicate fillsCaptureGroup(string str, int g) {
+      exists(State s |
+        s = getAStateThatReachesAccept(this, _, str, _) and
+        g = group(s.getRepr())
+      )
+    }
+  }
+
+  /**
+   * Gets a state the regular expression `reg` can be in after matching the `i`th char in `str`.
+   * The regular expression is modelled as a non-determistic finite automaton,
+   * the regular expression can therefore be in multiple states after matching a character.
+   *
+   * It's a forward search to all possible states, and there is thus no guarantee that the state is on a path to an accepting state.
+   */
+  private State getAState(MatchedRegExp reg, int i, string str, boolean ignorePrefix) {
+    // start state, the -1 position before any chars have been matched
+    i = -1 and
+    (
+      reg.test(str, ignorePrefix)
+      or
+      reg.testWithGroups(str, ignorePrefix)
+    ) and
+    result.getRepr().getRootTerm() = reg and
+    isStartState(result)
+    or
+    // recursive case
+    result = getAStateAfterMatching(reg, _, str, i, _, ignorePrefix)
+  }
+
+  /**
+   * Gets the next state after the `prev` state from `reg`.
+   * `prev` is the state after matching `fromIndex` chars in `str`,
+   * and the result is the state after matching `toIndex` chars in `str`.
+   *
+   * This predicate is used as a step relation in the forwards search (`getAState`),
+   * and also as a step relation in the later backwards search (`getAStateThatReachesAccept`).
+   */
+  private State getAStateAfterMatching(
+    MatchedRegExp reg, State prev, string str, int toIndex, int fromIndex, boolean ignorePrefix
+  ) {
+    // the basic recursive case - outlined into a noopt helper to make performance work out.
+    result = getAStateAfterMatchingAux(reg, prev, str, toIndex, fromIndex, ignorePrefix)
+    or
+    // we can skip past word boundaries if the next char is a non-word char.
+    fromIndex = toIndex and
+    prev.getRepr() instanceof RegExpWordBoundary and
+    prev = getAState(reg, toIndex, str, ignorePrefix) and
+    after(prev.getRepr()) = result and
+    str.charAt(toIndex + 1).regexpMatch("\\W") // \W matches any non-word char.
+  }
+
+  pragma[noopt]
+  private State getAStateAfterMatchingAux(
+    MatchedRegExp reg, State prev, string str, int toIndex, int fromIndex, boolean ignorePrefix
+  ) {
+    prev = getAState(reg, fromIndex, str, ignorePrefix) and
+    fromIndex = toIndex - 1 and
+    exists(string char | char = str.charAt(toIndex) | specializedDeltaClosed(prev, char, result)) and
+    not discardedPrefixStep(prev, result, ignorePrefix)
+  }
+
+  /** Holds if a step from `prev` to `next` should be discarded when the `ignorePrefix` flag is set. */
+  private predicate discardedPrefixStep(State prev, State next, boolean ignorePrefix) {
+    prev = mkMatch(any(RegExpRoot r)) and
+    ignorePrefix = true and
+    next = prev
+  }
+
+  // The `deltaClosed` relation specialized to the chars that exists in strings tested by a `MatchedRegExp`.
+  private predicate specializedDeltaClosed(State prev, string char, State next) {
+    deltaClosed(prev, specializedGetAnInputSymbolMatching(char), next)
+  }
+
+  // The `getAnInputSymbolMatching` relation specialized to the chars that exists in strings tested by a `MatchedRegExp`.
+  pragma[noinline]
+  private InputSymbol specializedGetAnInputSymbolMatching(string char) {
+    exists(string s, MatchedRegExp r |
+      r.test(s, _)
+      or
+      r.testWithGroups(s, _)
+    |
+      char = s.charAt(_)
+    ) and
+    result = getAnInputSymbolMatching(char)
+  }
+
+  /**
+   * Gets the `i`th state on a path to the accepting state when `reg` matches `str`.
+   * Starts with an accepting state as found by `getAState` and searches backwards
+   * to the start state through the reachable states (as found by `getAState`).
+   *
+   * This predicate holds the invariant that the result state can be reached with `i` steps from a start state,
+   * and an accepting state can be found after (`str.length() - 1 - i`) steps from the result.
+   * The result state is therefore always on a valid path where `reg` accepts `str`.
+   *
+   * This predicate is only used to find which capture groups a regular expression has filled,
+   * and thus the search is only performed for the strings in the `testWithGroups(..)` predicate.
+   */
+  private State getAStateThatReachesAccept(
+    MatchedRegExp reg, int i, string str, boolean ignorePrefix
+  ) {
+    // base case, reaches an accepting state from the last state in `getAState(..)`
+    reg.testWithGroups(str, ignorePrefix) and
+    i = str.length() - 1 and
+    result = getAState(reg, i, str, ignorePrefix) and
+    epsilonSucc*(result) = Accept(_)
+    or
+    // recursive case. `next` is the next state to be matched after matching `prev`.
+    // this predicate is doing a backwards search, so `prev` is the result we are looking for.
+    exists(State next, State prev, int fromIndex, int toIndex |
+      next = getAStateThatReachesAccept(reg, toIndex, str, ignorePrefix) and
+      next = getAStateAfterMatching(reg, prev, str, toIndex, fromIndex, ignorePrefix) and
+      i = fromIndex and
+      result = prev
+    )
+  }
+
+  /** Gets the capture group number that `term` belongs to. */
+  private int group(RegExpTerm term) {
+    exists(RegExpGroup grp | grp.getNumber() = result | term.getParent*() = grp)
+  }
+}
+
+/** A class to test whether a regular expression matches certain HTML tags. */
+class HTMLMatchingRegExp extends RegexpMatching::MatchedRegExp {
+  HTMLMatchingRegExp() {
+    // the regexp must mention "<" and ">" explicitly.
+    forall(string angleBracket | angleBracket = ["<", ">"] |
+      any(RegExpConstant term | term.getValue().matches("%" + angleBracket + "%")).getRootTerm() =
+        this
+    )
+  }
+
+  override predicate testWithGroups(string str, boolean ignorePrefix) {
+    ignorePrefix = true and
+    str = ["<!-- foo -->", "<!-- foo --!>", "<!- foo ->", "<foo>", "<script>"]
+  }
+
+  override predicate test(string str, boolean ignorePrefix) {
+    ignorePrefix = true and
+    str =
+      [
+        "<!-- foo -->", "<!- foo ->", "<!-- foo --!>", "<!-- foo\n -->", "<script>foo</script>",
+        "<script \n>foo</script>", "<script >foo\n</script>", "<foo ></foo>", "<foo>",
+        "<foo src=\"foo\"></foo>", "<script>", "<script src=\"foo\"></script>",
+        "<script src='foo'></script>", "<SCRIPT>foo</SCRIPT>", "<script\tsrc=\"foo\"/>",
+        "<script\tsrc='foo'></script>", "<sCrIpT>foo</ScRiPt>", "<script src=\"foo\">foo</script >",
+        "<script src=\"foo\">foo</script foo=\"bar\">", "<script src=\"foo\">foo</script\t\n bar>"
+      ]
+  }
+}
+
+/**
+ * Holds if `regexp` matches some HTML tags, but misses some HTML tags that it should match.
+ *
+ * When adding a new case to this predicate, make sure the test string used in `matches(..)` calls are present in `HTMLMatchingRegExp::test` / `HTMLMatchingRegExp::testWithGroups`.
+ */
+predicate isBadRegexpFilter(HTMLMatchingRegExp regexp, string msg) {
+  // CVE-2021-33829 - matching both "<!-- foo -->" and "<!-- foo --!>", but in different capture groups
+  regexp.matches("<!-- foo -->") and
+  regexp.matches("<!-- foo --!>") and
+  exists(int a, int b | a != b |
+    regexp.fillsCaptureGroup("<!-- foo -->", a) and
+    // <!-- foo --> might be ambigously parsed (matching both capture groups), and that is ok here.
+    regexp.fillsCaptureGroup("<!-- foo --!>", b) and
+    not regexp.fillsCaptureGroup("<!-- foo --!>", a) and
+    msg =
+      "Comments ending with --> are matched differently from comments ending with --!>. The first is matched with capture group "
+        + a + " and comments ending with --!> are matched with capture group " +
+        strictconcat(int i | regexp.fillsCaptureGroup("<!-- foo --!>", i) | i.toString(), ", ") +
+        "."
+  )
+  or
+  // CVE-2020-17480 - matching "<!-- foo -->" and other tags, but not "<!-- foo --!>".
+  exists(int group, int other |
+    group != other and
+    regexp.fillsCaptureGroup("<!-- foo -->", group) and
+    regexp.fillsCaptureGroup("<foo>", other) and
+    not regexp.matches("<!-- foo --!>") and
+    not regexp.fillsCaptureGroup("<!-- foo -->", any(int i | i != group)) and
+    not regexp.fillsCaptureGroup("<!- foo ->", group) and
+    not regexp.fillsCaptureGroup("<foo>", group) and
+    not regexp.fillsCaptureGroup("<script>", group) and
+    msg =
+      "This regular expression only parses --> (capture group " + group +
+        ") and not --!> as a HTML comment end tag."
+  )
+  or
+  regexp.matches("<!-- foo -->") and
+  not regexp.matches("<!-- foo\n -->") and
+  not regexp.matches("<!- foo ->") and
+  not regexp.matches("<foo>") and
+  not regexp.matches("<script>") and
+  msg = "This regular expression does not match comments containing newlines."
+  or
+  regexp.matches("<script>foo</script>") and
+  regexp.matches("<script src=\"foo\"></script>") and
+  not regexp.matches("<foo ></foo>") and
+  (
+    not regexp.matches("<script \n>foo</script>") and
+    msg = "This regular expression matches <script></script>, but not <script \\n></script>"
+    or
+    not regexp.matches("<script >foo\n</script>") and
+    msg = "This regular expression matches <script>...</script>, but not <script >...\\n</script>"
+  )
+  or
+  regexp.matches("<script>foo</script>") and
+  regexp.matches("<script src=\"foo\"></script>") and
+  not regexp.matches("<script src='foo'></script>") and
+  not regexp.matches("<foo>") and
+  msg = "This regular expression does not match script tags where the attribute uses single-quotes."
+  or
+  regexp.matches("<script>foo</script>") and
+  regexp.matches("<script src='foo'></script>") and
+  not regexp.matches("<script src=\"foo\"></script>") and
+  not regexp.matches("<foo>") and
+  msg = "This regular expression does not match script tags where the attribute uses double-quotes."
+  or
+  regexp.matches("<script>foo</script>") and
+  regexp.matches("<script src='foo'></script>") and
+  not regexp.matches("<script\tsrc='foo'></script>") and
+  not regexp.matches("<foo>") and
+  not regexp.matches("<foo src=\"foo\"></foo>") and
+  msg = "This regular expression does not match script tags where tabs are used between attributes."
+  or
+  regexp.matches("<script>foo</script>") and
+  not RegExpFlags::isIgnoreCase(regexp) and
+  not regexp.matches("<foo>") and
+  not regexp.matches("<foo ></foo>") and
+  (
+    not regexp.matches("<SCRIPT>foo</SCRIPT>") and
+    msg = "This regular expression does not match upper case <SCRIPT> tags."
+    or
+    not regexp.matches("<sCrIpT>foo</ScRiPt>") and
+    regexp.matches("<SCRIPT>foo</SCRIPT>") and
+    msg = "This regular expression does not match mixed case <sCrIpT> tags."
+  )
+  or
+  regexp.matches("<script src=\"foo\"></script>") and
+  not regexp.matches("<foo>") and
+  not regexp.matches("<foo ></foo>") and
+  (
+    not regexp.matches("<script src=\"foo\">foo</script >") and
+    msg = "This regular expression does not match script end tags like </script >."
+    or
+    not regexp.matches("<script src=\"foo\">foo</script foo=\"bar\">") and
+    msg = "This regular expression does not match script end tags like </script foo=\"bar\">."
+    or
+    not regexp.matches("<script src=\"foo\">foo</script\t\n bar>") and
+    msg = "This regular expression does not match script end tags like </script\\t\\n bar>."
+  )
+}
--- a/python/ql/lib/semmle/python/security/performance/ReDoSUtil.qll
+++ b/python/ql/lib/semmle/python/security/performance/ReDoSUtil.qll
@ -542,7 +542,7 @@ private State before(RegExpTerm t) { result = Match(t, 0) }
 /**
 * Gets a state the NFA may be in after matching `t`.
 */
-private State after(RegExpTerm t) {
+State after(RegExpTerm t) {
  exists(RegExpAlt alt | t = alt.getAChild() | result = after(alt))
  or
  exists(RegExpSequence seq, int i | t = seq.getChild(i) |
@ -671,7 +671,7 @@ RegExpRoot getRoot(RegExpTerm term) {
 /**
 * A state in the NFA.
 */
-private newtype TState =
+newtype TState =
  /**
   * A state representing that the NFA is about to match a term.
   * `i` is used to index into multi-char literals.
@ -801,29 +801,26 @@ InputSymbol getAnInputSymbolMatching(string char) {
  result = Any()
 }

+/**
+ * Holds if `state` is a start state.
+ */
+predicate isStartState(State state) {
+  state = mkMatch(any(RegExpRoot r))
+  or
+  exists(RegExpCaret car | state = after(car))
+}
+
 /**
 * Predicates for constructing a prefix string that leads to a given state.
 */
 private module PrefixConstruction {
-  /**
-   * Holds if `state` starts the string matched by the regular expression.
-   */
-  private predicate isStartState(State state) {
-    state instanceof StateInPumpableRegexp and
-    (
-      state = Match(any(RegExpRoot r), _)
-      or
-      exists(RegExpCaret car | state = after(car))
-    )
-  }
-
  /**
   * Holds if `state` is the textually last start state for the regular expression.
   */
  private predicate lastStartState(State state) {
    exists(RegExpRoot root |
      state =
-        max(State s, Location l |
+        max(StateInPumpableRegexp s, Location l |
          isStartState(s) and getRoot(s.getRepr()) = root and l = s.getRepr().getLocation()
        |
          s
--- a/python/ql/src/Security/CWE-116/BadTagFilter.qhelp
+++ b/python/ql/src/Security/CWE-116/BadTagFilter.qhelp
@ -0,0 +1,54 @@
+<!DOCTYPE qhelp PUBLIC
+  "-//Semmle//qhelp//EN"
+  "qhelp.dtd">
+<qhelp>
+
+<overview>
+<p>
+It is possible to match some single HTML tags using regular expressions (parsing general HTML using 
+regular expressions is impossible). However, if the regular expression is not written well it might 
+be possible to circumvent it, which can lead to cross-site scripting or other security issues.
+</p>
+<p>
+Some of these mistakes are caused by browsers having very forgiving HTML parsers, which
+will often render invalid HTML containing syntax errors.
+Regular expressions that attempt to match HTML should also recognize tags containing such syntax errors.
+</p>
+</overview>
+
+<recommendation>
+<p>
+Use a well-tested sanitization or parser library if at all possible. These libraries are much more
+likely to handle corner cases correctly than a custom implementation.
+</p>
+</recommendation>
+
+<example>
+<p>
+The following example attempts to filter out all <code>&lt;script&gt;</code> tags.
+</p>
+
+<sample src="examples/BadTagFilter.py" />
+
+<p>
+The above sanitizer does not filter out all <code>&lt;script&gt;</code> tags. 
+Browsers accept not only <code>&lt;/script&gt;</code> as a script end tag, but also tags such as <code>&lt;/script foo="bar"&gt;</code>, even though the latter is a parse error.
+This means that an attack string such as <code>&lt;script&gt;alert(1)&lt;/script foo="bar"&gt;</code> will not be filtered by 
+the function, and <code>alert(1)</code> will be executed by a browser if the string is rendered as HTML.
+</p>
+
+<p>
+Other corner cases include that HTML comments can end with <code>--!&gt;</code>, 
+and that HTML tag names can contain upper case characters.
+</p>
+</example>
+
+<references>
+<li>Securitum: <a href="https://research.securitum.com/the-curious-case-of-copy-paste/">The Curious Case of Copy &amp; Paste</a>.</li>
+<li>stackoverflow.com: <a href="https://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags#answer-1732454">You can't parse [X]HTML with regex</a>.</li>
+<li>HTML Standard: <a href="https://html.spec.whatwg.org/multipage/parsing.html#comment-end-bang-state">Comment end bang state</a>.</li>
+<li>stackoverflow.com: <a href="https://stackoverflow.com/questions/25559999/why-arent-browsers-strict-about-html">Why aren't browsers strict about HTML?</a>.</li>
+</references>
+</qhelp>
+
+
--- a/python/ql/src/Security/CWE-116/BadTagFilter.ql
+++ b/python/ql/src/Security/CWE-116/BadTagFilter.ql
@ -0,0 +1,19 @@
+/**
+ * @name Bad HTML filtering regexp
+ * @description Matching HTML tags using regular expressions is hard to do right, and can easily lead to security issues.
+ * @kind problem
+ * @problem.severity warning
+ * @security-severity 7.8
+ * @precision high
+ * @id py/bad-tag-filter
+ * @tags correctness
+ *       security
+ *       external/cwe/cwe-116
+ *       external/cwe/cwe-020
+ */
+
+import semmle.python.security.BadTagFilterQuery
+
+from HTMLMatchingRegExp regexp, string msg
+where msg = min(string m | isBadRegexpFilter(regexp, m) | m order by m.length(), m) // there might be multiple, we arbitrarily pick the shortest one
+select regexp, msg
--- a/python/ql/src/Security/CWE-116/examples/BadTagFilter.py
+++ b/python/ql/src/Security/CWE-116/examples/BadTagFilter.py
@ -0,0 +1,8 @@
+import re
+
+def filterScriptTags(content): 
+    oldContent = ""
+    while oldContent != content:
+        oldContent = content
+        content = re.sub(r'<script.*?>.*?</script>', '', content, flags= re.DOTALL | re.IGNORECASE)
+    return content
--- a/python/ql/test/query-tests/Security/CWE-116-BadTagFilter/BadTagFilter.expected
+++ b/python/ql/test/query-tests/Security/CWE-116-BadTagFilter/BadTagFilter.expected
@ -0,0 +1,16 @@
+| tst.py:4:20:4:43 | <script.*?>.*?<\\/script> | This regular expression does not match script end tags like </script >. |
+| tst.py:5:20:5:43 | <script.*?>.*?<\\/script> | This regular expression does not match script end tags like </script >. |
+| tst.py:9:20:9:30 | <!--.*--!?> | This regular expression does not match comments containing newlines. |
+| tst.py:10:20:10:53 | <script.*?>(.\|\\s)*?<\\/script[^>]*> | This regular expression matches <script></script>, but not <script \\n></script> |
+| tst.py:11:20:11:51 | <script[^>]*?>.*?<\\/script[^>]*> | This regular expression matches <script>...</script>, but not <script >...\\n</script> |
+| tst.py:12:20:12:58 | <script(\\s\|\\w\|=\|")*?>.*?<\\/script[^>]*> | This regular expression does not match script tags where the attribute uses single-quotes. |
+| tst.py:13:20:13:58 | <script(\\s\|\\w\|=\|')*?>.*?<\\/script[^>]*> | This regular expression does not match script tags where the attribute uses double-quotes. |
+| tst.py:14:20:14:62 | <script( \|\\n\|\\w\|=\|'\|")*?>.*?<\\/script[^>]*> | This regular expression does not match script tags where tabs are used between attributes. |
+| tst.py:15:20:15:48 | <script.*?>.*?<\\/script[^>]*> | This regular expression does not match upper case <SCRIPT> tags. |
+| tst.py:16:20:16:66 | <(script\|SCRIPT).*?>.*?<\\/(script\|SCRIPT)[^>]*> | This regular expression does not match mixed case <sCrIpT> tags. |
+| tst.py:17:20:17:53 | <script[^>]*?>[\\s\\S]*?<\\/script.*> | This regular expression does not match script end tags like </script\\t\\n bar>. |
+| tst.py:19:20:19:54 | <script\\b[^>]*>([\\s\\S]*?)<\\/script> | This regular expression does not match script end tags like </script >. |
+| tst.py:20:20:20:62 | <(?:!--([\\S\|\\s]*?)-->)\|([^\\/\\s>]+)[\\S\\s]*?> | Comments ending with --> are matched differently from comments ending with --!>. The first is matched with capture group 1 and comments ending with --!> are matched with capture group 2. |
+| tst.py:21:20:21:161 | <(?:(?:\\/([^>]+)>)\|(?:!--([\\S\|\\s]*?)-->)\|(?:([^\\/\\s>]+)((?:\\s+[\\w\\-:.]+(?:\\s*=\\s*?(?:(?:"[^"]*")\|(?:'[^']*')\|[^\\s"'\\/>]+))?)*)[\\S\\s]*?(\\/?)>)) | Comments ending with --> are matched differently from comments ending with --!>. The first is matched with capture group 2 and comments ending with --!> are matched with capture group 3, 4. |
+| tst.py:22:17:22:71 | (<[a-z\\/!$]("[^"]*"\|'[^']*'\|[^'">])*>\|<!(--.*?--\\s*)+>) | Comments ending with --> are matched differently from comments ending with --!>. The first is matched with capture group 3 and comments ending with --!> are matched with capture group 1. |
+| tst.py:23:20:23:263 | <(?:(?:!--([\\w\\W]*?)-->)\|(?:!\\[CDATA\\[([\\w\\W]*?)\\]\\]>)\|(?:!DOCTYPE([\\w\\W]*?)>)\|(?:\\?([^\\s\\/<>]+) ?([\\w\\W]*?)[?/]>)\|(?:\\/([A-Za-z][A-Za-z0-9\\-_\\:\\.]*)>)\|(?:([A-Za-z][A-Za-z0-9\\-_\\:\\.]*)((?:\\s+[^"'>]+(?:(?:"[^"]*")\|(?:'[^']*')\|[^>]*))*\|\\/\|\\s+)>)) | This regular expression only parses --> (capture group 1) and not --!> as a HTML comment end tag. |
--- a/python/ql/test/query-tests/Security/CWE-116-BadTagFilter/BadTagFilter.qlref
+++ b/python/ql/test/query-tests/Security/CWE-116-BadTagFilter/BadTagFilter.qlref
@ -0,0 +1 @@
+Security/CWE-116/BadTagFilter.ql
--- a/python/ql/test/query-tests/Security/CWE-116-BadTagFilter/tst.py
+++ b/python/ql/test/query-tests/Security/CWE-116-BadTagFilter/tst.py
@ -0,0 +1,26 @@
+import re
+
+filters = [
+    re.compile(r"""<script.*?>.*?<\/script>""", re.IGNORECASE), # NOT OK - doesn't match newlines or `</script >`
+    re.compile(r"""<script.*?>.*?<\/script>""", re.IGNORECASE | re.DOTALL), # NOT OK - doesn't match `</script >`
+    re.compile(r"""<script.*?>.*?<\/script[^>]*>""", re.IGNORECASE | re.DOTALL), # OK
+    re.compile(r"""<!--.*-->""", re.IGNORECASE | re.DOTALL), # OK - we don't care regexps that only match comments
+    re.compile(r"""<!--.*--!?>""", re.IGNORECASE | re.DOTALL), # OK
+    re.compile(r"""<!--.*--!?>""", re.IGNORECASE), # NOT OK, does not match newlines
+    re.compile(r"""<script.*?>(.|\s)*?<\/script[^>]*>""", re.IGNORECASE), # NOT OK - doesn't match inside the script tag
+    re.compile(r"""<script[^>]*?>.*?<\/script[^>]*>""", re.IGNORECASE), # NOT OK - doesn't match newlines inside the content
+    re.compile(r"""<script(\s|\w|=|")*?>.*?<\/script[^>]*>""", re.IGNORECASE | re.DOTALL), # NOT OK - does not match single quotes for attribute values
+    re.compile(r"""<script(\s|\w|=|')*?>.*?<\/script[^>]*>""", re.IGNORECASE | re.DOTALL), # NOT OK - does not match double quotes for attribute values
+    re.compile(r"""<script( |\n|\w|=|'|")*?>.*?<\/script[^>]*>""", re.IGNORECASE | re.DOTALL), # NOT OK - does not match tabs between attributes
+    re.compile(r"""<script.*?>.*?<\/script[^>]*>""", re.DOTALL), # NOT OK - does not match uppercase SCRIPT tags
+    re.compile(r"""<(script|SCRIPT).*?>.*?<\/(script|SCRIPT)[^>]*>""", re.DOTALL), # NOT OK - does not match mixed case script tags
+    re.compile(r"""<script[^>]*?>[\s\S]*?<\/script.*>""", re.IGNORECASE), # NOT OK - doesn't match newlines in the end tag
+    re.compile(r"""<script[^>]*?>[\s\S]*?<\/script[^>]*?>""", re.IGNORECASE), # OK
+    re.compile(r"""<script\b[^>]*>([\s\S]*?)<\/script>""", re.IGNORECASE | re.DOTALL), # NOT OK - too strict matching on the end tag
+    re.compile(r"""<(?:!--([\S|\s]*?)-->)|([^\/\s>]+)[\S\s]*?>"""), # NOT OK - doesn't match comments with the right capture groups
+    re.compile(r"""<(?:(?:\/([^>]+)>)|(?:!--([\S|\s]*?)-->)|(?:([^\/\s>]+)((?:\s+[\w\-:.]+(?:\s*=\s*?(?:(?:"[^"]*")|(?:'[^']*')|[^\s"'\/>]+))?)*)[\S\s]*?(\/?)>))"""), # NOT OK - capture groups
+	re.compile(r"""(<[a-z\/!$]("[^"]*"|'[^']*'|[^'">])*>|<!(--.*?--\s*)+>)""", re.IGNORECASE), # NOT OK - capture groups
+    re.compile(r"""<(?:(?:!--([\w\W]*?)-->)|(?:!\[CDATA\[([\w\W]*?)\]\]>)|(?:!DOCTYPE([\w\W]*?)>)|(?:\?([^\s\/<>]+) ?([\w\W]*?)[?/]>)|(?:\/([A-Za-z][A-Za-z0-9\-_\:\.]*)>)|(?:([A-Za-z][A-Za-z0-9\-_\:\.]*)((?:\s+[^"'>]+(?:(?:"[^"]*")|(?:'[^']*')|[^>]*))*|\/|\s+)>))"""), # NOT OK - capture groups
+]
+
+doFilters(filters)
--- a/python/ql/test/query-tests/Security/CWE-730-ReDoS/ReDoS.expected
+++ b/python/ql/test/query-tests/Security/CWE-730-ReDoS/ReDoS.expected
@ -35,6 +35,9 @@
 | redos.py:139:25:139:31 | (\\w\|G)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'G'. |
 | redos.py:145:25:145:32 | (\\d\|\\w)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '0'. |
 | redos.py:148:25:148:31 | (\\d\|5)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '5'. |
+| redos.py:151:25:151:34 | (\\s\|[\\f])* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\u000c'. |
+| redos.py:154:25:154:38 | (\\s\|[\\v]\|\\\\v)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\u000b'. |
+| redos.py:157:25:157:34 | (\\f\|[\\f])* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\u000c'. |
 | redos.py:160:25:160:32 | (\\W\|\\D)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of ' '. |
 | redos.py:163:25:163:32 | (\\S\|\\w)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '0'. |
 | redos.py:166:25:166:34 | (\\S\|[\\w])* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '0'. |
--- a/ruby/ql/lib/codeql/ruby/ast/Literal.qll
+++ b/ruby/ql/lib/codeql/ruby/ast/Literal.qll
@ -1,5 +1,5 @@
 private import codeql.ruby.AST
-private import codeql.ruby.regexp.RegExpTreeView as RETV
+private import codeql.ruby.security.performance.RegExpTreeView as RETV
 private import internal.AST
 private import internal.Scope
 private import internal.TreeSitter
--- a/ruby/ql/lib/codeql/ruby/printAst.qll
+++ b/ruby/ql/lib/codeql/ruby/printAst.qll
@ -7,7 +7,7 @@
 */

 private import AST
-private import codeql.ruby.regexp.RegExpTreeView as RETV
+private import codeql.ruby.security.performance.RegExpTreeView as RETV

 /** Holds if `n` appears in the desugaring of some other node. */
 predicate isDesugared(AstNode n) {
--- a/ruby/ql/lib/codeql/ruby/security/BadTagFilterQuery.qll
+++ b/ruby/ql/lib/codeql/ruby/security/BadTagFilterQuery.qll
@ -0,0 +1,306 @@
+/**
+ * Provides predicates for reasoning about bad tag filter vulnerabilities.
+ */
+
+import performance.ReDoSUtil
+
+/**
+ * A module for determining if a regexp matches a given string,
+ * and reasoning about which capture groups are filled by a given string.
+ */
+private module RegexpMatching {
+  /**
+   * A class to test whether a regular expression matches a string.
+   * Override this class and extend `test`/`testWithGroups` to configure which strings should be tested for acceptance by this regular expression.
+   * The result can afterwards be read from the `matches` predicate.
+   *
+   * Strings in the `testWithGroups` predicate are also tested for which capture groups are filled by the given string.
+   * The result is available in the `fillsCaptureGroup` predicate.
+   */
+  abstract class MatchedRegExp extends RegExpTerm {
+    MatchedRegExp() { this.isRootTerm() }
+
+    /**
+     * Holds if it should be tested whether this regular expression matches `str`.
+     *
+     * If `ignorePrefix` is true, then a regexp without a start anchor will be treated as if it had a start anchor.
+     * E.g. a regular expression `/foo$/` will match any string that ends with "foo",
+     * but if `ignorePrefix` is true, it will only match "foo".
+     */
+    predicate test(string str, boolean ignorePrefix) {
+      none() // may be overridden in subclasses
+    }
+
+    /**
+     * Same as `test(..)`, but where the `fillsCaptureGroup` predicate afterwards tells which capture groups were filled by the given string.
+     */
+    predicate testWithGroups(string str, boolean ignorePrefix) {
+      none() // may be overridden in subclasses
+    }
+
+    /**
+     * Holds if this RegExp matches `str`, where `str` is either in the `test` or `testWithGroups` predicate.
+     */
+    final predicate matches(string str) {
+      exists(State state | state = getAState(this, str.length() - 1, str, _) |
+        epsilonSucc*(state) = Accept(_)
+      )
+    }
+
+    /**
+     * Holds if matching `str` may fill capture group number `g`.
+     * Only holds if `str` is in the `testWithGroups` predicate.
+     */
+    final predicate fillsCaptureGroup(string str, int g) {
+      exists(State s |
+        s = getAStateThatReachesAccept(this, _, str, _) and
+        g = group(s.getRepr())
+      )
+    }
+  }
+
+  /**
+   * Gets a state the regular expression `reg` can be in after matching the `i`th char in `str`.
+   * The regular expression is modelled as a non-deterministic finite automaton,
+   * the regular expression can therefore be in multiple states after matching a character.
+   *
+   * It's a forward search to all possible states, and there is thus no guarantee that the state is on a path to an accepting state.
+   */
+  private State getAState(MatchedRegExp reg, int i, string str, boolean ignorePrefix) {
+    // start state, the -1 position before any chars have been matched
+    i = -1 and
+    (
+      reg.test(str, ignorePrefix)
+      or
+      reg.testWithGroups(str, ignorePrefix)
+    ) and
+    result.getRepr().getRootTerm() = reg and
+    isStartState(result)
+    or
+    // recursive case
+    result = getAStateAfterMatching(reg, _, str, i, _, ignorePrefix)
+  }
+
+  /**
+   * Gets the next state after the `prev` state from `reg`.
+   * `prev` is a state after matching the char at index `fromIndex` in `str` (`-1` if no char has been matched yet),
+   * and the result is a state after matching the char at index `toIndex` in `str`.
+   *
+   * This predicate is used as a step relation in the forwards search (`getAState`),
+   * and also as a step relation in the later backwards search (`getAStateThatReachesAccept`).
+   */
+  private State getAStateAfterMatching(
+    MatchedRegExp reg, State prev, string str, int toIndex, int fromIndex, boolean ignorePrefix
+  ) {
+    // the basic recursive case - outlined into a noopt helper to make performance work out.
+    result = getAStateAfterMatchingAux(reg, prev, str, toIndex, fromIndex, ignorePrefix)
+    or
+    // we can skip past word boundaries if the next char is a non-word char.
+    fromIndex = toIndex and // this step does not advance the position in `str`
+    prev.getRepr() instanceof RegExpWordBoundary and
+    prev = getAState(reg, toIndex, str, ignorePrefix) and
+    after(prev.getRepr()) = result and
+    str.charAt(toIndex + 1).regexpMatch("\\W") // \W matches any non-word char.
+  }
+
+  pragma[noopt]
+  private State getAStateAfterMatchingAux(
+    MatchedRegExp reg, State prev, string str, int toIndex, int fromIndex, boolean ignorePrefix
+  ) {
+    prev = getAState(reg, fromIndex, str, ignorePrefix) and // `prev` was found by the forward search at `fromIndex`
+    fromIndex = toIndex - 1 and // exactly one char is consumed by this step
+    exists(string char | char = str.charAt(toIndex) | specializedDeltaClosed(prev, char, result)) and
+    not discardedPrefixStep(prev, result, ignorePrefix) // drop the steps excluded when `ignorePrefix` is true
+  }
+
+  /** Holds if a step from `prev` to `next` should be discarded when the `ignorePrefix` flag is set. Only holds for `ignorePrefix = true`. */
+  private predicate discardedPrefixStep(State prev, State next, boolean ignorePrefix) {
+    prev = mkMatch(any(RegExpRoot r)) and
+    ignorePrefix = true and
+    next = prev // only the step from a root term's match state back to itself is discarded
+  }
+
+  // The `deltaClosed` relation specialized to the chars that exist in strings tested by a `MatchedRegExp`.
+  private predicate specializedDeltaClosed(State prev, string char, State next) {
+    deltaClosed(prev, specializedGetAnInputSymbolMatching(char), next)
+  }
+
+  // The `getAnInputSymbolMatching` relation specialized to the chars that exist in strings tested by a `MatchedRegExp`.
+  pragma[noinline]
+  private InputSymbol specializedGetAnInputSymbolMatching(string char) {
+    exists(string s, MatchedRegExp r |
+      r.test(s, _)
+      or
+      r.testWithGroups(s, _)
+    |
+      char = s.charAt(_)
+    ) and
+    result = getAnInputSymbolMatching(char)
+  }
+
+  /**
+   * Gets the `i`th state on a path to the accepting state when `reg` matches `str`.
+   * Starts with an accepting state as found by `getAState` and searches backwards
+   * to the start state through the reachable states (as found by `getAState`).
+   *
+   * This predicate maintains the invariant that the result state can be reached with `i` steps from a start state,
+   * and an accepting state can be found after (`str.length() - 1 - i`) steps from the result.
+   * The result state is therefore always on a valid path where `reg` accepts `str`.
+   *
+   * This predicate is only used to find which capture groups a regular expression has filled,
+   * and thus the search is only performed for the strings in the `testWithGroups(..)` predicate.
+   */
+  private State getAStateThatReachesAccept(
+    MatchedRegExp reg, int i, string str, boolean ignorePrefix
+  ) {
+    // base case, reaches an accepting state from the last state in `getAState(..)`
+    reg.testWithGroups(str, ignorePrefix) and
+    i = str.length() - 1 and
+    result = getAState(reg, i, str, ignorePrefix) and
+    epsilonSucc*(result) = Accept(_)
+    or
+    // recursive case. `next` is a state already known to reach accept, and `prev` is a state that steps to `next`.
+    // this predicate is doing a backwards search, so `prev` is the result we are looking for.
+    exists(State next, State prev, int fromIndex, int toIndex |
+      next = getAStateThatReachesAccept(reg, toIndex, str, ignorePrefix) and
+      next = getAStateAfterMatching(reg, prev, str, toIndex, fromIndex, ignorePrefix) and
+      i = fromIndex and
+      result = prev
+    )
+  }
+
+  /** Gets the number of a capture group that is `term` or an ancestor of `term`; may have several results. */
+  private int group(RegExpTerm term) {
+    exists(RegExpGroup grp | grp.getNumber() = result | term.getParent*() = grp)
+  }
+}
+
+/** A class to test whether a regular expression matches certain HTML tags. */
+class HTMLMatchingRegExp extends RegexpMatching::MatchedRegExp {
+  HTMLMatchingRegExp() {
+    // the regexp must contain both a "<" and a ">" in some constant term.
+    forall(string angleBracket | angleBracket = ["<", ">"] |
+      any(RegExpConstant term | term.getValue().matches("%" + angleBracket + "%")).getRootTerm() =
+        this
+    )
+  }
+
+  override predicate testWithGroups(string str, boolean ignorePrefix) {
+    ignorePrefix = true and
+    str = ["<!-- foo -->", "<!-- foo --!>", "<!- foo ->", "<foo>", "<script>"]
+  }
+
+  override predicate test(string str, boolean ignorePrefix) {
+    ignorePrefix = true and
+    str =
+      [
+        "<!-- foo -->", "<!- foo ->", "<!-- foo --!>", "<!-- foo\n -->", "<script>foo</script>",
+        "<script \n>foo</script>", "<script >foo\n</script>", "<foo ></foo>", "<foo>",
+        "<foo src=\"foo\"></foo>", "<script>", "<script src=\"foo\"></script>",
+        "<script src='foo'></script>", "<SCRIPT>foo</SCRIPT>", "<script\tsrc=\"foo\"/>",
+        "<script\tsrc='foo'></script>", "<sCrIpT>foo</ScRiPt>", "<script src=\"foo\">foo</script >",
+        "<script src=\"foo\">foo</script foo=\"bar\">", "<script src=\"foo\">foo</script\t\n bar>"
+      ]
+  }
+}
+
+/**
+ * Holds if `regexp` matches some HTML tags, but misses some HTML tags that it should match.
+ *
+ * When adding a new case to this predicate, make sure the test strings used in `matches(..)` calls are present in `HTMLMatchingRegExp::test` / `HTMLMatchingRegExp::testWithGroups`.
+ */
+predicate isBadRegexpFilter(HTMLMatchingRegExp regexp, string msg) {
+  // CVE-2021-33829 - matching both "<!-- foo -->" and "<!-- foo --!>", but in different capture groups
+  regexp.matches("<!-- foo -->") and
+  regexp.matches("<!-- foo --!>") and
+  exists(int a, int b | a != b |
+    regexp.fillsCaptureGroup("<!-- foo -->", a) and
+    // <!-- foo --> might be ambiguously parsed (matching both capture groups), and that is ok here.
+    regexp.fillsCaptureGroup("<!-- foo --!>", b) and
+    not regexp.fillsCaptureGroup("<!-- foo --!>", a) and
+    msg =
+      "Comments ending with --> are matched differently from comments ending with --!>. The first is matched with capture group "
+        + a + " and comments ending with --!> are matched with capture group " +
+        strictconcat(int i | regexp.fillsCaptureGroup("<!-- foo --!>", i) | i.toString(), ", ") +
+        "."
+  )
+  or
+  // CVE-2020-17480 - matching "<!-- foo -->" and other tags, but not "<!-- foo --!>".
+  exists(int group, int other |
+    group != other and
+    regexp.fillsCaptureGroup("<!-- foo -->", group) and
+    regexp.fillsCaptureGroup("<foo>", other) and
+    not regexp.matches("<!-- foo --!>") and
+    not regexp.fillsCaptureGroup("<!-- foo -->", any(int i | i != group)) and
+    not regexp.fillsCaptureGroup("<!- foo ->", group) and
+    not regexp.fillsCaptureGroup("<foo>", group) and
+    not regexp.fillsCaptureGroup("<script>", group) and
+    msg =
+      "This regular expression only parses --> (capture group " + group +
+        ") and not --!> as a HTML comment end tag."
+  )
+  or // comment filters that miss comments containing a newline
+  regexp.matches("<!-- foo -->") and
+  not regexp.matches("<!-- foo\n -->") and
+  not regexp.matches("<!- foo ->") and
+  not regexp.matches("<foo>") and
+  not regexp.matches("<script>") and
+  msg = "This regular expression does not match comments containing newlines."
+  or // script filters that miss newlines inside the start tag or inside the script content
+  regexp.matches("<script>foo</script>") and
+  regexp.matches("<script src=\"foo\"></script>") and
+  not regexp.matches("<foo ></foo>") and
+  (
+    not regexp.matches("<script \n>foo</script>") and
+    msg = "This regular expression matches <script></script>, but not <script \\n></script>"
+    or
+    not regexp.matches("<script >foo\n</script>") and
+    msg = "This regular expression matches <script>...</script>, but not <script >...\\n</script>"
+  )
+  or // script filters that accept double-quoted but not single-quoted attributes
+  regexp.matches("<script>foo</script>") and
+  regexp.matches("<script src=\"foo\"></script>") and
+  not regexp.matches("<script src='foo'></script>") and
+  not regexp.matches("<foo>") and
+  msg = "This regular expression does not match script tags where the attribute uses single-quotes."
+  or // script filters that accept single-quoted but not double-quoted attributes
+  regexp.matches("<script>foo</script>") and
+  regexp.matches("<script src='foo'></script>") and
+  not regexp.matches("<script src=\"foo\"></script>") and
+  not regexp.matches("<foo>") and
+  msg = "This regular expression does not match script tags where the attribute uses double-quotes."
+  or // script filters that miss a tab between the tag name and an attribute
+  regexp.matches("<script>foo</script>") and
+  regexp.matches("<script src='foo'></script>") and
+  not regexp.matches("<script\tsrc='foo'></script>") and
+  not regexp.matches("<foo>") and
+  not regexp.matches("<foo src=\"foo\"></foo>") and
+  msg = "This regular expression does not match script tags where tabs are used between attributes."
+  or // case-sensitive script filters that miss upper case or mixed case tag names
+  regexp.matches("<script>foo</script>") and
+  not RegExpFlags::isIgnoreCase(regexp) and
+  not regexp.matches("<foo>") and
+  not regexp.matches("<foo ></foo>") and
+  (
+    not regexp.matches("<SCRIPT>foo</SCRIPT>") and
+    msg = "This regular expression does not match upper case <SCRIPT> tags."
+    or
+    not regexp.matches("<sCrIpT>foo</ScRiPt>") and
+    regexp.matches("<SCRIPT>foo</SCRIPT>") and
+    msg = "This regular expression does not match mixed case <sCrIpT> tags."
+  )
+  or // script filters that miss end tags containing whitespace or attributes
+  regexp.matches("<script src=\"foo\"></script>") and
+  not regexp.matches("<foo>") and
+  not regexp.matches("<foo ></foo>") and
+  (
+    not regexp.matches("<script src=\"foo\">foo</script >") and
+    msg = "This regular expression does not match script end tags like </script >."
+    or
+    not regexp.matches("<script src=\"foo\">foo</script foo=\"bar\">") and
+    msg = "This regular expression does not match script end tags like </script foo=\"bar\">."
+    or
+    not regexp.matches("<script src=\"foo\">foo</script\t\n bar>") and
+    msg = "This regular expression does not match script end tags like </script\\t\\n bar>."
+  )
+}
--- a/ruby/ql/lib/codeql/ruby/security/performance/ExponentialBackTracking.qll
+++ b/ruby/ql/lib/codeql/ruby/security/performance/ExponentialBackTracking.qll
--- a/ruby/ql/lib/codeql/ruby/security/performance/ParseRegExp.qll
+++ b/ruby/ql/lib/codeql/ruby/security/performance/ParseRegExp.qll
--- a/ruby/ql/lib/codeql/ruby/security/performance/PolynomialReDoSCustomizations.qll
+++ b/ruby/ql/lib/codeql/ruby/security/performance/PolynomialReDoSCustomizations.qll
@ -8,9 +8,9 @@ private import codeql.ruby.AST as AST
 private import codeql.ruby.CFG
 private import codeql.ruby.DataFlow
 private import codeql.ruby.dataflow.RemoteFlowSources
-private import codeql.ruby.regexp.ParseRegExp as RegExp
-private import codeql.ruby.regexp.RegExpTreeView
-private import codeql.ruby.regexp.SuperlinearBackTracking
+private import codeql.ruby.security.performance.ParseRegExp as RegExp
+private import codeql.ruby.security.performance.RegExpTreeView
+private import codeql.ruby.security.performance.SuperlinearBackTracking

 module PolynomialReDoS {
  /**
--- a/ruby/ql/lib/codeql/ruby/security/performance/PolynomialReDoSQuery.qll
+++ b/ruby/ql/lib/codeql/ruby/security/performance/PolynomialReDoSQuery.qll
--- a/ruby/ql/lib/codeql/ruby/security/performance/ReDoSUtil.qll
+++ b/ruby/ql/lib/codeql/ruby/security/performance/ReDoSUtil.qll
@ -140,12 +140,10 @@ class RegExpRoot extends RegExpTerm {
  predicate isRelevant() {
    // there is at least one repetition
    getRoot(any(InfiniteRepetitionQuantifier q)) = this and
-    // there are no lookbehinds
-    not exists(RegExpLookbehind lbh | getRoot(lbh) = this) and
    // is actually used as a RegExp
-    this.isUsedAsRegExp() //and
-    // // pragmatic performance optimization: ignore minified files.
-    // not getRootTerm().getParent().(Expr).getTopLevel().isMinified()
+    isUsedAsRegExp() and
+    // not excluded for library specific reasons
+    not isExcluded(getRootTerm().getParent())
  }
 }

@ -156,38 +154,68 @@ private class RegexpCharacterConstant extends RegExpConstant {
  RegexpCharacterConstant() { this.isCharacter() }
 }

+/**
+ * A regexp term that is relevant for this ReDoS analysis, that is, a term whose root `isRelevant()`.
+ */
+class RelevantRegExpTerm extends RegExpTerm {
+  RelevantRegExpTerm() { getRoot(this).isRelevant() }
+}
+
 /**
 * Holds if `term` is the chosen canonical representative for all terms with string representation `str`.
+ * The string representation includes which flags are used with the regular expression.
 *
 * Using canonical representatives gives a huge performance boost when working with tuples containing multiple `InputSymbol`s.
 * The number of `InputSymbol`s is decreased by 3 orders of magnitude or more in some larger benchmarks.
 */
-private predicate isCanonicalTerm(RegExpTerm term, string str) {
+private predicate isCanonicalTerm(RelevantRegExpTerm term, string str) {
  term =
-    rank[1](RegExpTerm t, Location loc, File file |
+    min(RelevantRegExpTerm t, Location loc, File file |
      loc = t.getLocation() and
      file = t.getFile() and
-      str = t.getRawValue()
+      str = t.getRawValue() + "|" + getCanonicalizationFlags(t.getRootTerm())
    |
      t order by t.getFile().getRelativePath(), loc.getStartLine(), loc.getStartColumn()
    )
 }

+/**
+ * Gets a string representation of the flags used with the regular expression.
+ * Only the flags that are relevant for the canonicalization are included (currently only "i" for ignore-case).
+ */
+string getCanonicalizationFlags(RegExpTerm root) {
+  root.isRootTerm() and
+  (if RegExpFlags::isIgnoreCase(root) then result = "i" else result = "")
+}
+
 /**
 * An abstract input symbol, representing a set of concrete characters.
 */
 private newtype TInputSymbol =
  /** An input symbol corresponding to character `c`. */
  Char(string c) {
-    c = any(RegexpCharacterConstant cc | getRoot(cc).isRelevant()).getValue().charAt(_)
+    c =
+      any(RegexpCharacterConstant cc |
+        cc instanceof RelevantRegExpTerm and
+        not RegExpFlags::isIgnoreCase(cc.getRootTerm())
+      ).getValue().charAt(_)
+    or
+    // normalize everything to lower case if the regexp is case insensitive
+    c =
+      any(RegexpCharacterConstant cc, string char |
+        cc instanceof RelevantRegExpTerm and
+        RegExpFlags::isIgnoreCase(cc.getRootTerm()) and
+        char = cc.getValue().charAt(_)
+      |
+        char.toLowerCase()
+      )
  } or
  /**
   * An input symbol representing all characters matched by
   * a (non-universal) character class that has string representation `charClassString`.
   */
  CharClass(string charClassString) {
-    exists(RegExpTerm term | term.getRawValue() = charClassString | getRoot(term).isRelevant()) and
-    exists(RegExpTerm recc | isCanonicalTerm(recc, charClassString) |
+    exists(RelevantRegExpTerm recc | isCanonicalTerm(recc, charClassString) |
      recc instanceof RegExpCharacterClass and
      not recc.(RegExpCharacterClass).isUniversalClass()
      or
@ -254,7 +282,7 @@ class InputSymbol extends TInputSymbol {
 /**
 * An abstract input symbol that represents a character class.
 */
-abstract private class CharacterClass extends InputSymbol {
+abstract class CharacterClass extends InputSymbol {
  /**
   * Gets a character that is relevant for intersection-tests involving this
   * character class.
@ -277,7 +305,7 @@ abstract private class CharacterClass extends InputSymbol {
  /**
   * Gets a character matched by this character class.
   */
-  string choose() { result = this.getARelevantChar() and this.matches(result) }
+  string choose() { result = getARelevantChar() and matches(result) }
 }

 /**
@ -289,6 +317,19 @@ private module CharacterClasses {
   */
  pragma[noinline]
  predicate hasChildThatMatches(RegExpCharacterClass cc, string char) {
+    if RegExpFlags::isIgnoreCase(cc.getRootTerm())
+    then
+      // normalize everything to lower case if the regexp is case insensitive
+      exists(string c | hasChildThatMatchesIgnoringCasingFlags(cc, c) | char = c.toLowerCase())
+    else hasChildThatMatchesIgnoringCasingFlags(cc, char)
+  }
+
+  /**
+   * Holds if the character class `cc` has a child (constant or range) that matches `char`.
+   * Ignores whether the character class is inside a regular expression that has the ignore case flag.
+   */
+  pragma[noinline]
+  predicate hasChildThatMatchesIgnoringCasingFlags(RegExpCharacterClass cc, string char) {
    exists(getCanonicalCharClass(cc)) and
    exists(RegExpTerm child | child = cc.getAChild() |
      char = child.(RegexpCharacterConstant).getValue()
@ -433,7 +474,7 @@ private module CharacterClasses {
    char = "0123456789".charAt(_)
    or
    clazz = "s" and
-    char = [" ", "\t", "\r", "\n", 11.toUnicode(), 12.toUnicode()] // 11.toUnicode() = \v, 12.toUnicode() = \f'
+    char = [" ", "\t", "\r", "\n", 11.toUnicode(), 12.toUnicode()] // 11.toUnicode() = \v, 12.toUnicode() = \f
    or
    clazz = "w" and
    char = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_".charAt(_)
@ -477,7 +518,7 @@ private module CharacterClasses {
      result = ["0", "9"]
      or
      cc.getValue() = "s" and
-      result = [" "]
+      result = " "
      or
      cc.getValue() = "w" and
      result = ["a", "Z", "_", "0", "9"]
@ -490,7 +531,7 @@ private module CharacterClasses {
      result = "9"
      or
      cc.getValue() = "s" and
-      result = [" "]
+      result = " "
      or
      cc.getValue() = "w" and
      result = "a"
@ -604,7 +645,7 @@ private State before(RegExpTerm t) { result = Match(t, 0) }
 /**
 * Gets a state the NFA may be in after matching `t`.
 */
-private State after(RegExpTerm t) {
+State after(RegExpTerm t) {
  exists(RegExpAlt alt | t = alt.getAChild() | result = after(alt))
  or
  exists(RegExpSequence seq, int i | t = seq.getChild(i) |
@ -633,7 +674,14 @@ private State after(RegExpTerm t) {
 predicate delta(State q1, EdgeLabel lbl, State q2) {
  exists(RegexpCharacterConstant s, int i |
    q1 = Match(s, i) and
-    lbl = Char(s.getValue().charAt(i)) and
+    (
+      not RegExpFlags::isIgnoreCase(s.getRootTerm()) and
+      lbl = Char(s.getValue().charAt(i))
+      or
+      // normalize everything to lower case if the regexp is case insensitive
+      RegExpFlags::isIgnoreCase(s.getRootTerm()) and
+      exists(string c | c = s.getValue().charAt(i) | lbl = Char(c.toLowerCase()))
+    ) and
    (
      q2 = Match(s, i + 1)
      or
@ -643,20 +691,20 @@ predicate delta(State q1, EdgeLabel lbl, State q2) {
  )
  or
  exists(RegExpDot dot | q1 = before(dot) and q2 = after(dot) |
-    if dot.getLiteral().isDotAll() then lbl = Any() else lbl = Dot()
+    if RegExpFlags::isDotAll(dot.getRootTerm()) then lbl = Any() else lbl = Dot()
  )
  or
  exists(RegExpCharacterClass cc |
    cc.isUniversalClass() and q1 = before(cc) and lbl = Any() and q2 = after(cc)
    or
    q1 = before(cc) and
-    lbl = CharClass(cc.getRawValue()) and
+    lbl = CharClass(cc.getRawValue() + "|" + getCanonicalizationFlags(cc.getRootTerm())) and
    q2 = after(cc)
  )
  or
  exists(RegExpCharacterClassEscape cc |
    q1 = before(cc) and
-    lbl = CharClass(cc.getRawValue()) and
+    lbl = CharClass(cc.getRawValue() + "|" + getCanonicalizationFlags(cc.getRootTerm())) and
    q2 = after(cc)
  )
  or
@ -729,16 +777,27 @@ RegExpRoot getRoot(RegExpTerm term) {
  result = getRoot(term.getParent())
 }

-private newtype TState =
-  Match(RegExpTerm t, int i) {
-    getRoot(t).isRelevant() and
-    (
-      i = 0
-      or
-      exists(t.(RegexpCharacterConstant).getValue().charAt(i))
-    )
+/**
+ * A state in the NFA.
+ */
+newtype TState =
+  /**
+   * A state representing that the NFA is about to match a term.
+   * `i` is used to index into multi-char literals.
+   */
+  Match(RelevantRegExpTerm t, int i) {
+    i = 0
+    or
+    exists(t.(RegexpCharacterConstant).getValue().charAt(i))
  } or
+  /**
+   * An accept state, where exactly the given input string is accepted.
+   */
  Accept(RegExpRoot l) { l.isRelevant() } or
+  /**
+   * An accept state, where the given input string, or any string that has this
+   * string as a prefix, is accepted.
+   */
  AcceptAnySuffix(RegExpRoot l) { l.isRelevant() }

 /**
@ -851,29 +910,26 @@ InputSymbol getAnInputSymbolMatching(string char) {
  result = Any()
 }

+/**
+ * Holds if `state` is a start state: the match state of a root term, or the state after a `RegExpCaret`.
+ */
+predicate isStartState(State state) {
+  state = mkMatch(any(RegExpRoot r))
+  or
+  exists(RegExpCaret car | state = after(car))
+}
+
 /**
 * Predicates for constructing a prefix string that leads to a given state.
 */
 private module PrefixConstruction {
-  /**
-   * Holds if `state` starts the string matched by the regular expression.
-   */
-  private predicate isStartState(State state) {
-    state instanceof StateInPumpableRegexp and
-    (
-      state = Match(any(RegExpRoot r), _)
-      or
-      exists(RegExpCaret car | state = after(car))
-    )
-  }
-
  /**
   * Holds if `state` is the textually last start state for the regular expression.
   */
  private predicate lastStartState(State state) {
    exists(RegExpRoot root |
      state =
-        max(State s, Location l |
+        max(StateInPumpableRegexp s, Location l |
          isStartState(s) and getRoot(s.getRepr()) = root and l = s.getRepr().getLocation()
        |
          s
@ -1173,7 +1229,6 @@ private predicate isReDoSAttackable(RegExpTerm term, string pump, State s) {
 * `prefixMsg` contains a friendly message for a prefix that reaches `s` (or `prefixMsg` is the empty string if the prefix is empty or if no prefix could be found).
 */
 predicate hasReDoSResult(RegExpTerm t, string pump, State s, string prefixMsg) {
-  not t.getRegExp().hasFreeSpacingFlag() and // exclude free-spacing mode regexes
  isReDoSAttackable(t, pump, s) and
  (
    prefixMsg = "starting with '" + escape(PrefixConstruction::prefix(s)) + "' and " and
--- a/ruby/ql/lib/codeql/ruby/security/performance/RegExpTreeView.qll
+++ b/ruby/ql/lib/codeql/ruby/security/performance/RegExpTreeView.qll
@ -2,6 +2,42 @@ private import codeql.ruby.ast.Literal as AST
 private import codeql.Locations
 private import ParseRegExp

+/**
+ * Holds if the regular expression that `parent` belongs to should not be considered.
+ */
+predicate isExcluded(RegExpParent parent) {
+  parent.(RegExpTerm).getRegExp().hasFreeSpacingFlag() // exclude free-spacing mode regexes
+}
+
+/**
+ * A module containing predicates for determining which flags a regular expression has.
+ */
+module RegExpFlags {
+  /**
+   * Holds if `root` has the `i` flag for case-insensitive matching.
+   */
+  predicate isIgnoreCase(RegExpTerm root) {
+    root.isRootTerm() and
+    root.getLiteral().isIgnoreCase()
+  }
+
+  /**
+   * Gets the flags for `root`, or the empty string if `root` has no flags.
+   */
+  string getFlags(RegExpTerm root) {
+    root.isRootTerm() and
+    result = root.getLiteral().getFlags()
+  }
+
+  /**
+   * Holds if the literal of `root` reports `isDotAll()`, which `RegExpLiteral` derives from the multi-line flag.
+   */
+  predicate isDotAll(RegExpTerm root) {
+    root.isRootTerm() and
+    root.getLiteral().isDotAll()
+  }
+}
+
 /**
 * An element containing a regular expression term, that is, either
 * a string literal (parsed as a regular expression)
@ -38,6 +74,10 @@ class RegExpLiteral extends TRegExpLiteral, RegExpParent {

  predicate isDotAll() { re.hasMultilineFlag() }

+  predicate isIgnoreCase() { re.hasCaseInsensitiveFlag() }
+
+  string getFlags() { result = re.getFlagString() }
+
  override string getAPrimaryQlClass() { result = "RegExpLiteral" }
 }

@ -398,6 +438,13 @@ private int toHex(string hex) {
  result = 15 and hex = ["f", "F"]
 }

+/**
+ * A word boundary, that is, a regular expression escape of the form `\b`.
+ */
+class RegExpWordBoundary extends RegExpEscape {
+  RegExpWordBoundary() { this.getUnescaped() = "b" }
+}
+
 /**
 * A character class escape in a regular expression.
 * That is, an escaped character that denotes multiple characters.
--- a/ruby/ql/lib/codeql/ruby/security/performance/SuperlinearBackTracking.qll
+++ b/ruby/ql/lib/codeql/ruby/security/performance/SuperlinearBackTracking.qll
--- a/ruby/ql/src/queries/security/cwe-116/BadTagFilter.qhelp
+++ b/ruby/ql/src/queries/security/cwe-116/BadTagFilter.qhelp
@ -0,0 +1,54 @@
+<!DOCTYPE qhelp PUBLIC
+  "-//Semmle//qhelp//EN"
+  "qhelp.dtd">
+<qhelp>
+
+<overview>
+<p>
+It is possible to match some single HTML tags using regular expressions (parsing general HTML using 
+regular expressions is impossible). However, if the regular expression is not written well it might 
+be possible to circumvent it, which can lead to cross-site scripting or other security issues.
+</p>
+<p>
+Some of these mistakes are caused by browsers having very forgiving HTML parsers that
+will often render invalid HTML containing syntax errors.
+Regular expressions that attempt to match HTML should also recognize tags containing such syntax errors.
+</p>
+</overview>
+
+<recommendation>
+<p>
+Use a well-tested sanitization or parser library if at all possible. These libraries are much more
+likely to handle corner cases correctly than a custom implementation.
+</p>
+</recommendation>
+
+<example>
+<p>
+The following example attempts to filter out all <code>&lt;script&gt;</code> tags.
+</p>
+
+<sample src="examples/BadTagFilter.rb" />
+
+<p>
+The above sanitizer does not filter out all <code>&lt;script&gt;</code> tags. 
+Browsers will not only accept <code>&lt;/script&gt;</code> as script end tags, but also tags such as <code>&lt;/script foo="bar"&gt;</code> even though it is a parser error.
+This means that an attack string such as <code>&lt;script&gt;alert(1)&lt;/script foo="bar"&gt;</code> will not be filtered by 
+the function, and <code>alert(1)</code> will be executed by a browser if the string is rendered as HTML.
+</p>
+
+<p>
+Other corner cases include that HTML comments can end with <code>--!&gt;</code>, 
+and that HTML tag names can contain upper case characters.
+</p>
+</example>
+
+<references>
+<li>Securitum: <a href="https://research.securitum.com/the-curious-case-of-copy-paste/">The Curious Case of Copy &amp; Paste</a>.</li>
+<li>stackoverflow.com: <a href="https://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags#answer-1732454">You can't parse [X]HTML with regex</a>.</li>
+<li>HTML Standard: <a href="https://html.spec.whatwg.org/multipage/parsing.html#comment-end-bang-state">Comment end bang state</a>.</li>
+<li>stackoverflow.com: <a href="https://stackoverflow.com/questions/25559999/why-arent-browsers-strict-about-html">Why aren't browsers strict about HTML?</a>.</li>
+</references>
+</qhelp>
+
+
--- a/ruby/ql/src/queries/security/cwe-116/BadTagFilter.ql
+++ b/ruby/ql/src/queries/security/cwe-116/BadTagFilter.ql
@ -0,0 +1,19 @@
+/**
+ * @name Bad HTML filtering regexp
+ * @description Matching HTML tags using regular expressions is hard to do right, and can easily lead to security issues.
+ * @kind problem
+ * @problem.severity warning
+ * @security-severity 7.8
+ * @precision high
+ * @id rb/bad-tag-filter
+ * @tags correctness
+ *       security
+ *       external/cwe/cwe-116
+ *       external/cwe/cwe-020
+ */
+
+import codeql.ruby.security.BadTagFilterQuery
+
+from HTMLMatchingRegExp regexp, string msg
+where msg = min(string m | isBadRegexpFilter(regexp, m) | m order by m.length(), m) // there might be multiple messages; report only the shortest one (ties broken by string order)
+select regexp, msg
--- a/ruby/ql/src/queries/security/cwe-116/examples/BadTagFilter.rb
+++ b/ruby/ql/src/queries/security/cwe-116/examples/BadTagFilter.rb
@ -0,0 +1,8 @@
+def filter_script_tags(html)
+  old_html = ""
+  while (html != old_html)
+    old_html = html
+    html = html.gsub(/<script[^>]*>.*<\/script>/m, "") # BAD: only the exact end tag </script> is matched
+  end
+  html
+end
--- a/ruby/ql/src/queries/security/cwe-1333/PolynomialReDoS.ql
+++ b/ruby/ql/src/queries/security/cwe-1333/PolynomialReDoS.ql
@ -15,8 +15,8 @@

 import DataFlow::PathGraph
 import codeql.ruby.DataFlow
-import codeql.ruby.regexp.PolynomialReDoSQuery
-import codeql.ruby.regexp.SuperlinearBackTracking
+import codeql.ruby.security.performance.PolynomialReDoSQuery
+import codeql.ruby.security.performance.SuperlinearBackTracking

 from
  PolynomialReDoS::Configuration config, DataFlow::PathNode source, DataFlow::PathNode sink,
--- a/ruby/ql/src/queries/security/cwe-1333/ReDoS.ql
+++ b/ruby/ql/src/queries/security/cwe-1333/ReDoS.ql
@ -14,9 +14,9 @@
 *       external/cwe/cwe-400
 */

-import codeql.ruby.regexp.ExponentialBackTracking
-import codeql.ruby.regexp.ReDoSUtil
-import codeql.ruby.regexp.RegExpTreeView
+import codeql.ruby.security.performance.ExponentialBackTracking
+import codeql.ruby.security.performance.ReDoSUtil
+import codeql.ruby.security.performance.RegExpTreeView

 from RegExpTerm t, string pump, State s, string prefixMsg
 where hasReDoSResult(t, pump, s, prefixMsg)
--- a/ruby/ql/test/library-tests/regexp/parse.ql
+++ b/ruby/ql/test/library-tests/regexp/parse.ql
@ -3,7 +3,7 @@
 */

 import codeql.Locations
-import codeql.ruby.regexp.RegExpTreeView as RETV
+import codeql.ruby.security.performance.RegExpTreeView as RETV

 query predicate nodes(RETV::RegExpTerm n, string attr, string val) {
  attr = "semmle.label" and
--- a/ruby/ql/test/query-tests/security/cwe-116/BadTagFilter.expected
+++ b/ruby/ql/test/query-tests/security/cwe-116/BadTagFilter.expected
@ -0,0 +1,14 @@
+| test.rb:2:6:2:29 | <script.*?>.*?<\\/script> | This regular expression does not match script end tags like </script >. |
+| test.rb:3:6:3:29 | <script.*?>.*?<\\/script> | This regular expression does not match script end tags like </script >. |
+| test.rb:7:6:7:16 | <!--.*--!?> | This regular expression does not match comments containing newlines. |
+| test.rb:8:6:8:39 | <script.*?>(.\|\\s)*?<\\/script[^>]*> | This regular expression matches <script></script>, but not <script \\n></script> |
+| test.rb:9:6:9:37 | <script[^>]*?>.*?<\\/script[^>]*> | This regular expression matches <script>...</script>, but not <script >...\\n</script> |
+| test.rb:10:6:10:44 | <script(\\s\|\\w\|=\|")*?>.*?<\\/script[^>]*> | This regular expression does not match script tags where the attribute uses single-quotes. |
+| test.rb:11:6:11:44 | <script(\\s\|\\w\|=\|')*?>.*?<\\/script[^>]*> | This regular expression does not match script tags where the attribute uses double-quotes. |
+| test.rb:12:6:12:48 | <script( \|\\n\|\\w\|=\|'\|")*?>.*?<\\/script[^>]*> | This regular expression does not match script tags where tabs are used between attributes. |
+| test.rb:13:6:13:34 | <script.*?>.*?<\\/script[^>]*> | This regular expression does not match upper case <SCRIPT> tags. |
+| test.rb:14:6:14:52 | <(script\|SCRIPT).*?>.*?<\\/(script\|SCRIPT)[^>]*> | This regular expression does not match mixed case <sCrIpT> tags. |
+| test.rb:15:6:15:39 | <script[^>]*?>[\\s\\S]*?<\\/script.*> | This regular expression does not match script end tags like </script\\t\\n bar>. |
+| test.rb:17:6:17:40 | <script\\b[^>]*>([\\s\\S]*?)<\\/script> | This regular expression does not match script end tags like </script >. |
+| test.rb:18:6:18:48 | <(?:!--([\\S\|\\s]*?)-->)\|([^\\/\\s>]+)[\\S\\s]*?> | Comments ending with --> are matched differently from comments ending with --!>. The first is matched with capture group 1 and comments ending with --!> are matched with capture group 2. |
+| test.rb:19:6:19:147 | <(?:(?:\\/([^>]+)>)\|(?:!--([\\S\|\\s]*?)-->)\|(?:([^\\/\\s>]+)((?:\\s+[\\w\\-:.]+(?:\\s*=\\s*?(?:(?:"[^"]*")\|(?:'[^']*')\|[^\\s"'\\/>]+))?)*)[\\S\\s]*?(\\/?)>)) | Comments ending with --> are matched differently from comments ending with --!>. The first is matched with capture group 2 and comments ending with --!> are matched with capture group 1, 3, 4, 5. |
--- a/ruby/ql/test/query-tests/security/cwe-116/BadTagFilter.qlref
+++ b/ruby/ql/test/query-tests/security/cwe-116/BadTagFilter.qlref
@ -0,0 +1 @@
+queries/security/cwe-116/BadTagFilter.ql
--- a/ruby/ql/test/query-tests/security/cwe-116/test.rb
+++ b/ruby/ql/test/query-tests/security/cwe-116/test.rb
@ -0,0 +1,22 @@
+filters = [
+    /<script.*?>.*?<\/script>/i, # NOT OK - doesn't match newlines or `</script >`
+    /<script.*?>.*?<\/script>/im, # NOT OK - doesn't match `</script >`
+    /<script.*?>.*?<\/script[^>]*>/im, # OK
+    /<!--.*-->/im, # OK - we don't care about regexps that only match comments
+    /<!--.*--!?>/im, # OK
+    /<!--.*--!?>/i, # NOT OK, does not match newlines
+    /<script.*?>(.|\s)*?<\/script[^>]*>/i, # NOT OK - doesn't match inside the script tag
+    /<script[^>]*?>.*?<\/script[^>]*>/i, # NOT OK - doesn't match newlines inside the content
+    /<script(\s|\w|=|")*?>.*?<\/script[^>]*>/im, # NOT OK - does not match single quotes for attribute values
+    /<script(\s|\w|=|')*?>.*?<\/script[^>]*>/im, # NOT OK - does not match double quotes for attribute values
+    /<script( |\n|\w|=|'|")*?>.*?<\/script[^>]*>/im, # NOT OK - does not match tabs between attributes
+    /<script.*?>.*?<\/script[^>]*>/m, # NOT OK - does not match uppercase SCRIPT tags
+    /<(script|SCRIPT).*?>.*?<\/(script|SCRIPT)[^>]*>/m, # NOT OK - does not match mixed case script tags
+    /<script[^>]*?>[\s\S]*?<\/script.*>/i, # NOT OK - doesn't match newlines in the end tag
+    /<script[^>]*?>[\s\S]*?<\/script[^>]*?>/i, # OK
+    /<script\b[^>]*>([\s\S]*?)<\/script>/i, # NOT OK - too strict matching on the end tag
+    /<(?:!--([\S|\s]*?)-->)|([^\/\s>]+)[\S\s]*?>/, # NOT OK - doesn't match comments with the right capture groups
+    /<(?:(?:\/([^>]+)>)|(?:!--([\S|\s]*?)-->)|(?:([^\/\s>]+)((?:\s+[\w\-:.]+(?:\s*=\s*?(?:(?:"[^"]*")|(?:'[^']*')|[^\s"'\/>]+))?)*)[\S\s]*?(\/?)>))/, # NOT OK - capture groups
+]
+
+doFilters(filters)