Merge pull request #13914 from erik-krogh/escape-unicode

ReDoS: escape unicode chars in the output for the ReDoS queries
2023-08-15 11:21:21 +02:00 · 2023-08-15 11:21:21 +02:00 · 6a3b9e10eb
--- a/javascript/ql/test/query-tests/Security/CWE-400/ReDoS/PolynomialBackTracking.expected
+++ b/javascript/ql/test/query-tests/Security/CWE-400/ReDoS/PolynomialBackTracking.expected
@ -445,7 +445,7 @@
 | tst.js:146:15:146:21 | (\\d\|5)* | Strings with many repetitions of '0' can start matching anywhere after the start of the preceeding ((\\d\|5)*)" |
 | tst.js:149:15:149:24 | (\\s\|[\\f])* | Strings with many repetitions of '\\t' can start matching anywhere after the start of the preceeding ((\\s\|[\\f])*)" |
 | tst.js:152:15:152:28 | (\\s\|[\\v]\|\\\\v)* | Strings with many repetitions of '\\t' can start matching anywhere after the start of the preceeding ((\\s\|[\\v]\|\\\\v)*)" |
-| tst.js:155:15:155:24 | (\\f\|[\\f])* | Strings with many repetitions of '\u000c' can start matching anywhere after the start of the preceeding ((\\f\|[\\f])*)" |
+| tst.js:155:15:155:24 | (\\f\|[\\f])* | Strings with many repetitions of '\\u000c' can start matching anywhere after the start of the preceeding ((\\f\|[\\f])*)" |
 | tst.js:158:15:158:22 | (\\W\|\\D)* | Strings with many repetitions of '/' can start matching anywhere after the start of the preceeding ((\\W\|\\D)*)" |
 | tst.js:161:15:161:22 | (\\S\|\\w)* | Strings with many repetitions of '!' can start matching anywhere after the start of the preceeding ((\\S\|\\w)*)" |
 | tst.js:164:15:164:24 | (\\S\|[\\w])* | Strings with many repetitions of '!' can start matching anywhere after the start of the preceeding ((\\S\|[\\w])*)" |
--- a/javascript/ql/test/query-tests/Security/CWE-400/ReDoS/ReDoS.expected
+++ b/javascript/ql/test/query-tests/Security/CWE-400/ReDoS/ReDoS.expected
@ -123,9 +123,9 @@
 | tst.js:137:15:137:21 | (\\w\|G)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'G'. |
 | tst.js:143:15:143:22 | (\\d\|\\w)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '0'. |
 | tst.js:146:15:146:21 | (\\d\|5)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '5'. |
-| tst.js:149:15:149:24 | (\\s\|[\\f])* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\u000c'. |
-| tst.js:152:15:152:28 | (\\s\|[\\v]\|\\\\v)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\u000b'. |
-| tst.js:155:15:155:24 | (\\f\|[\\f])* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\u000c'. |
+| tst.js:149:15:149:24 | (\\s\|[\\f])* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\\u000c'. |
+| tst.js:152:15:152:28 | (\\s\|[\\v]\|\\\\v)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\\u000b'. |
+| tst.js:155:15:155:24 | (\\f\|[\\f])* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\\u000c'. |
 | tst.js:158:15:158:22 | (\\W\|\\D)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '/'. |
 | tst.js:161:15:161:22 | (\\S\|\\w)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '0'. |
 | tst.js:164:15:164:24 | (\\S\|[\\w])* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '0'. |
@ -199,3 +199,5 @@
 | tst.js:404:6:405:7 | (g\|gg)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'gg'. |
 | tst.js:407:125:407:127 | \\s* | This part of the regular expression may cause exponential backtracking on strings starting with '0/*' and containing many repetitions of ' ;0'. |
 | tst.js:411:15:411:19 | a{1,} | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'a'. |
+| tst.js:413:25:413:35 | (\\u0000\|.)+ | This part of the regular expression may cause exponential backtracking on strings starting with '\\n\\u0000' and containing many repetitions of '\\u0000'. |
+| tst.js:415:44:415:57 | (\ud83d\ude80\|.)+ | This part of the regular expression may cause exponential backtracking on strings starting with '\\n\\u{1f680}' and containing many repetitions of '\\u{1f680}'. |
--- a/javascript/ql/test/query-tests/Security/CWE-400/ReDoS/tst.js
+++ b/javascript/ql/test/query-tests/Security/CWE-400/ReDoS/tst.js
@ -409,3 +409,7 @@ var bad98 = /^(?:\*\/\*|[a-zA-Z0-9][a-zA-Z0-9!\#\$&\-\^_\.\+]{0,126}\/(?:\*|[a-z
 var good48 = /(\/(?:\/[\w.-]*)*){0,1}:([\w.-]+)/;

 var bad99 = /(a{1,})*b/;
+
+var unicode = /^\n\u0000(\u0000|.)+$/;
+
+var largeUnicode = new RegExp("^\n\u{1F680}(\u{1F680}|.)+X$");
--- a/python/ql/test/query-tests/Security/CWE-730-ReDoS/ReDoS.expected
+++ b/python/ql/test/query-tests/Security/CWE-730-ReDoS/ReDoS.expected
@ -35,9 +35,9 @@
 | redos.py:139:25:139:31 | (\\w\|G)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'G'. |
 | redos.py:145:25:145:32 | (\\d\|\\w)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '0'. |
 | redos.py:148:25:148:31 | (\\d\|5)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '5'. |
-| redos.py:151:25:151:34 | (\\s\|[\\f])* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\u000c'. |
-| redos.py:154:25:154:38 | (\\s\|[\\v]\|\\\\v)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\u000b'. |
-| redos.py:157:25:157:34 | (\\f\|[\\f])* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\u000c'. |
+| redos.py:151:25:151:34 | (\\s\|[\\f])* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\\u000c'. |
+| redos.py:154:25:154:38 | (\\s\|[\\v]\|\\\\v)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\\u000b'. |
+| redos.py:157:25:157:34 | (\\f\|[\\f])* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\\u000c'. |
 | redos.py:160:25:160:32 | (\\W\|\\D)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of ' '. |
 | redos.py:163:25:163:32 | (\\S\|\\w)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '0'. |
 | redos.py:166:25:166:34 | (\\S\|[\\w])* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '0'. |
@ -67,8 +67,8 @@
 | redos.py:259:24:259:126 | (.thisisagoddamnlongstringforstresstestingthequery\|\\sthisisagoddamnlongstringforstresstestingthequery)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\\tthisisagoddamnlongstringforstresstestingthequery'. |
 | redos.py:262:24:262:87 | (thisisagoddamnlongstringforstresstestingthequery\|this\\w+query)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'thisisagoddamnlongstringforstresstestingthequery'. |
 | redos.py:262:78:262:80 | \\w+ | This part of the regular expression may cause exponential backtracking on strings starting with 'this' and containing many repetitions of '0querythis'. |
-| redos.py:268:28:268:39 | ([\ufffd\ufffd]\|[\ufffd\ufffd])* | This part of the regular expression may cause exponential backtracking on strings starting with 'foo' and containing many repetitions of '\ufffd'. |
-| redos.py:271:28:271:41 | ((\ufffd\|\ufffd)\|(\ufffd\|\ufffd))* | This part of the regular expression may cause exponential backtracking on strings starting with 'foo' and containing many repetitions of '\ufffd'. |
+| redos.py:268:28:268:39 | ([\ufffd\ufffd]\|[\ufffd\ufffd])* | This part of the regular expression may cause exponential backtracking on strings starting with 'foo' and containing many repetitions of '\\ufffd'. |
+| redos.py:271:28:271:41 | ((\ufffd\|\ufffd)\|(\ufffd\|\ufffd))* | This part of the regular expression may cause exponential backtracking on strings starting with 'foo' and containing many repetitions of '\\ufffd'. |
 | redos.py:274:31:274:32 | b+ | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'b'. |
 | redos.py:277:48:277:50 | \\s* | This part of the regular expression may cause exponential backtracking on strings starting with '<0\\t0=' and containing many repetitions of '""\\t0='. |
 | redos.py:283:26:283:27 | a+ | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'a'. |
@ -103,5 +103,5 @@
 | redos.py:385:24:385:30 | (\\d\|0)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '0'. |
 | redos.py:386:26:386:32 | (\\d\|0)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '0'. |
 | redos.py:391:15:391:25 | (\\u0061\|a)* | This part of the regular expression may cause exponential backtracking on strings starting with 'X' and containing many repetitions of 'a'. |
-| unittests.py:5:17:5:23 | (\u00c6\|\\\u00c6)+ | This part of the regular expression may cause exponential backtracking on strings starting with 'X' and containing many repetitions of '\u00c6'. |
+| unittests.py:5:17:5:23 | (\u00c6\|\\\u00c6)+ | This part of the regular expression may cause exponential backtracking on strings starting with 'X' and containing many repetitions of '\\u00c6'. |
 | unittests.py:9:16:9:24 | (?:.\|\\n)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\\n'. |
--- a/ruby/ql/test/query-tests/security/cwe-1333-exponential-redos/ReDoS.expected
+++ b/ruby/ql/test/query-tests/security/cwe-1333-exponential-redos/ReDoS.expected
@ -33,9 +33,9 @@
 | tst.rb:137:11:137:17 | (\\w\|G)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'G'. |
 | tst.rb:143:11:143:18 | (\\d\|\\w)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '0'. |
 | tst.rb:146:11:146:17 | (\\d\|5)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '5'. |
-| tst.rb:149:11:149:20 | (\\s\|[\\f])* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\u000c'. |
-| tst.rb:152:11:152:24 | (\\s\|[\\v]\|\\\\v)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\u000b'. |
-| tst.rb:155:11:155:20 | (\\f\|[\\f])* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\u000c'. |
+| tst.rb:149:11:149:20 | (\\s\|[\\f])* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\\u000c'. |
+| tst.rb:152:11:152:24 | (\\s\|[\\v]\|\\\\v)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\\u000b'. |
+| tst.rb:155:11:155:20 | (\\f\|[\\f])* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\\u000c'. |
 | tst.rb:158:11:158:18 | (\\W\|\\D)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of ' '. |
 | tst.rb:161:11:161:18 | (\\S\|\\w)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '0'. |
 | tst.rb:164:11:164:20 | (\\S\|[\\w])* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '0'. |
--- a/shared/regex/codeql/regex/nfa/NfaUtils.qll
+++ b/shared/regex/codeql/regex/nfa/NfaUtils.qll
@ -3,6 +3,7 @@
 */

 private import codeql.regex.RegexTreeView
+private import codeql.util.Numbers

 /**
 * Classes and predicates that create an NFA and various algorithms for working with it.
@ -17,6 +18,20 @@ module Make<RegexTreeViewSig TreeImpl> {
    exists(int code | code = ascii(c) | code + 1 = ascii(result))
  }

+  /**
+   * Gets the `i`th codepoint in `s`, where `i` is a 0-based index counted in codepoints.
+   *
+   * Every match of the pattern is exactly one codepoint. The pattern uses DOTALL mode
+   * (`(?s)`) so that `.` also matches line terminators such as U+0085, U+2028 and U+2029.
+   * A plain `.` excludes those and `\s` does not include them, so without the flag they
+   * would be skipped and all later indices would be shifted relative to `getCodepointLength`.
+   */
+  bindingset[s]
+  private string getCodepointAt(string s, int i) { result = s.regexpFind("(?s).", i, _) }
+
+  /**
+   * Gets the number of codepoints in `str`.
+   *
+   * Each match of the pattern is collapsed to the single character `x`, and the
+   * length of the collapsed string is the result.
+   */
+  bindingset[str]
+  private int getCodepointLength(string str) {
+    exists(string collapsed | collapsed = str.regexpReplaceAll("(.|\\s)", "x") |
+      result = collapsed.length()
+    )
+  }
+
  /**
   * Gets an approximation for the ASCII code for `char`.
   * Only the easily printable chars are included (so no newline, tab, null, etc).
@ -189,17 +204,17 @@ module Make<RegexTreeViewSig TreeImpl> {
    /** An input symbol corresponding to character `c`. */
    Char(string c) {
      c =
-        any(RegexpCharacterConstant cc |
-          cc instanceof RelevantRegExpTerm and
-          not isIgnoreCase(cc.getRootTerm())
-        ).getValue().charAt(_)
+        getCodepointAt(any(RegexpCharacterConstant cc |
+            cc instanceof RelevantRegExpTerm and
+            not isIgnoreCase(cc.getRootTerm())
+          ).getValue(), _)
      or
      // normalize everything to lower case if the regexp is case insensitive
      c =
        any(RegexpCharacterConstant cc, string char |
          cc instanceof RelevantRegExpTerm and
          isIgnoreCase(cc.getRootTerm()) and
-          char = cc.getValue().charAt(_)
+          char = getCodepointAt(cc.getValue(), _)
        |
          char.toLowerCase()
        )
@ -395,7 +410,7 @@ module Make<RegexTreeViewSig TreeImpl> {
    string getARelevantChar() {
      exists(ascii(result))
      or
-      exists(RegexpCharacterConstant c | result = c.getValue().charAt(_))
+      exists(RegexpCharacterConstant c | result = getCodepointAt(c.getValue(), _))
      or
      classEscapeMatches(_, result)
    }
@ -693,6 +708,12 @@ module Make<RegexTreeViewSig TreeImpl> {
    )
  }

+  /**
+   * Gets the length in codepoints of `s`, where `s` is the value of some `RegexpCharacterConstant`.
+   *
+   * This is a non-inlined wrapper around `getCodepointLength` restricted to those values.
+   */
+  pragma[noinline]
+  private int getCodepointLengthForState(string s) {
+    exists(RegexpCharacterConstant constant | s = constant.getValue()) and
+    result = getCodepointLength(s)
+  }
+
  /**
   * Holds if the NFA has a transition from `q1` to `q2` labelled with `lbl`.
   */
@ -701,16 +722,16 @@ module Make<RegexTreeViewSig TreeImpl> {
      q1 = Match(s, i) and
      (
        not isIgnoreCase(s.getRootTerm()) and
-        lbl = Char(s.getValue().charAt(i))
+        lbl = Char(getCodepointAt(s.getValue(), i))
        or
        // normalize everything to lower case if the regexp is case insensitive
        isIgnoreCase(s.getRootTerm()) and
-        exists(string c | c = s.getValue().charAt(i) | lbl = Char(c.toLowerCase()))
+        exists(string c | c = getCodepointAt(s.getValue(), i) | lbl = Char(c.toLowerCase()))
      ) and
      (
        q2 = Match(s, i + 1)
        or
-        s.getValue().length() = i + 1 and
+        getCodepointLengthForState(s.getValue()) = i + 1 and
        q2 = after(s)
      )
    )
@ -811,7 +832,7 @@ module Make<RegexTreeViewSig TreeImpl> {
    Match(RelevantRegExpTerm t, int i) {
      i = 0
      or
-      exists(t.(RegexpCharacterConstant).getValue().charAt(i))
+      exists(getCodepointAt(t.(RegexpCharacterConstant).getValue(), i))
    } or
    /**
     * An accept state, where exactly the given input string is accepted.
@ -1104,7 +1125,9 @@ module Make<RegexTreeViewSig TreeImpl> {
       */
      predicate reachesOnlyRejectableSuffixes(State fork, string w) {
        isReDoSCandidate(fork, w) and
-        forex(State next | next = process(fork, w, w.length() - 1) | isLikelyRejectable(next)) and
+        forex(State next | next = process(fork, w, getCodepointLengthForCandidate(w) - 1) |
+          isLikelyRejectable(next)
+        ) and
        not getProcessPrevious(fork, _, w) = acceptsAnySuffix() // we stop `process(..)` early if we can, check here if it happened.
      }

@ -1214,6 +1237,13 @@ module Make<RegexTreeViewSig TreeImpl> {
        exists(string char | char = ["|", "\n", "Z"] | not deltaClosedChar(s, char, _))
      }

+      /**
+       * Gets the `i`th codepoint of `str`, for strings `str` where `isReDoSCandidate(_, str)` holds.
+       *
+       * `process` can't use pragma[inline] predicates, so this materialized version of
+       * `getCodepointAt` is needed.
+       */
+      pragma[noinline]
+      private string getCodePointAtForProcess(string str, int i) {
+        isReDoSCandidate(_, str) and
+        result = getCodepointAt(str, i)
+      }
+
      /**
       * Gets a state that can be reached from pumpable `fork` consuming all
       * chars in `w` any number of times followed by the first `i+1` characters of `w`.
@ -1223,7 +1253,7 @@ module Make<RegexTreeViewSig TreeImpl> {
        exists(State prev | prev = getProcessPrevious(fork, i, w) |
          not prev = acceptsAnySuffix() and // we stop `process(..)` early if we can. If the successor accepts any suffix, then we know it can never be rejected.
          exists(string char, InputSymbol sym |
-            char = w.charAt(i) and
+            char = getCodePointAtForProcess(w, i) and
            deltaClosed(prev, sym, result) and
            // noopt to prevent joining `prev` with all possible `chars` that could transition away from `prev`.
            // Instead only join with the set of `chars` where a relevant `InputSymbol` has already been found.
@ -1232,6 +1262,12 @@ module Make<RegexTreeViewSig TreeImpl> {
        )
      }

+      /**
+       * Gets the length in codepoints of `s`, for strings `s` where `isReDoSCandidate(_, s)` holds.
+       *
+       * This is a non-inlined wrapper around `getCodepointLength` restricted to those strings.
+       */
+      pragma[noinline]
+      private int getCodepointLengthForCandidate(string s) {
+        isReDoSCandidate(_, s) and
+        result = getCodepointLength(s)
+      }
+
      /**
       * Gets a state that can be reached from pumpable `fork` consuming all
       * chars in `w` any number of times followed by the first `i` characters of `w`.
@ -1245,7 +1281,7 @@ module Make<RegexTreeViewSig TreeImpl> {
          or
          // repeat until fixpoint
          i = 0 and
-          result = process(fork, w, w.length() - 1)
+          result = process(fork, w, getCodepointLengthForCandidate(w) - 1)
        )
      }

@ -1261,7 +1297,9 @@ module Make<RegexTreeViewSig TreeImpl> {
      /**
       * Gets a `char` that occurs in a `pump` string.
       */
-      private string getAProcessChar() { result = any(string s | isReDoSCandidate(_, s)).charAt(_) }
+      private string getAProcessChar() {
+        exists(string pump | isReDoSCandidate(_, pump) | result = getCodepointAt(pump, _))
+      }
    }

    /**
@ -1305,10 +1343,40 @@ module Make<RegexTreeViewSig TreeImpl> {
    bindingset[s]
    private string escape(string s) {
      result =
-        s.replaceAll("\\", "\\\\")
-            .replaceAll("\n", "\\n")
-            .replaceAll("\r", "\\r")
-            .replaceAll("\t", "\\t")
+        escapeUnicodeString(s.replaceAll("\\", "\\\\")
+              .replaceAll("\n", "\\n")
+              .replaceAll("\r", "\\r")
+              .replaceAll("\t", "\\t"))
+    }
+
+    /**
+     * Gets a copy of `s` in which every codepoint has been passed through `escapeUnicodeChar`.
+     *
+     * The escaped pieces are joined back together in the order of their codepoint index.
+     */
+    bindingset[s]
+    private string escapeUnicodeString(string s) {
+      result =
+        concat(int index, string escaped |
+          escaped = escapeUnicodeChar(getCodepointAt(s, index))
+        |
+          escaped order by index
+        )
+    }
+
+    /**
+     * Gets a unicode escaped string for `char`.
+     * If `char` is a printable char, then `char` is returned.
+     * Otherwise the result is `\u` followed by the result of `to4digitHex` for the
+     * codepoint of `char` when `to4digitHex` has a result for it, and `\u{` + the result
+     * of `toHex` for that codepoint + `}` when it does not.
+     */
+    bindingset[char]
+    private string escapeUnicodeChar(string char) {
+      if isPrintable(char)
+      then result = char
+      else
+        // `any(int i | i.toUnicode() = char)` is the integer whose `toUnicode()` string is `char`.
+        if exists(to4digitHex(any(int i | i.toUnicode() = char)))
+        then result = "\\u" + to4digitHex(any(int i | i.toUnicode() = char))
+        else result = "\\u{" + toHex(any(int i | i.toUnicode() = char)) + "}"
+    }
+
+    /**
+     * Holds if `char` is an easily printable char (one that `ascii` has a result for),
+     * or is a newline, carriage return or tab.
+     */
+    private predicate isPrintable(string char) {
+      char = ["\n", "\r", "\t"]
+      or
+      exists(ascii(char))
    }

    /**
--- a/shared/regex/qlpack.yml
+++ b/shared/regex/qlpack.yml
@ -3,4 +3,5 @@ version: 0.1.3-dev
 groups: shared
 library: true
 dependencies:
+  codeql/util: ${workspace}
 warnOnImplicitThis: true
--- a/shared/util/codeql/util/Numbers.qll
+++ b/shared/util/codeql/util/Numbers.qll
@ -50,7 +50,7 @@ int parseHexInt(string hex) {
      sum(int index, string c |
        c = stripped.charAt(index)
      |
-        sixteenToThe(stripped.length() - 1 - index) * toHex(c)
+        sixteenToThe(stripped.length() - 1 - index) * charToHex(c)
      )
  )
 }
@ -83,7 +83,7 @@ int parseOctalInt(string octal) {
 }

 /** Gets the integer value of the `hex` char. */
-private int toHex(string hex) {
+private int charToHex(string hex) {
  hex = [0 .. 9].toString() and
  result = hex.toInt()
  or
@ -100,6 +100,32 @@ private int toHex(string hex) {
  result = 15 and hex = ["f", "F"]
 }

+/**
+ * Gets a 4-digit hex representation of `i`, left-padded with `0`.
+ *
+ * Has no result unless `i` is in the range 0..65535 (inclusive).
+ */
+bindingset[i]
+string to4digitHex(int i) {
+  0 <= i and
+  i <= 65535 and
+  exists(string digits, string padding |
+    digits = toHex(i) and
+    // one "0" for every digit that is missing to reach four
+    padding = concat(int n | n = [1 .. 4 - digits.length()] | "0")
+  |
+    result = padding + digits
+  )
+}
+
+/**
+ * Gets a lowercase hex representation of `i`, without leading zeroes.
+ *
+ * The eight 4-bit groups of `i` are rendered most significant first, so a negative
+ * `i` is rendered from its bit pattern (for example `-1` gives `ffffffff`).
+ * Zero is rendered as `0`.
+ */
+bindingset[i]
+string toHex(int i) {
+  result =
+    // make the number with lots of leading zeroes, then remove the leading zeroes in a
+    // post-processing step; `(?!$)` keeps the last digit, so 0 becomes "0" and not ""
+    concat(int shift |
+      shift in [28, 24, 20, 16, 12, 8, 4, 0]
+    |
+      "0123456789abcdef".charAt(i.bitShiftRight(shift).bitAnd(15)) order by shift desc
+    ).regexpReplaceAll("^0+(?!$)", "")
+}
+
 /**
 * Gets the value of 16 to the power of `n`. Holds only for `n` in the range
 * 0..7 (inclusive).