add a bad-tag-filter query for Python and JavaScript

This commit is contained in:
Erik Krogh Kristensen 2021-08-30 10:09:20 +02:00
Родитель fd64ff9ef1
Коммит 99ed4a1a89
20 изменённых файлов: 887 добавлений и 16 удалений

Просмотреть файл

@ -461,5 +461,9 @@
"ReDoS Polynomial Python/JS": [
"javascript/ql/lib/semmle/javascript/security/performance/SuperlinearBackTracking.qll",
"python/ql/lib/semmle/python/security/performance/SuperlinearBackTracking.qll"
],
"BadTagFilterQuery Python/JS": [
"javascript/ql/lib/semmle/javascript/security/BadTagFilterQuery.qll",
"python/ql/lib/semmle/python/security/BadTagFilterQuery.qll"
]
}

Просмотреть файл

@ -0,0 +1,4 @@
lgtm,codescanning
* A new query, `js/bad-tag-filter`, has been added to the query suite,
highlighting regular expressions that only match a subset of the HTML tags
it is supposed to match.

Просмотреть файл

@ -0,0 +1,306 @@
/**
* Provides precicates for reasoning about bad tag filter vulnerabilities.
*/
import performance.ReDoSUtil
/**
* A module for determining if a regexp matches a given string,
* and reasoning about which capture groups are filled by a given string.
*/
private module RegexpMatching {
/**
* A class to test whether a regular expression matches a string.
* Override this class and extend `test`/`testWithGroups` to configure which strings should be tested for acceptance by this regular expression.
* The result can afterwards be read from the `matches` predicate.
*
* Strings in the `testWithGroups` predicate are also tested for which capture groups are filled by the given string.
* The result is available in the `fillCaptureGroup` predicate.
*/
abstract class MatchedRegExp extends RegExpTerm {
MatchedRegExp() { this.isRootTerm() }
/**
* Holds if it should be tested whether this regular expression matches `str`.
*
* If `ignorePrefix` is true, then a regexp without a start anchor will be treated as if it had a start anchor.
* E.g. a regular expression `/foo$/` will match any string that ends with "foo",
* but if `ignorePrefix` is true, it will only match "foo".
*/
predicate test(string str, boolean ignorePrefix) {
none() // maybe overriden in subclasses
}
/**
* Same as `test(..)`, but where the `fillsCaptureGroup` afterwards tells which capture groups were filled by the given string.
*/
predicate testWithGroups(string str, boolean ignorePrefix) {
none() // maybe overriden in subclasses
}
/**
* Holds if `regexp` matches `str`, where `str` is either in the `test` or `testWithGroups` predicate.
*/
final predicate matches(string str) {
exists(State state | state = getAState(this, str.length() - 1, str, _) |
epsilonSucc*(state) = Accept(_)
)
}
/**
* Holds if matching `str` may fill capture group number `g`.
* Only holds if `str` is in the `testWithGroups` predicate.
*/
final predicate fillsCaptureGroup(string str, int g) {
exists(State s |
s = getAStateThatReachesAccept(this, _, str, _) and
g = group(s.getRepr())
)
}
}
/**
* Gets a state the regular expression `reg` can be in after matching the `i`th char in `str`.
* The regular expression is modelled as a non-determistic finite automaton,
* the regular expression can therefore be in multiple states after matching a character.
*
* It's a forward search to all possible states, and there is thus no guarantee that the state is on a path to an accepting state.
*/
private State getAState(MatchedRegExp reg, int i, string str, boolean ignorePrefix) {
// start state, the -1 position before any chars have been matched
i = -1 and
(
reg.test(str, ignorePrefix)
or
reg.testWithGroups(str, ignorePrefix)
) and
result.getRepr().getRootTerm() = reg and
isStartState(result)
or
// recursive case
result = getAStateAfterMatching(reg, _, str, i, _, ignorePrefix)
}
/**
* Gets the next state after the `prev` state from `reg`.
* `prev` is the state after matching `fromIndex` chars in `str`,
* and the result is the state after matching `toIndex` chars in `str`.
*
* This predicate is used as a step relation in the forwards search (`getAState`),
* and also as a step relation in the later backwards search (`getAStateThatReachesAccept`).
*/
private State getAStateAfterMatching(
MatchedRegExp reg, State prev, string str, int toIndex, int fromIndex, boolean ignorePrefix
) {
// the basic recursive case - outlined into a noopt helper to make performance work out.
result = getAStateAfterMatchingAux(reg, prev, str, toIndex, fromIndex, ignorePrefix)
or
// we can skip past word boundaries if the next char is a non-word char.
fromIndex = toIndex and
prev.getRepr() instanceof RegExpWordBoundary and
prev = getAState(reg, toIndex, str, ignorePrefix) and
after(prev.getRepr()) = result and
str.charAt(toIndex + 1).regexpMatch("\\W") // \W matches any non-word char.
}
pragma[noopt]
private State getAStateAfterMatchingAux(
MatchedRegExp reg, State prev, string str, int toIndex, int fromIndex, boolean ignorePrefix
) {
prev = getAState(reg, fromIndex, str, ignorePrefix) and
fromIndex = toIndex - 1 and
exists(string char | char = str.charAt(toIndex) | specializedDeltaClosed(prev, char, result)) and
not discardedPrefixStep(prev, result, ignorePrefix)
}
/** Holds if a step from `prev` to `next` should be discarded when the `ignorePrefix` flag is set. */
private predicate discardedPrefixStep(State prev, State next, boolean ignorePrefix) {
prev = mkMatch(any(RegExpRoot r)) and
ignorePrefix = true and
next = prev
}
// The `deltaClosed` relation specialized to the chars that exists in strings tested by a `MatchedRegExp`.
private predicate specializedDeltaClosed(State prev, string char, State next) {
deltaClosed(prev, specializedGetAnInputSymbolMatching(char), next)
}
// The `getAnInputSymbolMatching` relation specialized to the chars that exists in strings tested by a `MatchedRegExp`.
pragma[noinline]
private InputSymbol specializedGetAnInputSymbolMatching(string char) {
exists(string s, MatchedRegExp r |
r.test(s, _)
or
r.testWithGroups(s, _)
|
char = s.charAt(_)
) and
result = getAnInputSymbolMatching(char)
}
/**
* Gets the `i`th state on a path to the accepting state when `reg` matches `str`.
* Starts with an accepting state as found by `getAState` and searches backwards
* to the start state through the reachable states (as found by `getAState`).
*
* This predicate holds the invariant that the result state can be reached with `i` steps from a start state,
* and an accepting state can be found after (`str.length() - 1 - i`) steps from the result.
* The result state is therefore always on a valid path where `reg` accepts `str`.
*
* This predicate is only used to find which capture groups a regular expression has filled,
* and thus the search is only performed for the strings in the `testWithGroups(..)` predicate.
*/
private State getAStateThatReachesAccept(
MatchedRegExp reg, int i, string str, boolean ignorePrefix
) {
// base base, reaches an accepting state from the last state in `getAState(..)`
reg.testWithGroups(str, ignorePrefix) and
i = str.length() - 1 and
result = getAState(reg, i, str, ignorePrefix) and
epsilonSucc*(result) = Accept(_)
or
// recursive case. `next` is the next state to be matched after matching `prev`.
// this predicate is doing a backwards search, so `prev` is the result we are looking for.
exists(State next, State prev, int fromIndex, int toIndex |
next = getAStateThatReachesAccept(reg, toIndex, str, ignorePrefix) and
next = getAStateAfterMatching(reg, prev, str, toIndex, fromIndex, ignorePrefix) and
i = fromIndex and
result = prev
)
}
/** Gets the capture group number that `term` belongs to. */
private int group(RegExpTerm term) {
exists(RegExpGroup grp | grp.getNumber() = result | term.getParent*() = grp)
}
}
/** A class to test whether a regular expression matches certain HTML tags. */
class HTMLMatchingRegExp extends RegexpMatching::MatchedRegExp {
HTMLMatchingRegExp() {
// the regexp must mention "<" and ">" explicitly.
forall(string angleBracket | angleBracket = ["<", ">"] |
any(RegExpConstant term | term.getValue().matches("%" + angleBracket + "%")).getRootTerm() =
this
)
}
override predicate testWithGroups(string str, boolean ignorePrefix) {
ignorePrefix = true and
str = ["<!-- foo -->", "<!-- foo --!>", "<!- foo ->", "<foo>", "<script>"]
}
override predicate test(string str, boolean ignorePrefix) {
ignorePrefix = true and
str =
[
"<!-- foo -->", "<!- foo ->", "<!-- foo --!>", "<!-- foo\n -->", "<script>foo</script>",
"<script \n>foo</script>", "<script >foo\n</script>", "<foo ></foo>", "<foo>",
"<foo src=\"foo\"></foo>", "<script>", "<script src=\"foo\"></script>",
"<script src='foo'></script>", "<SCRIPT>foo</SCRIPT>", "<script\tsrc=\"foo\"/>",
"<script\tsrc='foo'></script>", "<sCrIpT>foo</ScRiPt>", "<script src=\"foo\">foo</script >",
"<script src=\"foo\">foo</script foo=\"bar\">", "<script src=\"foo\">foo</script\t\n bar>"
]
}
}
/**
* Holds if `regexp` matches some HTML tags, but misses some HTML tags that it should match.
*
* When adding a new case to this predicate, make sure the test string used in `matches(..)` calls are present in `HTMLMatchingRegExp::test` / `HTMLMatchingRegExp::testWithGroups`.
*/
predicate isBadRegexpFilter(HTMLMatchingRegExp regexp, string msg) {
// CVE-2021-33829 - matching both "<!-- foo -->" and "<!-- foo --!>", but in different capture groups
regexp.matches("<!-- foo -->") and
regexp.matches("<!-- foo --!>") and
exists(int a, int b | a != b |
regexp.fillsCaptureGroup("<!-- foo -->", a) and
// <!-- foo --> might be ambigously parsed (matching both capture groups), and that is ok here.
regexp.fillsCaptureGroup("<!-- foo --!>", b) and
not regexp.fillsCaptureGroup("<!-- foo --!>", a) and
msg =
"Comments ending with --> are matched differently from comments ending with --!>. The first is matched with capture group "
+ a + " and comments ending with --!> are matched with capture group " +
strictconcat(int i | regexp.fillsCaptureGroup("<!-- foo --!>", i) | i.toString(), ", ") +
"."
)
or
// CVE-2020-17480 - matching "<!-- foo -->" and other tags, but not "<!-- foo --!>".
exists(int group, int other |
group != other and
regexp.fillsCaptureGroup("<!-- foo -->", group) and
regexp.fillsCaptureGroup("<foo>", other) and
not regexp.matches("<!-- foo --!>") and
not regexp.fillsCaptureGroup("<!-- foo -->", any(int i | i != group)) and
not regexp.fillsCaptureGroup("<!- foo ->", group) and
not regexp.fillsCaptureGroup("<foo>", group) and
not regexp.fillsCaptureGroup("<script>", group) and
msg =
"This regular expression only parses --> (capture group " + group +
") and not --!> as a HTML comment end tag."
)
or
regexp.matches("<!-- foo -->") and
not regexp.matches("<!-- foo\n -->") and
not regexp.matches("<!- foo ->") and
not regexp.matches("<foo>") and
not regexp.matches("<script>") and
msg = "This regular expression does not match comments containing newlines."
or
regexp.matches("<script>foo</script>") and
regexp.matches("<script src=\"foo\"></script>") and
not regexp.matches("<foo ></foo>") and
(
not regexp.matches("<script \n>foo</script>") and
msg = "This regular expression matches <script></script>, but not <script \\n></script>"
or
not regexp.matches("<script >foo\n</script>") and
msg = "This regular expression matches <script>...</script>, but not <script >...\\n</script>"
)
or
regexp.matches("<script>foo</script>") and
regexp.matches("<script src=\"foo\"></script>") and
not regexp.matches("<script src='foo'></script>") and
not regexp.matches("<foo>") and
msg = "This regular expression does not match script tags where the attribute uses single-quotes."
or
regexp.matches("<script>foo</script>") and
regexp.matches("<script src='foo'></script>") and
not regexp.matches("<script src=\"foo\"></script>") and
not regexp.matches("<foo>") and
msg = "This regular expression does not match script tags where the attribute uses double-quotes."
or
regexp.matches("<script>foo</script>") and
regexp.matches("<script src='foo'></script>") and
not regexp.matches("<script\tsrc='foo'></script>") and
not regexp.matches("<foo>") and
not regexp.matches("<foo src=\"foo\"></foo>") and
msg = "This regular expression does not match script tags where tabs are used between attributes."
or
regexp.matches("<script>foo</script>") and
not RegExpFlags::isIgnoreCase(regexp) and
not regexp.matches("<foo>") and
not regexp.matches("<foo ></foo>") and
(
not regexp.matches("<SCRIPT>foo</SCRIPT>") and
msg = "This regular expression does not match upper case <SCRIPT> tags."
or
not regexp.matches("<sCrIpT>foo</ScRiPt>") and
regexp.matches("<SCRIPT>foo</SCRIPT>") and
msg = "This regular expression does not match mixed case <sCrIpT> tags."
)
or
regexp.matches("<script src=\"foo\"></script>") and
not regexp.matches("<foo>") and
not regexp.matches("<foo ></foo>") and
(
not regexp.matches("<script src=\"foo\">foo</script >") and
msg = "This regular expression does not match script end tags like </script >."
or
not regexp.matches("<script src=\"foo\">foo</script foo=\"bar\">") and
msg = "This regular expression does not match script end tags like </script foo=\"bar\">."
or
not regexp.matches("<script src=\"foo\">foo</script\t\n bar>") and
msg = "This regular expression does not match script end tags like </script\\t\\n bar>."
)
}

Просмотреть файл

@ -544,7 +544,7 @@ private State before(RegExpTerm t) { result = Match(t, 0) }
/**
* Gets a state the NFA may be in after matching `t`.
*/
private State after(RegExpTerm t) {
State after(RegExpTerm t) {
exists(RegExpAlt alt | t = alt.getAChild() | result = after(alt))
or
exists(RegExpSequence seq, int i | t = seq.getChild(i) |
@ -673,7 +673,7 @@ RegExpRoot getRoot(RegExpTerm term) {
/**
* A state in the NFA.
*/
private newtype TState =
newtype TState =
/**
* A state representing that the NFA is about to match a term.
* `i` is used to index into multi-char literals.

Просмотреть файл

@ -20,12 +20,7 @@ module RegExpFlags {
/**
* Holds if `root` has the `i` flag for case-insensitive matching.
*/
predicate isIgnoreCase(RegExpTerm root) {
root.isRootTerm() and
exists(DataFlow::RegExpCreationNode node | node.getRoot() = root |
RegExp::isIgnoreCase(node.getFlags())
)
}
predicate isIgnoreCase(RegExpTerm root) { RegExp::isIgnoreCase(getFlags(root)) }
/**
* Gets the flags for `root`, or the empty string if `root` has no flags.
@ -38,15 +33,14 @@ module RegExpFlags {
not exists(node.getFlags()) and
result = ""
)
or
exists(RegExpPatternSource source | source.getRegExpTerm() = root |
result = source.getARegExpObject().(DataFlow::RegExpCreationNode).getFlags()
)
}
/**
* Holds if `root` has the `s` flag for multi-line matching.
*/
predicate isDotAll(RegExpTerm root) {
root.isRootTerm() and
exists(DataFlow::RegExpCreationNode node | node.getRoot() = root |
RegExp::isDotAll(node.getFlags())
)
}
predicate isDotAll(RegExpTerm root) { RegExp::isDotAll(getFlags(root)) }
}

Просмотреть файл

@ -0,0 +1,55 @@
<!DOCTYPE qhelp PUBLIC
"-//Semmle//qhelp//EN"
"qhelp.dtd">
<qhelp>
<overview>
<p>
Parsing general HTML using regular expressions is impossible, however it is possible to match
single HTML tags. However, if the regexp is not written well it might be easy
to circumvent the regexp, which can lead to XSS or other security issues.
</p>
<p>
Many of these mistakes are caused by browsers having very forgiving HTML parsers:
Browsers will often render invalid HTML with parser errors.
Regular expressions that attempt to match HTML must recognize tags containing these parser errors.
</p>
</overview>
<recommendation>
<p>
Use a well-tested sanitization or parser library if at all possible. These libraries are much more
likely to handle corner cases correctly than a custom implementation.
</p>
</recommendation>
<example>
<p>
For example, assume we want to write a function that filters out all <code>&lt;script&gt;</code> tags.
Such a function might be written like below:
</p>
<sample src="examples/BadTagFilter.js" />
<p>
This sanitizer does not filter out all <code>&lt;script&gt;</code> tags.
Browsers will not only accept <code>&lt;/script&gt;</code> as script end tags, but also tags such as <code>&lt;/script foo="bar"&gt;</code> even though it is a parser error.
This means that an attack string such as <code>&lt;script&gt;alert(1)&lt;/script foo="bar"&gt;</code> will not be filtered by
the function, but <code>alert(1)</code> will be executed by a browser if the string is rendered as HTML.
</p>
<p>
Other corner cases include that HTML comments can end with <code>--!&gt;</code>,
and that HTML tag names can contain upper case characters.
</p>
</example>
<references>
<li>Securitum: <a href="https://research.securitum.com/the-curious-case-of-copy-paste/">The Curious Case of Copy &amp; Paste</a>.</li>
<li>stackoverflow.com: <a href="https://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags#answer-1732454">You can't parse [X]HTML with regex</a>.</li>
<li>HTML Standard: <a href="https://html.spec.whatwg.org/multipage/parsing.html#comment-end-bang-state">Comment end bang state</a>.</li>
<li>stackoverflow.com: <a href="https://stackoverflow.com/questions/25559999/why-arent-browsers-strict-about-html">Why aren't browsers strict about HTML?</a>.</li>
</references>
</qhelp>

Просмотреть файл

@ -0,0 +1,19 @@
/**
* @name Bad HTML filtering regexp
* @description Matching HTML tags using regular expressions is hard to do right, and can easily lead to security issues.
* @kind problem
* @problem.severity warning
* @security-severity 7.8
* @precision high
* @id js/bad-tag-filter
* @tags correctness
* security
* external/cwe/cwe-116
* external/cwe/cwe-020
*/
import semmle.javascript.security.BadTagFilterQuery
from HTMLMatchingRegExp regexp, string msg
where msg = min(string m | isBadRegexpFilter(regexp, m) | m order by m.length(), m) // there might be multiple, we arbitrarily pick the shortest one
select regexp, msg

Просмотреть файл

@ -0,0 +1,8 @@
function filterScript(html) {
var scriptRegex = /<script\b[^>]*>([\s\S]*?)<\/script>/gi;
var match;
while ((match = scriptRegex.exec(html)) !== null) {
html = html.replace(match[0], match[1]);
}
return html;
}

Просмотреть файл

@ -0,0 +1,17 @@
| tst.js:2:6:2:29 | <script.*?>.*?<\\/script> | This regular expression does not match script end tags like </script >. |
| tst.js:3:6:3:29 | <script.*?>.*?<\\/script> | This regular expression does not match script end tags like </script >. |
| tst.js:7:6:7:16 | <!--.*--!?> | This regular expression does not match comments containing newlines. |
| tst.js:8:6:8:39 | <script.*?>(.\|\\s)*?<\\/script[^>]*> | This regular expression matches <script></script>, but not <script \\n></script> |
| tst.js:9:6:9:37 | <script[^>]*?>.*?<\\/script[^>]*> | This regular expression matches <script>...</script>, but not <script >...\\n</script> |
| tst.js:10:6:10:44 | <script(\\s\|\\w\|=\|")*?>.*?<\\/script[^>]*> | This regular expression does not match script tags where the attribute uses single-quotes. |
| tst.js:11:6:11:44 | <script(\\s\|\\w\|=\|')*?>.*?<\\/script[^>]*> | This regular expression does not match script tags where the attribute uses double-quotes. |
| tst.js:12:6:12:48 | <script( \|\\n\|\\w\|=\|'\|")*?>.*?<\\/script[^>]*> | This regular expression does not match script tags where tabs are used between attributes. |
| tst.js:13:6:13:34 | <script.*?>.*?<\\/script[^>]*> | This regular expression does not match upper case <SCRIPT> tags. |
| tst.js:14:6:14:52 | <(script\|SCRIPT).*?>.*?<\\/(script\|SCRIPT)[^>]*> | This regular expression does not match mixed case <sCrIpT> tags. |
| tst.js:15:6:15:39 | <script[^>]*?>[\\s\\S]*?<\\/script.*> | This regular expression does not match script end tags like </script\\t\\n bar>. |
| tst.js:17:6:17:40 | <script\\b[^>]*>([\\s\\S]*?)<\\/script> | This regular expression does not match script end tags like </script >. |
| tst.js:18:6:18:48 | <(?:!--([\\S\|\\s]*?)-->)\|([^\\/\\s>]+)[\\S\\s]*?> | Comments ending with --> are matched differently from comments ending with --!>. The first is matched with capture group 1 and comments ending with --!> are matched with capture group 2. |
| tst.js:19:6:19:147 | <(?:(?:\\/([^>]+)>)\|(?:!--([\\S\|\\s]*?)-->)\|(?:([^\\/\\s>]+)((?:\\s+[\\w\\-:.]+(?:\\s*=\\s*?(?:(?:"[^"]*")\|(?:'[^']*')\|[^\\s"'\\/>]+))?)*)[\\S\\s]*?(\\/?)>)) | Comments ending with --> are matched differently from comments ending with --!>. The first is matched with capture group 2 and comments ending with --!> are matched with capture group 3, 4. |
| tst.js:20:3:20:57 | (<[a-z\\/!$]("[^"]*"\|'[^']*'\|[^'">])*>\|<!(--.*?--\\s*)+>) | Comments ending with --> are matched differently from comments ending with --!>. The first is matched with capture group 3 and comments ending with --!> are matched with capture group 1. |
| tst.js:21:6:21:249 | <(?:(?:!--([\\w\\W]*?)-->)\|(?:!\\[CDATA\\[([\\w\\W]*?)\\]\\]>)\|(?:!DOCTYPE([\\w\\W]*?)>)\|(?:\\?([^\\s\\/<>]+) ?([\\w\\W]*?)[?/]>)\|(?:\\/([A-Za-z][A-Za-z0-9\\-_\\:\\.]*)>)\|(?:([A-Za-z][A-Za-z0-9\\-_\\:\\.]*)((?:\\s+[^"'>]+(?:(?:"[^"]*")\|(?:'[^']*')\|[^>]*))*\|\\/\|\\s+)>)) | This regular expression only parses --> (capture group 1) and not --!> as a HTML comment end tag. |
| tst.js:22:6:22:33 | <!--([\\w\\W]*?)-->\|<([^>]*?)> | Comments ending with --> are matched differently from comments ending with --!>. The first is matched with capture group 1 and comments ending with --!> are matched with capture group 2. |

Просмотреть файл

@ -0,0 +1 @@
Security/CWE-116/BadTagFilter.ql

Просмотреть файл

@ -0,0 +1,28 @@
var filters = [
/<script.*?>.*?<\/script>/i, // NOT OK - doesn't match newlines or `</script >`
/<script.*?>.*?<\/script>/is, // NOT OK - doesn't match `</script >`
/<script.*?>.*?<\/script[^>]*>/is, // OK
/<!--.*-->/is, // OK - we don't care regexps that only match comments
/<!--.*--!?>/is, // OK
/<!--.*--!?>/i, // NOT OK, does not match newlines
/<script.*?>(.|\s)*?<\/script[^>]*>/i, // NOT OK - doesn't match inside the script tag
/<script[^>]*?>.*?<\/script[^>]*>/i, // NOT OK - doesn't match newlines inside the content
/<script(\s|\w|=|")*?>.*?<\/script[^>]*>/is, // NOT OK - does not match single quotes for attribute values
/<script(\s|\w|=|')*?>.*?<\/script[^>]*>/is, // NOT OK - does not match double quotes for attribute values
/<script( |\n|\w|=|'|")*?>.*?<\/script[^>]*>/is, // NOT OK - does not match tabs between attributes
/<script.*?>.*?<\/script[^>]*>/s, // NOT OK - does not match uppercase SCRIPT tags
/<(script|SCRIPT).*?>.*?<\/(script|SCRIPT)[^>]*>/s, // NOT OK - does not match mixed case script tags
/<script[^>]*?>[\s\S]*?<\/script.*>/i, // NOT OK - doesn't match newlines in the end tag
/<script[^>]*?>[\s\S]*?<\/script[^>]*?>/i, // OK
/<script\b[^>]*>([\s\S]*?)<\/script>/gi, // NOT OK - too strict matching on the end tag
/<(?:!--([\S|\s]*?)-->)|([^\/\s>]+)[\S\s]*?>/, // NOT OK - doesn't match comments with the right capture groups
/<(?:(?:\/([^>]+)>)|(?:!--([\S|\s]*?)-->)|(?:([^\/\s>]+)((?:\s+[\w\-:.]+(?:\s*=\s*?(?:(?:"[^"]*")|(?:'[^']*')|[^\s"'\/>]+))?)*)[\S\s]*?(\/?)>))/, // NOT OK - capture groups
/(<[a-z\/!$]("[^"]*"|'[^']*'|[^'">])*>|<!(--.*?--\s*)+>)/gi, // NOT OK - capture groups
/<(?:(?:!--([\w\W]*?)-->)|(?:!\[CDATA\[([\w\W]*?)\]\]>)|(?:!DOCTYPE([\w\W]*?)>)|(?:\?([^\s\/<>]+) ?([\w\W]*?)[?/]>)|(?:\/([A-Za-z][A-Za-z0-9\-_\:\.]*)>)|(?:([A-Za-z][A-Za-z0-9\-_\:\.]*)((?:\s+[^"'>]+(?:(?:"[^"]*")|(?:'[^']*')|[^>]*))*|\/|\s+)>))/g, // NOT OK - capture groups
/<!--([\w\W]*?)-->|<([^>]*?)>/g, // NOT OK - capture groups
]
doFilters(filters)
var strip = '<script([^>]*)>([\\S\\s]*?)<\/script([^>]*)>'; // OK - it's used with the ignorecase flag
new RegExp(strip, 'gi');

Просмотреть файл

@ -0,0 +1,4 @@
lgtm,codescanning
* A new query, `py/bad-tag-filter`, has been added to the query suite,
highlighting regular expressions that only match a subset of the HTML tags
it is supposed to match.

Просмотреть файл

@ -0,0 +1,306 @@
/**
* Provides precicates for reasoning about bad tag filter vulnerabilities.
*/
import performance.ReDoSUtil
/**
* A module for determining if a regexp matches a given string,
* and reasoning about which capture groups are filled by a given string.
*/
private module RegexpMatching {
/**
* A class to test whether a regular expression matches a string.
* Override this class and extend `test`/`testWithGroups` to configure which strings should be tested for acceptance by this regular expression.
* The result can afterwards be read from the `matches` predicate.
*
* Strings in the `testWithGroups` predicate are also tested for which capture groups are filled by the given string.
* The result is available in the `fillCaptureGroup` predicate.
*/
abstract class MatchedRegExp extends RegExpTerm {
MatchedRegExp() { this.isRootTerm() }
/**
* Holds if it should be tested whether this regular expression matches `str`.
*
* If `ignorePrefix` is true, then a regexp without a start anchor will be treated as if it had a start anchor.
* E.g. a regular expression `/foo$/` will match any string that ends with "foo",
* but if `ignorePrefix` is true, it will only match "foo".
*/
predicate test(string str, boolean ignorePrefix) {
none() // maybe overriden in subclasses
}
/**
* Same as `test(..)`, but where the `fillsCaptureGroup` afterwards tells which capture groups were filled by the given string.
*/
predicate testWithGroups(string str, boolean ignorePrefix) {
none() // maybe overriden in subclasses
}
/**
* Holds if `regexp` matches `str`, where `str` is either in the `test` or `testWithGroups` predicate.
*/
final predicate matches(string str) {
exists(State state | state = getAState(this, str.length() - 1, str, _) |
epsilonSucc*(state) = Accept(_)
)
}
/**
* Holds if matching `str` may fill capture group number `g`.
* Only holds if `str` is in the `testWithGroups` predicate.
*/
final predicate fillsCaptureGroup(string str, int g) {
exists(State s |
s = getAStateThatReachesAccept(this, _, str, _) and
g = group(s.getRepr())
)
}
}
/**
* Gets a state the regular expression `reg` can be in after matching the `i`th char in `str`.
* The regular expression is modelled as a non-determistic finite automaton,
* the regular expression can therefore be in multiple states after matching a character.
*
* It's a forward search to all possible states, and there is thus no guarantee that the state is on a path to an accepting state.
*/
private State getAState(MatchedRegExp reg, int i, string str, boolean ignorePrefix) {
// start state, the -1 position before any chars have been matched
i = -1 and
(
reg.test(str, ignorePrefix)
or
reg.testWithGroups(str, ignorePrefix)
) and
result.getRepr().getRootTerm() = reg and
isStartState(result)
or
// recursive case
result = getAStateAfterMatching(reg, _, str, i, _, ignorePrefix)
}
/**
* Gets the next state after the `prev` state from `reg`.
* `prev` is the state after matching `fromIndex` chars in `str`,
* and the result is the state after matching `toIndex` chars in `str`.
*
* This predicate is used as a step relation in the forwards search (`getAState`),
* and also as a step relation in the later backwards search (`getAStateThatReachesAccept`).
*/
private State getAStateAfterMatching(
MatchedRegExp reg, State prev, string str, int toIndex, int fromIndex, boolean ignorePrefix
) {
// the basic recursive case - outlined into a noopt helper to make performance work out.
result = getAStateAfterMatchingAux(reg, prev, str, toIndex, fromIndex, ignorePrefix)
or
// we can skip past word boundaries if the next char is a non-word char.
fromIndex = toIndex and
prev.getRepr() instanceof RegExpWordBoundary and
prev = getAState(reg, toIndex, str, ignorePrefix) and
after(prev.getRepr()) = result and
str.charAt(toIndex + 1).regexpMatch("\\W") // \W matches any non-word char.
}
pragma[noopt]
private State getAStateAfterMatchingAux(
MatchedRegExp reg, State prev, string str, int toIndex, int fromIndex, boolean ignorePrefix
) {
prev = getAState(reg, fromIndex, str, ignorePrefix) and
fromIndex = toIndex - 1 and
exists(string char | char = str.charAt(toIndex) | specializedDeltaClosed(prev, char, result)) and
not discardedPrefixStep(prev, result, ignorePrefix)
}
/** Holds if a step from `prev` to `next` should be discarded when the `ignorePrefix` flag is set. */
private predicate discardedPrefixStep(State prev, State next, boolean ignorePrefix) {
prev = mkMatch(any(RegExpRoot r)) and
ignorePrefix = true and
next = prev
}
// The `deltaClosed` relation specialized to the chars that exists in strings tested by a `MatchedRegExp`.
private predicate specializedDeltaClosed(State prev, string char, State next) {
deltaClosed(prev, specializedGetAnInputSymbolMatching(char), next)
}
// The `getAnInputSymbolMatching` relation specialized to the chars that exists in strings tested by a `MatchedRegExp`.
pragma[noinline]
private InputSymbol specializedGetAnInputSymbolMatching(string char) {
exists(string s, MatchedRegExp r |
r.test(s, _)
or
r.testWithGroups(s, _)
|
char = s.charAt(_)
) and
result = getAnInputSymbolMatching(char)
}
/**
* Gets the `i`th state on a path to the accepting state when `reg` matches `str`.
* Starts with an accepting state as found by `getAState` and searches backwards
* to the start state through the reachable states (as found by `getAState`).
*
* This predicate holds the invariant that the result state can be reached with `i` steps from a start state,
* and an accepting state can be found after (`str.length() - 1 - i`) steps from the result.
* The result state is therefore always on a valid path where `reg` accepts `str`.
*
* This predicate is only used to find which capture groups a regular expression has filled,
* and thus the search is only performed for the strings in the `testWithGroups(..)` predicate.
*/
private State getAStateThatReachesAccept(
MatchedRegExp reg, int i, string str, boolean ignorePrefix
) {
// base base, reaches an accepting state from the last state in `getAState(..)`
reg.testWithGroups(str, ignorePrefix) and
i = str.length() - 1 and
result = getAState(reg, i, str, ignorePrefix) and
epsilonSucc*(result) = Accept(_)
or
// recursive case. `next` is the next state to be matched after matching `prev`.
// this predicate is doing a backwards search, so `prev` is the result we are looking for.
exists(State next, State prev, int fromIndex, int toIndex |
next = getAStateThatReachesAccept(reg, toIndex, str, ignorePrefix) and
next = getAStateAfterMatching(reg, prev, str, toIndex, fromIndex, ignorePrefix) and
i = fromIndex and
result = prev
)
}
/** Gets the capture group number that `term` belongs to. */
private int group(RegExpTerm term) {
exists(RegExpGroup grp | grp.getNumber() = result | term.getParent*() = grp)
}
}
/** A class to test whether a regular expression matches certain HTML tags. */
class HTMLMatchingRegExp extends RegexpMatching::MatchedRegExp {
HTMLMatchingRegExp() {
// the regexp must mention "<" and ">" explicitly.
forall(string angleBracket | angleBracket = ["<", ">"] |
any(RegExpConstant term | term.getValue().matches("%" + angleBracket + "%")).getRootTerm() =
this
)
}
override predicate testWithGroups(string str, boolean ignorePrefix) {
ignorePrefix = true and
str = ["<!-- foo -->", "<!-- foo --!>", "<!- foo ->", "<foo>", "<script>"]
}
override predicate test(string str, boolean ignorePrefix) {
ignorePrefix = true and
str =
[
"<!-- foo -->", "<!- foo ->", "<!-- foo --!>", "<!-- foo\n -->", "<script>foo</script>",
"<script \n>foo</script>", "<script >foo\n</script>", "<foo ></foo>", "<foo>",
"<foo src=\"foo\"></foo>", "<script>", "<script src=\"foo\"></script>",
"<script src='foo'></script>", "<SCRIPT>foo</SCRIPT>", "<script\tsrc=\"foo\"/>",
"<script\tsrc='foo'></script>", "<sCrIpT>foo</ScRiPt>", "<script src=\"foo\">foo</script >",
"<script src=\"foo\">foo</script foo=\"bar\">", "<script src=\"foo\">foo</script\t\n bar>"
]
}
}
/**
* Holds if `regexp` matches some HTML tags, but misses some HTML tags that it should match.
*
* When adding a new case to this predicate, make sure the test string used in `matches(..)` calls are present in `HTMLMatchingRegExp::test` / `HTMLMatchingRegExp::testWithGroups`.
*/
predicate isBadRegexpFilter(HTMLMatchingRegExp regexp, string msg) {
// CVE-2021-33829 - matching both "<!-- foo -->" and "<!-- foo --!>", but in different capture groups
regexp.matches("<!-- foo -->") and
regexp.matches("<!-- foo --!>") and
exists(int a, int b | a != b |
regexp.fillsCaptureGroup("<!-- foo -->", a) and
// <!-- foo --> might be ambigously parsed (matching both capture groups), and that is ok here.
regexp.fillsCaptureGroup("<!-- foo --!>", b) and
not regexp.fillsCaptureGroup("<!-- foo --!>", a) and
msg =
"Comments ending with --> are matched differently from comments ending with --!>. The first is matched with capture group "
+ a + " and comments ending with --!> are matched with capture group " +
strictconcat(int i | regexp.fillsCaptureGroup("<!-- foo --!>", i) | i.toString(), ", ") +
"."
)
or
// CVE-2020-17480 - matching "<!-- foo -->" and other tags, but not "<!-- foo --!>".
exists(int group, int other |
group != other and
regexp.fillsCaptureGroup("<!-- foo -->", group) and
regexp.fillsCaptureGroup("<foo>", other) and
not regexp.matches("<!-- foo --!>") and
not regexp.fillsCaptureGroup("<!-- foo -->", any(int i | i != group)) and
not regexp.fillsCaptureGroup("<!- foo ->", group) and
not regexp.fillsCaptureGroup("<foo>", group) and
not regexp.fillsCaptureGroup("<script>", group) and
msg =
"This regular expression only parses --> (capture group " + group +
") and not --!> as a HTML comment end tag."
)
or
regexp.matches("<!-- foo -->") and
not regexp.matches("<!-- foo\n -->") and
not regexp.matches("<!- foo ->") and
not regexp.matches("<foo>") and
not regexp.matches("<script>") and
msg = "This regular expression does not match comments containing newlines."
or
regexp.matches("<script>foo</script>") and
regexp.matches("<script src=\"foo\"></script>") and
not regexp.matches("<foo ></foo>") and
(
not regexp.matches("<script \n>foo</script>") and
msg = "This regular expression matches <script></script>, but not <script \\n></script>"
or
not regexp.matches("<script >foo\n</script>") and
msg = "This regular expression matches <script>...</script>, but not <script >...\\n</script>"
)
or
regexp.matches("<script>foo</script>") and
regexp.matches("<script src=\"foo\"></script>") and
not regexp.matches("<script src='foo'></script>") and
not regexp.matches("<foo>") and
msg = "This regular expression does not match script tags where the attribute uses single-quotes."
or
regexp.matches("<script>foo</script>") and
regexp.matches("<script src='foo'></script>") and
not regexp.matches("<script src=\"foo\"></script>") and
not regexp.matches("<foo>") and
msg = "This regular expression does not match script tags where the attribute uses double-quotes."
or
regexp.matches("<script>foo</script>") and
regexp.matches("<script src='foo'></script>") and
not regexp.matches("<script\tsrc='foo'></script>") and
not regexp.matches("<foo>") and
not regexp.matches("<foo src=\"foo\"></foo>") and
msg = "This regular expression does not match script tags where tabs are used between attributes."
or
regexp.matches("<script>foo</script>") and
not RegExpFlags::isIgnoreCase(regexp) and
not regexp.matches("<foo>") and
not regexp.matches("<foo ></foo>") and
(
not regexp.matches("<SCRIPT>foo</SCRIPT>") and
msg = "This regular expression does not match upper case <SCRIPT> tags."
or
not regexp.matches("<sCrIpT>foo</ScRiPt>") and
regexp.matches("<SCRIPT>foo</SCRIPT>") and
msg = "This regular expression does not match mixed case <sCrIpT> tags."
)
or
regexp.matches("<script src=\"foo\"></script>") and
not regexp.matches("<foo>") and
not regexp.matches("<foo ></foo>") and
(
not regexp.matches("<script src=\"foo\">foo</script >") and
msg = "This regular expression does not match script end tags like </script >."
or
not regexp.matches("<script src=\"foo\">foo</script foo=\"bar\">") and
msg = "This regular expression does not match script end tags like </script foo=\"bar\">."
or
not regexp.matches("<script src=\"foo\">foo</script\t\n bar>") and
msg = "This regular expression does not match script end tags like </script\\t\\n bar>."
)
}

Просмотреть файл

@ -544,7 +544,7 @@ private State before(RegExpTerm t) { result = Match(t, 0) }
/**
* Gets a state the NFA may be in after matching `t`.
*/
private State after(RegExpTerm t) {
State after(RegExpTerm t) {
exists(RegExpAlt alt | t = alt.getAChild() | result = after(alt))
or
exists(RegExpSequence seq, int i | t = seq.getChild(i) |
@ -673,7 +673,7 @@ RegExpRoot getRoot(RegExpTerm term) {
/**
* A state in the NFA.
*/
private newtype TState =
newtype TState =
/**
* A state representing that the NFA is about to match a term.
* `i` is used to index into multi-char literals.

Просмотреть файл

@ -0,0 +1,55 @@
<!DOCTYPE qhelp PUBLIC
"-//Semmle//qhelp//EN"
"qhelp.dtd">
<qhelp>
<overview>
<p>
Parsing general HTML using regular expressions is impossible, however it is possible to match
single HTML tags. However, if the regexp is not written well it might be easy
to circumvent the regexp, which can lead to XSS or other security issues.
</p>
<p>
Many of these mistakes are caused by browsers having very forgiving HTML parsers:
Browsers will often render invalid HTML with parser errors.
Regular expressions that attempt to match HTML must recognize tags containing these parser errors.
</p>
</overview>
<recommendation>
<p>
Use a well-tested sanitization or parser library if at all possible. These libraries are much more
likely to handle corner cases correctly than a custom implementation.
</p>
</recommendation>
<example>
<p>
For example, assume we want to write a function that filters out all <code>&lt;script&gt;</code> tags.
Such a function might be written like below:
</p>
<sample src="examples/BadTagFilter.py" />
<p>
This sanitizer does not filter out all <code>&lt;script&gt;</code> tags.
Browsers will not only accept <code>&lt;/script&gt;</code> as script end tags, but also tags such as <code>&lt;/script foo="bar"&gt;</code> even though it is a parser error.
This means that an attack string such as <code>&lt;script&gt;alert(1)&lt;/script foo="bar"&gt;</code> will not be filtered by
the function, but <code>alert(1)</code> will be executed by a browser if the string is rendered as HTML.
</p>
<p>
Other corner cases include that HTML comments can end with <code>--!&gt;</code>,
and that HTML tag names can contain upper case characters.
</p>
</example>
<references>
<li>Securitum: <a href="https://research.securitum.com/the-curious-case-of-copy-paste/">The Curious Case of Copy &amp; Paste</a>.</li>
<li>stackoverflow.com: <a href="https://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags#answer-1732454">You can't parse [X]HTML with regex</a>.</li>
<li>HTML Standard: <a href="https://html.spec.whatwg.org/multipage/parsing.html#comment-end-bang-state">Comment end bang state</a>.</li>
<li>stackoverflow.com: <a href="https://stackoverflow.com/questions/25559999/why-arent-browsers-strict-about-html">Why aren't browsers strict about HTML?</a>.</li>
</references>
</qhelp>

Просмотреть файл

@ -0,0 +1,19 @@
/**
* @name Bad HTML filtering regexp
* @description Matching HTML tags using regular expressions is hard to do right, and can easily lead to security issues.
* @kind problem
* @problem.severity warning
* @security-severity 7.8
* @precision high
* @id py/bad-tag-filter
* @tags correctness
* security
* external/cwe/cwe-116
* external/cwe/cwe-020
*/
import semmle.python.security.BadTagFilterQuery
from HTMLMatchingRegExp regexp, string msg
where msg = min(string m | isBadRegexpFilter(regexp, m) | m order by m.length(), m) // there might be multiple, we arbitrarily pick the shortest one
select regexp, msg

Просмотреть файл

@ -0,0 +1,8 @@
import re
def filterScriptTags(content):
oldContent = ""
while oldContent != content:
oldContent = content
content = re.sub(r'<script.*?>.*?</script>', '', content, flags= re.DOTALL | re.IGNORECASE)
return content

Просмотреть файл

@ -0,0 +1,16 @@
| tst.py:4:20:4:43 | <script.*?>.*?<\\/script> | This regular expression does not match script end tags like </script >. |
| tst.py:5:20:5:43 | <script.*?>.*?<\\/script> | This regular expression does not match script end tags like </script >. |
| tst.py:9:20:9:30 | <!--.*--!?> | This regular expression does not match comments containing newlines. |
| tst.py:10:20:10:53 | <script.*?>(.\|\\s)*?<\\/script[^>]*> | This regular expression matches <script></script>, but not <script \\n></script> |
| tst.py:11:20:11:51 | <script[^>]*?>.*?<\\/script[^>]*> | This regular expression matches <script>...</script>, but not <script >...\\n</script> |
| tst.py:12:20:12:58 | <script(\\s\|\\w\|=\|")*?>.*?<\\/script[^>]*> | This regular expression does not match script tags where the attribute uses single-quotes. |
| tst.py:13:20:13:58 | <script(\\s\|\\w\|=\|')*?>.*?<\\/script[^>]*> | This regular expression does not match script tags where the attribute uses double-quotes. |
| tst.py:14:20:14:62 | <script( \|\\n\|\\w\|=\|'\|")*?>.*?<\\/script[^>]*> | This regular expression does not match script tags where tabs are used between attributes. |
| tst.py:15:20:15:48 | <script.*?>.*?<\\/script[^>]*> | This regular expression does not match upper case <SCRIPT> tags. |
| tst.py:16:20:16:66 | <(script\|SCRIPT).*?>.*?<\\/(script\|SCRIPT)[^>]*> | This regular expression does not match mixed case <sCrIpT> tags. |
| tst.py:17:20:17:53 | <script[^>]*?>[\\s\\S]*?<\\/script.*> | This regular expression does not match script end tags like </script\\t\\n bar>. |
| tst.py:19:20:19:54 | <script\\b[^>]*>([\\s\\S]*?)<\\/script> | This regular expression does not match script end tags like </script >. |
| tst.py:20:20:20:62 | <(?:!--([\\S\|\\s]*?)-->)\|([^\\/\\s>]+)[\\S\\s]*?> | Comments ending with --> are matched differently from comments ending with --!>. The first is matched with capture group 1 and comments ending with --!> are matched with capture group 2. |
| tst.py:21:20:21:161 | <(?:(?:\\/([^>]+)>)\|(?:!--([\\S\|\\s]*?)-->)\|(?:([^\\/\\s>]+)((?:\\s+[\\w\\-:.]+(?:\\s*=\\s*?(?:(?:"[^"]*")\|(?:'[^']*')\|[^\\s"'\\/>]+))?)*)[\\S\\s]*?(\\/?)>)) | Comments ending with --> are matched differently from comments ending with --!>. The first is matched with capture group 2 and comments ending with --!> are matched with capture group 3, 4. |
| tst.py:22:17:22:71 | (<[a-z\\/!$]("[^"]*"\|'[^']*'\|[^'">])*>\|<!(--.*?--\\s*)+>) | Comments ending with --> are matched differently from comments ending with --!>. The first is matched with capture group 3 and comments ending with --!> are matched with capture group 1. |
| tst.py:23:20:23:263 | <(?:(?:!--([\\w\\W]*?)-->)\|(?:!\\[CDATA\\[([\\w\\W]*?)\\]\\]>)\|(?:!DOCTYPE([\\w\\W]*?)>)\|(?:\\?([^\\s\\/<>]+) ?([\\w\\W]*?)[?/]>)\|(?:\\/([A-Za-z][A-Za-z0-9\\-_\\:\\.]*)>)\|(?:([A-Za-z][A-Za-z0-9\\-_\\:\\.]*)((?:\\s+[^"'>]+(?:(?:"[^"]*")\|(?:'[^']*')\|[^>]*))*\|\\/\|\\s+)>)) | This regular expression only parses --> (capture group 1) and not --!> as a HTML comment end tag. |

Просмотреть файл

@ -0,0 +1 @@
Security/CWE-116/BadTagFilter.ql

Просмотреть файл

@ -0,0 +1,26 @@
import re
filters = [
re.compile(r"""<script.*?>.*?<\/script>""", re.IGNORECASE), # NOT OK - doesn't match newlines or `</script >`
re.compile(r"""<script.*?>.*?<\/script>""", re.IGNORECASE | re.DOTALL), # NOT OK - doesn't match `</script >`
re.compile(r"""<script.*?>.*?<\/script[^>]*>""", re.IGNORECASE | re.DOTALL), # OK
re.compile(r"""<!--.*-->""", re.IGNORECASE | re.DOTALL), # OK - we don't care regexps that only match comments
re.compile(r"""<!--.*--!?>""", re.IGNORECASE | re.DOTALL), # OK
re.compile(r"""<!--.*--!?>""", re.IGNORECASE), # NOT OK, does not match newlines
re.compile(r"""<script.*?>(.|\s)*?<\/script[^>]*>""", re.IGNORECASE), # NOT OK - doesn't match inside the script tag
re.compile(r"""<script[^>]*?>.*?<\/script[^>]*>""", re.IGNORECASE), # NOT OK - doesn't match newlines inside the content
re.compile(r"""<script(\s|\w|=|")*?>.*?<\/script[^>]*>""", re.IGNORECASE | re.DOTALL), # NOT OK - does not match single quotes for attribute values
re.compile(r"""<script(\s|\w|=|')*?>.*?<\/script[^>]*>""", re.IGNORECASE | re.DOTALL), # NOT OK - does not match double quotes for attribute values
re.compile(r"""<script( |\n|\w|=|'|")*?>.*?<\/script[^>]*>""", re.IGNORECASE | re.DOTALL), # NOT OK - does not match tabs between attributes
re.compile(r"""<script.*?>.*?<\/script[^>]*>""", re.re.DOTALL), # NOT OK - does not match uppercase SCRIPT tags
re.compile(r"""<(script|SCRIPT).*?>.*?<\/(script|SCRIPT)[^>]*>""", re.DOTALL), # NOT OK - does not match mixed case script tags
re.compile(r"""<script[^>]*?>[\s\S]*?<\/script.*>""", re.IGNORECASE), # NOT OK - doesn't match newlines in the end tag
re.compile(r"""<script[^>]*?>[\s\S]*?<\/script[^>]*?>""", re.IGNORECASE), # OK
re.compile(r"""<script\b[^>]*>([\s\S]*?)<\/script>""", re.IGNORECASE | re.DOTALL), # NOT OK - too strict matching on the end tag
re.compile(r"""<(?:!--([\S|\s]*?)-->)|([^\/\s>]+)[\S\s]*?>"""), #// NOT OK - doesn't match comments with the right capture groups
re.compile(r"""<(?:(?:\/([^>]+)>)|(?:!--([\S|\s]*?)-->)|(?:([^\/\s>]+)((?:\s+[\w\-:.]+(?:\s*=\s*?(?:(?:"[^"]*")|(?:'[^']*')|[^\s"'\/>]+))?)*)[\S\s]*?(\/?)>))"""), # NOT OK - capture groups
re.compile(r"""(<[a-z\/!$]("[^"]*"|'[^']*'|[^'">])*>|<!(--.*?--\s*)+>)""", re.IGNORECASE), # NOT OK - capture groups
re.compile(r"""<(?:(?:!--([\w\W]*?)-->)|(?:!\[CDATA\[([\w\W]*?)\]\]>)|(?:!DOCTYPE([\w\W]*?)>)|(?:\?([^\s\/<>]+) ?([\w\W]*?)[?/]>)|(?:\/([A-Za-z][A-Za-z0-9\-_\:\.]*)>)|(?:([A-Za-z][A-Za-z0-9\-_\:\.]*)((?:\s+[^"'>]+(?:(?:"[^"]*")|(?:'[^']*')|[^>]*))*|\/|\s+)>))"""), # NOT OK - capture groups
]
doFilters(filters)