Python: treat \A, \Z, \b, \B as special chars, not escapes

This commit is contained in:
Nick Rolfe 2021-11-19 15:49:53 +00:00
Родитель f63c768d9f
Коммит df6ba43cca
8 изменённых файлов: 67 добавлений и 16 удалений

Просмотреть файл

@ -539,8 +539,8 @@ private int toHex(string hex) {
/**
* A word boundary, that is, a regular expression term of the form `\b`.
*/
class RegExpWordBoundary extends RegExpEscape {
RegExpWordBoundary() { this.getUnescaped() = "b" }
class RegExpWordBoundary extends RegExpSpecialChar {
RegExpWordBoundary() { this.getChar() = "\\b" }
}
/**
@ -809,7 +809,7 @@ class RegExpDot extends RegExpSpecialChar {
}
/**
* A dollar assertion `$` matching the end of a line.
* A dollar assertion `$` matching the end of a line, or `\Z` matching the end of the string.
*
* Example:
*
@ -818,13 +818,13 @@ class RegExpDot extends RegExpSpecialChar {
* ```
*/
class RegExpDollar extends RegExpSpecialChar {
RegExpDollar() { this.getChar() = "$" }
RegExpDollar() { this.getChar() = ["$", "\\Z"] }
override string getPrimaryQLClass() { result = "RegExpDollar" }
}
/**
* A caret assertion `^` matching the beginning of a line.
* A caret assertion `^` matching the beginning of a line, or `\A` matching the beginning of the string.
*
* Example:
*
@ -833,7 +833,7 @@ class RegExpDollar extends RegExpSpecialChar {
* ```
*/
class RegExpCaret extends RegExpSpecialChar {
RegExpCaret() { this.getChar() = "^" }
RegExpCaret() { this.getChar() = ["^", "\\A"] }
override string getPrimaryQLClass() { result = "RegExpCaret" }
}

Просмотреть файл

@ -437,11 +437,18 @@ abstract class RegexString extends Expr {
}
predicate specialCharacter(int start, int end, string char) {
not this.inCharSet(start) and
this.character(start, end) and
end = start + 1 and
char = this.getChar(start) and
(char = "$" or char = "^" or char = ".") and
not this.inCharSet(start)
(
end = start + 1 and
char = this.getChar(start) and
(char = "$" or char = "^" or char = ".")
or
end = start + 2 and
this.escapingChar(start) and
char = this.getText().substring(start, end) and
char = ["\\A", "\\Z", "\\b", "\\B"]
)
}
/** Whether the text in the range start,end is a group */
@ -901,7 +908,8 @@ abstract class RegexString extends Expr {
exists(int x | this.firstPart(x, end) |
this.emptyMatchAtStartGroup(x, start) or
this.qualifiedItem(x, start, true, _) or
this.specialCharacter(x, start, "^")
// ^ and \A match the start of the string
this.specialCharacter(x, start, ["^", "\\A"])
)
or
exists(int y | this.firstPart(start, y) |
@ -926,9 +934,8 @@ abstract class RegexString extends Expr {
or
this.qualifiedItem(end, y, true, _)
or
this.specialCharacter(end, y, "$")
or
y = end + 2 and this.escapingChar(end) and this.getChar(end + 1) = "Z"
// $ and \Z match the end of the string.
this.specialCharacter(end, y, ["$", "\\Z"])
)
or
exists(int x |

Просмотреть файл

@ -58,6 +58,11 @@
| \\A[+-]?\\d+ | 3 | 4 |
| \\A[+-]?\\d+ | 4 | 5 |
| \\A[+-]?\\d+ | 7 | 9 |
| \\Afoo\\Z | 0 | 2 |
| \\Afoo\\Z | 2 | 3 |
| \\Afoo\\Z | 3 | 4 |
| \\Afoo\\Z | 4 | 5 |
| \\Afoo\\Z | 5 | 7 |
| \\[(?P<txt>[^[]*)\\]\\((?P<uri>[^)]*) | 0 | 2 |
| \\[(?P<txt>[^[]*)\\]\\((?P<uri>[^)]*) | 12 | 13 |
| \\[(?P<txt>[^[]*)\\]\\((?P<uri>[^)]*) | 16 | 18 |
@ -71,6 +76,11 @@
| \\\|\\[\\][123]\|\\{\\} | 9 | 10 |
| \\\|\\[\\][123]\|\\{\\} | 12 | 14 |
| \\\|\\[\\][123]\|\\{\\} | 14 | 16 |
| \\bfoo\\B | 0 | 2 |
| \\bfoo\\B | 2 | 3 |
| \\bfoo\\B | 3 | 4 |
| \\bfoo\\B | 4 | 5 |
| \\bfoo\\B | 5 | 7 |
| \|x | 1 | 2 |
| ^(^y\|^z)(u$\|v$)$ | 0 | 1 |
| ^(^y\|^z)(u$\|v$)$ | 2 | 3 |

Просмотреть файл

@ -45,8 +45,16 @@
| \\+0 | first | 0 | 2 |
| \\+0 | last | 2 | 3 |
| \\A[+-]?\\d+ | first | 0 | 2 |
| \\A[+-]?\\d+ | first | 2 | 6 |
| \\A[+-]?\\d+ | first | 2 | 7 |
| \\A[+-]?\\d+ | first | 7 | 9 |
| \\A[+-]?\\d+ | first | 7 | 10 |
| \\A[+-]?\\d+ | last | 7 | 9 |
| \\A[+-]?\\d+ | last | 7 | 10 |
| \\Afoo\\Z | first | 0 | 2 |
| \\Afoo\\Z | first | 2 | 3 |
| \\Afoo\\Z | last | 4 | 5 |
| \\Afoo\\Z | last | 5 | 7 |
| \\[(?P<txt>[^[]*)\\]\\((?P<uri>[^)]*) | first | 0 | 2 |
| \\[(?P<txt>[^[]*)\\]\\((?P<uri>[^)]*) | last | 28 | 32 |
| \\[(?P<txt>[^[]*)\\]\\((?P<uri>[^)]*) | last | 28 | 33 |
@ -54,6 +62,8 @@
| \\\|\\[\\][123]\|\\{\\} | first | 12 | 14 |
| \\\|\\[\\][123]\|\\{\\} | last | 6 | 11 |
| \\\|\\[\\][123]\|\\{\\} | last | 14 | 16 |
| \\bfoo\\B | first | 0 | 2 |
| \\bfoo\\B | last | 5 | 7 |
| \|x | first | 1 | 2 |
| \|x | last | 1 | 2 |
| ^(^y\|^z)(u$\|v$)$ | first | 0 | 1 |

Просмотреть файл

@ -116,7 +116,7 @@
| \\+0 | char | 0 | 2 |
| \\+0 | char | 2 | 3 |
| \\+0 | sequence | 0 | 3 |
| \\A[+-]?\\d+ | char | 0 | 2 |
| \\A[+-]?\\d+ | \\A | 0 | 2 |
| \\A[+-]?\\d+ | char | 3 | 4 |
| \\A[+-]?\\d+ | char | 4 | 5 |
| \\A[+-]?\\d+ | char | 7 | 9 |
@ -124,6 +124,12 @@
| \\A[+-]?\\d+ | qualified | 2 | 7 |
| \\A[+-]?\\d+ | qualified | 7 | 10 |
| \\A[+-]?\\d+ | sequence | 0 | 10 |
| \\Afoo\\Z | \\A | 0 | 2 |
| \\Afoo\\Z | \\Z | 5 | 7 |
| \\Afoo\\Z | char | 2 | 3 |
| \\Afoo\\Z | char | 3 | 4 |
| \\Afoo\\Z | char | 4 | 5 |
| \\Afoo\\Z | sequence | 0 | 7 |
| \\[(?P<txt>[^[]*)\\]\\((?P<uri>[^)]*) | char | 0 | 2 |
| \\[(?P<txt>[^[]*)\\]\\((?P<uri>[^)]*) | char | 12 | 13 |
| \\[(?P<txt>[^[]*)\\]\\((?P<uri>[^)]*) | char | 16 | 18 |
@ -148,6 +154,12 @@
| \\\|\\[\\][123]\|\\{\\} | choice | 0 | 16 |
| \\\|\\[\\][123]\|\\{\\} | sequence | 0 | 11 |
| \\\|\\[\\][123]\|\\{\\} | sequence | 12 | 16 |
| \\bfoo\\B | \\B | 5 | 7 |
| \\bfoo\\B | \\b | 0 | 2 |
| \\bfoo\\B | char | 2 | 3 |
| \\bfoo\\B | char | 3 | 4 |
| \\bfoo\\B | char | 4 | 5 |
| \\bfoo\\B | sequence | 0 | 7 |
| \|x | char | 1 | 2 |
| \|x | choice | 0 | 2 |
| \|x | sequence | 1 | 2 |

Просмотреть файл

@ -73,3 +73,7 @@ escaped = re.escape("https://www.humblebundle.com/home/library")
# Consistency check
baz = re.compile(r'\+0')
# Anchors
re.compile(r'\Afoo\Z')
re.compile(r'\bfoo\B')

Просмотреть файл

@ -100,5 +100,8 @@
| redos.py:371:25:371:35 | (\\u0061\|a)* | This part of the regular expression may cause exponential backtracking on strings starting with 'X' and containing many repetitions of 'a'. |
| redos.py:380:35:380:41 | [^"\\s]+ | This part of the regular expression may cause exponential backtracking on strings starting with '/' and containing many repetitions of '!'. |
| redos.py:381:35:381:41 | [^"\\s]+ | This part of the regular expression may cause exponential backtracking on strings starting with '/' and containing many repetitions of '!'. |
| redos.py:384:26:384:32 | (\\d\|0)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '0'. |
| redos.py:385:24:385:30 | (\\d\|0)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '0'. |
| redos.py:386:26:386:32 | (\\d\|0)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '0'. |
| unittests.py:5:17:5:23 | (\u00c6\|\\\u00c6)+ | This part of the regular expression may cause exponential backtracking on strings starting with 'X' and containing many repetitions of '\u00c6'. |
| unittests.py:9:16:9:24 | (?:.\|\\n)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\\n'. |

Просмотреть файл

@ -378,4 +378,9 @@ good44 = re.compile(r'("[^"]*?"|[^"\s]+)+(?=\s*|\s*$)')
# BAD
bad88 = re.compile(r'/("[^"]*?"|[^"\s]+)+(?=\s*|\s*$)X')
bad89 = re.compile(r'/("[^"]*?"|[^"\s]+)+(?=X)')
bad89 = re.compile(r'/("[^"]*?"|[^"\s]+)+(?=X)')
# BAD
bad90 = re.compile(r'\A(\d|0)*x')
bad91 = re.compile(r'(\d|0)*\Z')
bad92 = re.compile(r'\b(\d|0)*x')