[ruby/yarp] fix: octal, hex, and unicode strings at the end of a file (https://github.com/ruby/yarp/pull/1371)

* refactor: move EOF check into yp_unescape_calculate_difference

parser_lex is a bit more readable when we can rely on that behavior

* fix: octal and hex digits at the end of a file

Previously this resulted in invalid memory access.

* fix: unicode strings at the end of a file

Previously this resulted in invalid memory access.

* Unterminated curly-bracket unicode is a syntax error

https://github.com/ruby/yarp/commit/21cf11acb5
This commit is contained in:
Mike Dalessio 2023-08-31 18:40:29 -04:00, committed by git
Parent 9930363aab
Commit df4c77608e
4 changed files: 66 additions and 38 deletions

View file

@ -621,6 +621,13 @@ module YARP
]
end
def test_unterminated_unicode_brackets_should_be_a_syntax_error
assert_errors expression('?\\u{3'), '?\\u{3', [
["invalid Unicode escape.", 1..5],
["invalid Unicode escape.", 1..5],
]
end
def test_method_parameters_after_block
expected = DefNode(
Location(),

View file

@ -22,6 +22,19 @@ module YARP
snippet "incomplete escaped list", "%w[\\"
snippet "incomplete escaped regex", "/a\\"
snippet "unterminated heredoc with unterminated escape at end of file", "<<A\n\\"
snippet "escaped octal at end of file 1", '"\\3'
snippet "escaped octal at end of file 2", '"\\33'
snippet "escaped hex at end of file 1", '"\\x'
snippet "escaped hex at end of file 2", '"\\x3'
snippet "escaped unicode at end of file 1", '"\\u{3'
snippet "escaped unicode at end of file 2", '"\\u{33'
snippet "escaped unicode at end of file 3", '"\\u{333'
snippet "escaped unicode at end of file 4", '"\\u{3333'
snippet "escaped unicode at end of file 5", '"\\u{33333'
snippet "escaped unicode at end of file 6", '"\\u{333333'
snippet "escaped unicode at end of file 7", '"\\u3'
snippet "escaped unicode at end of file 8", '"\\u33'
snippet "escaped unicode at end of file 9", '"\\u333'
snippet "statements node with multiple heredocs", <<~EOF
for <<A + <<B

View file

@ -69,17 +69,15 @@ char_is_ascii_printable(const uint8_t b) {
// Scan the 1-3 digits of octal into the value. Returns the number of digits
// scanned.
static inline size_t
unescape_octal(const uint8_t *backslash, uint8_t *value) {
unescape_octal(const uint8_t *backslash, uint8_t *value, const uint8_t *end) {
*value = (uint8_t) (backslash[1] - '0');
if (!yp_char_is_octal_digit(backslash[2])) {
if (backslash + 2 >= end || !yp_char_is_octal_digit(backslash[2])) {
return 2;
}
*value = (uint8_t) ((*value << 3) | (backslash[2] - '0'));
if (!yp_char_is_octal_digit(backslash[3])) {
if (backslash + 3 >= end || !yp_char_is_octal_digit(backslash[3])) {
return 3;
}
*value = (uint8_t) ((*value << 3) | (backslash[3] - '0'));
return 4;
}
@ -93,12 +91,15 @@ unescape_hexadecimal_digit(const uint8_t value) {
// Scan the 1-2 digits of hexadecimal into the value. Returns the number of
// digits scanned.
static inline size_t
unescape_hexadecimal(const uint8_t *backslash, uint8_t *value) {
unescape_hexadecimal(const uint8_t *backslash, uint8_t *value, const uint8_t *end) {
*value = 0;
if (backslash + 2 >= end || !yp_char_is_hexadecimal_digit(backslash[2])) {
return 2;
}
*value = unescape_hexadecimal_digit(backslash[2]);
if (!yp_char_is_hexadecimal_digit(backslash[3])) {
if (backslash + 3 >= end || !yp_char_is_hexadecimal_digit(backslash[3])) {
return 3;
}
*value = (uint8_t) ((*value << 4) | unescape_hexadecimal_digit(backslash[3]));
return 4;
}
@ -204,7 +205,7 @@ unescape(yp_parser_t *parser, uint8_t *dest, size_t *dest_length, const uint8_t
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9': {
uint8_t value;
const uint8_t *cursor = backslash + unescape_octal(backslash, &value);
const uint8_t *cursor = backslash + unescape_octal(backslash, &value, end);
if (dest) {
dest[(*dest_length)++] = unescape_char(value, flags);
@ -214,7 +215,7 @@ unescape(yp_parser_t *parser, uint8_t *dest, size_t *dest_length, const uint8_t
// \xnn hexadecimal bit pattern, where nn is 1-2 hexadecimal digits ([0-9a-fA-F])
case 'x': {
uint8_t value;
const uint8_t *cursor = backslash + unescape_hexadecimal(backslash, &value);
const uint8_t *cursor = backslash + unescape_hexadecimal(backslash, &value, end);
if (dest) {
dest[(*dest_length)++] = unescape_char(value, flags);
@ -236,13 +237,14 @@ unescape(yp_parser_t *parser, uint8_t *dest, size_t *dest_length, const uint8_t
unicode_cursor += yp_strspn_whitespace(unicode_cursor, end - unicode_cursor);
while ((*unicode_cursor != '}') && (unicode_cursor < end)) {
while ((unicode_cursor < end) && (*unicode_cursor != '}')) {
const uint8_t *unicode_start = unicode_cursor;
size_t hexadecimal_length = yp_strspn_hexadecimal_digit(unicode_cursor, end - unicode_cursor);
// \u{nnnn} character literal allows only 1-6 hexadecimal digits
if (hexadecimal_length > 6)
if (hexadecimal_length > 6) {
yp_diagnostic_list_append(&parser->error_list, unicode_cursor, unicode_cursor + hexadecimal_length, "invalid Unicode escape.");
}
// there are not hexadecimal characters
if (hexadecimal_length == 0) {
@ -269,10 +271,16 @@ unescape(yp_parser_t *parser, uint8_t *dest, size_t *dest_length, const uint8_t
if (flags & YP_UNESCAPE_FLAG_EXPECT_SINGLE && codepoints_count > 1)
yp_diagnostic_list_append(&parser->error_list, extra_codepoints_start, unicode_cursor - 1, "Multiple codepoints at single character literal");
return unicode_cursor + 1;
if (unicode_cursor < end && *unicode_cursor == '}') {
unicode_cursor++;
} else {
yp_diagnostic_list_append(&parser->error_list, backslash, unicode_cursor, "invalid Unicode escape.");
}
return unicode_cursor;
}
if ((backslash + 2) < end && yp_char_is_hexadecimal_digits(backslash + 2, 4)) {
if ((backslash + 5) < end && yp_char_is_hexadecimal_digits(backslash + 2, 4)) {
uint32_t value;
unescape_unicode(backslash + 2, 4, &value);
@ -538,6 +546,10 @@ size_t
yp_unescape_calculate_difference(yp_parser_t *parser, const uint8_t *backslash, yp_unescape_type_t unescape_type, bool expect_single_codepoint) {
assert(unescape_type != YP_UNESCAPE_NONE);
if (backslash + 1 >= parser->end) {
return 0;
}
switch (backslash[1]) {
case '\\':
case '\'':

View file

@ -7002,17 +7002,16 @@ parser_lex(yp_parser_t *parser) {
// literally. In this case we'll skip past the next character
// and find the next breakpoint.
if (*breakpoint == '\\') {
// Check that we're not at the end of the file.
if (breakpoint + 1 >= parser->end) {
yp_unescape_type_t unescape_type = lex_mode->as.list.interpolation ? YP_UNESCAPE_ALL : YP_UNESCAPE_MINIMAL;
size_t difference = yp_unescape_calculate_difference(parser, breakpoint, unescape_type, false);
if (difference == 0) {
// we're at the end of the file
breakpoint = NULL;
continue;
}
yp_unescape_type_t unescape_type = lex_mode->as.list.interpolation ? YP_UNESCAPE_ALL : YP_UNESCAPE_MINIMAL;
size_t difference = yp_unescape_calculate_difference(parser, breakpoint, unescape_type, false);
// If the result is an escaped newline ...
if (*(breakpoint + difference - 1) == '\n') {
if (breakpoint[difference - 1] == '\n') {
if (parser->heredoc_end) {
// ... if we are on the same line as a heredoc, flush the heredoc and
// continue parsing after heredoc_end.
@ -7141,16 +7140,15 @@ parser_lex(yp_parser_t *parser) {
// literally. In this case we'll skip past the next character
// and find the next breakpoint.
if (*breakpoint == '\\') {
// Check that we're not at the end of the file.
if (breakpoint + 1 >= parser->end) {
size_t difference = yp_unescape_calculate_difference(parser, breakpoint, YP_UNESCAPE_ALL, false);
if (difference == 0) {
// we're at the end of the file
breakpoint = NULL;
continue;
}
size_t difference = yp_unescape_calculate_difference(parser, breakpoint, YP_UNESCAPE_ALL, false);
// If the result is an escaped newline ...
if (*(breakpoint + difference - 1) == '\n') {
if (breakpoint[difference - 1] == '\n') {
if (parser->heredoc_end) {
// ... if we are on the same line as a heredoc, flush the heredoc and
// continue parsing after heredoc_end.
@ -7293,20 +7291,19 @@ parser_lex(yp_parser_t *parser) {
breakpoint = yp_strpbrk(parser, breakpoint + 1, breakpoints, parser->end - (breakpoint + 1));
break;
case '\\': {
// Check that we're not at the end of the file.
if (breakpoint + 1 >= parser->end) {
breakpoint = NULL;
break;
}
// If we hit escapes, then we need to treat the next token
// literally. In this case we'll skip past the next character and
// find the next breakpoint.
yp_unescape_type_t unescape_type = parser->lex_modes.current->as.string.interpolation ? YP_UNESCAPE_ALL : YP_UNESCAPE_MINIMAL;
size_t difference = yp_unescape_calculate_difference(parser, breakpoint, unescape_type, false);
if (difference == 0) {
// we're at the end of the file
breakpoint = NULL;
break;
}
// If the result is an escaped newline ...
if (*(breakpoint + difference - 1) == '\n') {
if (breakpoint[difference - 1] == '\n') {
if (parser->heredoc_end) {
// ... if we are on the same line as a heredoc, flush the heredoc and
// continue parsing after heredoc_end.
@ -7463,12 +7460,6 @@ parser_lex(yp_parser_t *parser) {
break;
}
case '\\': {
// Check that we're not at the end of the file.
if (breakpoint + 1 >= parser->end) {
breakpoint = NULL;
break;
}
// If we hit an escape, then we need to skip past
// however many characters the escape takes up. However
// it's important that if \n or \r\n are escaped that we
@ -7481,6 +7472,11 @@ parser_lex(yp_parser_t *parser) {
} else {
yp_unescape_type_t unescape_type = (quote == YP_HEREDOC_QUOTE_SINGLE) ? YP_UNESCAPE_MINIMAL : YP_UNESCAPE_ALL;
size_t difference = yp_unescape_calculate_difference(parser, breakpoint, unescape_type, false);
if (difference == 0) {
// we're at the end of the file
breakpoint = NULL;
break;
}
yp_newline_list_check_append(&parser->newline_list, breakpoint + difference - 1);