[ruby/yarp] Properly handle invalid underscores in number literals

https://github.com/ruby/yarp/commit/35da3d1a4c
2023-09-15 10:40:18 -04:00 · 2023-09-15 10:40:18 -04:00 · 18780c22f6
--- a/test/yarp/errors_test.rb
+++ b/test/yarp/errors_test.rb
@ -1237,6 +1237,24 @@ module YARP
      assert_errors expression(source), source, errors, compare_ripper: false
    end

+    def test_invalid_number_underscores
+      error_messages = ["Invalid underscore placement in number"]
+
+      assert_error_messages "1__1", error_messages
+      assert_error_messages "0b1__1", error_messages
+      assert_error_messages "0o1__1", error_messages
+      assert_error_messages "01__1", error_messages
+      assert_error_messages "0d1__1", error_messages
+      assert_error_messages "0x1__1", error_messages
+
+      assert_error_messages "1_1_", error_messages
+      assert_error_messages "0b1_1_", error_messages
+      assert_error_messages "0o1_1_", error_messages
+      assert_error_messages "01_1_", error_messages
+      assert_error_messages "0d1_1_", error_messages
+      assert_error_messages "0x1_1_", error_messages
+    end
+
    private

    def assert_errors(expected, source, errors, compare_ripper: RUBY_ENGINE == "ruby")
--- a/yarp/diagnostic.c
+++ b/yarp/diagnostic.c
@ -164,6 +164,7 @@ static const char* const diagnostic_messages[YP_DIAGNOSTIC_ID_LEN] = {
    [YP_ERR_INVALID_NUMBER_DECIMAL]             = "Invalid decimal number",
    [YP_ERR_INVALID_NUMBER_HEXADECIMAL]         = "Invalid hexadecimal number",
    [YP_ERR_INVALID_NUMBER_OCTAL]               = "Invalid octal number",
+    [YP_ERR_INVALID_NUMBER_UNDERSCORE]          = "Invalid underscore placement in number",
    [YP_ERR_INVALID_PERCENT]                    = "Invalid `%` token", // TODO WHAT?
    [YP_ERR_INVALID_TOKEN]                      = "Invalid token", // TODO WHAT?
    [YP_ERR_INVALID_VARIABLE_GLOBAL]            = "Invalid global variable",
--- a/yarp/diagnostic.h
+++ b/yarp/diagnostic.h
@ -130,6 +130,7 @@ typedef enum {
    YP_ERR_INVALID_NUMBER_DECIMAL,
    YP_ERR_INVALID_NUMBER_HEXADECIMAL,
    YP_ERR_INVALID_NUMBER_OCTAL,
+    YP_ERR_INVALID_NUMBER_UNDERSCORE,
    YP_ERR_INVALID_PERCENT,
    YP_ERR_INVALID_TOKEN,
    YP_ERR_INVALID_VARIABLE_GLOBAL,
--- a/yarp/util/yp_char.c
+++ b/yarp/util/yp_char.c
@ -123,6 +123,9 @@ yp_char_is_inline_whitespace(const uint8_t b) {
    return yp_char_is_char_kind(b, YP_CHAR_BIT_INLINE_WHITESPACE);
 }

+// Scan through the string and return the number of characters at the start of
+// the string that match the given kind. Disallows searching past the given
+// maximum number of characters.
 static inline size_t
 yp_strspn_number_kind(const uint8_t *string, ptrdiff_t length, uint8_t kind) {
    if (length <= 0) return 0;
@ -134,20 +137,57 @@ yp_strspn_number_kind(const uint8_t *string, ptrdiff_t length, uint8_t kind) {
    return size;
 }

+// Return how many leading characters of string match the given kind, looking
+// at no more than length characters (a non-positive length returns 0).
+//
+// When underscores appear back to back, or the matched run ends with one, the
+// out parameter invalid is pointed at the last such underscore; otherwise it is
+// left untouched, so callers should initialize it (e.g. to NULL) beforehand.
+static inline size_t
+yp_strspn_number_kind_underscores(const uint8_t *string, ptrdiff_t length, const uint8_t **invalid, uint8_t kind) {
+    if (length <= 0) return 0;
+
+    size_t size = 0;
+    size_t maximum = (size_t) length;
+
+    bool underscore = false;
+    while (size < maximum && (yp_number_table[string[size]] & kind)) {
+        if (string[size] == '_') {
+            if (underscore) *invalid = string + size;
+            underscore = true;
+        } else {
+            underscore = false;
+        }
+
+        size++;
+    }
+    // An empty match (size == 0) has no last character; do not read string[-1].
+    if (size > 0 && string[size - 1] == '_') *invalid = string + size - 1;
+    return size;
+}
+
 // Returns the number of characters at the start of the string that are binary
 // digits or underscores. Disallows searching past the given maximum number of
 // characters.
+//
+// If multiple underscores are found in a row or if an underscore is
+// found at the end of the number, then the invalid pointer is set to the
+// location of the last invalid underscore. Otherwise it is left unchanged.
 size_t
-yp_strspn_binary_number(const uint8_t *string, ptrdiff_t length) {
-    return yp_strspn_number_kind(string, length, YP_NUMBER_BIT_BINARY_NUMBER);
+yp_strspn_binary_number(const uint8_t *string, ptrdiff_t length, const uint8_t **invalid) {
+    return yp_strspn_number_kind_underscores(string, length, invalid, YP_NUMBER_BIT_BINARY_NUMBER);
 }

 // Returns the number of characters at the start of the string that are octal
-// digits or underscores.  Disallows searching past the given maximum number of
+// digits or underscores. Disallows searching past the given maximum number of
 // characters.
+//
+// If multiple underscores are found in a row or if an underscore is
+// found at the end of the number, then the invalid pointer is set to the
+// location of the last invalid underscore. Otherwise it is left unchanged.
 size_t
-yp_strspn_octal_number(const uint8_t *string, ptrdiff_t length) {
-    return yp_strspn_number_kind(string, length, YP_NUMBER_BIT_OCTAL_NUMBER);
+yp_strspn_octal_number(const uint8_t *string, ptrdiff_t length, const uint8_t **invalid) {
+    return yp_strspn_number_kind_underscores(string, length, invalid, YP_NUMBER_BIT_OCTAL_NUMBER);
 }

 // Returns the number of characters at the start of the string that are decimal
@ -160,9 +200,13 @@ yp_strspn_decimal_digit(const uint8_t *string, ptrdiff_t length) {
 // Returns the number of characters at the start of the string that are decimal
 // digits or underscores. Disallows searching past the given maximum number of
 // characters.
+//
+// If multiple underscores are found in a row or if an underscore is
+// found at the end of the number, then the invalid pointer is set to the
+// location of the last invalid underscore. Otherwise it is left unchanged.
 size_t
-yp_strspn_decimal_number(const uint8_t *string, ptrdiff_t length) {
-    return yp_strspn_number_kind(string, length, YP_NUMBER_BIT_DECIMAL_NUMBER);
+yp_strspn_decimal_number(const uint8_t *string, ptrdiff_t length, const uint8_t **invalid) {
+    return yp_strspn_number_kind_underscores(string, length, invalid, YP_NUMBER_BIT_DECIMAL_NUMBER);
 }

 // Returns the number of characters at the start of the string that are
@ -176,9 +220,13 @@ yp_strspn_hexadecimal_digit(const uint8_t *string, ptrdiff_t length) {
 // Returns the number of characters at the start of the string that are
 // hexadecimal digits or underscores. Disallows searching past the given maximum
 // number of characters.
+//
+// If multiple underscores are found in a row or if an underscore is
+// found at the end of the number, then the invalid pointer is set to the
+// location of the last invalid underscore. Otherwise it is left unchanged.
 size_t
-yp_strspn_hexadecimal_number(const uint8_t *string, ptrdiff_t length) {
-    return yp_strspn_number_kind(string, length, YP_NUMBER_BIT_HEXADECIMAL_NUMBER);
+yp_strspn_hexadecimal_number(const uint8_t *string, ptrdiff_t length, const uint8_t **invalid) {
+    return yp_strspn_number_kind_underscores(string, length, invalid, YP_NUMBER_BIT_HEXADECIMAL_NUMBER);
 }

 static inline bool
--- a/yarp/util/yp_char.h
+++ b/yarp/util/yp_char.h
@ -31,19 +31,31 @@ size_t yp_strspn_decimal_digit(const uint8_t *string, ptrdiff_t length);
 size_t yp_strspn_hexadecimal_digit(const uint8_t *string, ptrdiff_t length);

 // Returns the number of characters at the start of the string that are octal
-// digits or underscores.  Disallows searching past the given maximum number of
+// digits or underscores. Disallows searching past the given maximum number of
 // characters.
-size_t yp_strspn_octal_number(const uint8_t *string, ptrdiff_t length);
+//
+// If multiple underscores are found in a row or if an underscore is
+// found at the end of the number, then the invalid pointer is set to the
+// location of the last invalid underscore. Otherwise it is left unchanged.
+size_t yp_strspn_octal_number(const uint8_t *string, ptrdiff_t length, const uint8_t **invalid);

 // Returns the number of characters at the start of the string that are decimal
 // digits or underscores. Disallows searching past the given maximum number of
 // characters.
-size_t yp_strspn_decimal_number(const uint8_t *string, ptrdiff_t length);
+//
+// If multiple underscores are found in a row or if an underscore is
+// found at the end of the number, then the invalid pointer is set to the
+// location of the last invalid underscore. Otherwise it is left unchanged.
+size_t yp_strspn_decimal_number(const uint8_t *string, ptrdiff_t length, const uint8_t **invalid);

 // Returns the number of characters at the start of the string that are
 // hexadecimal digits or underscores. Disallows searching past the given maximum
 // number of characters.
-size_t yp_strspn_hexadecimal_number(const uint8_t *string, ptrdiff_t length);
+//
+// If multiple underscores are found in a row or if an underscore is
+// found at the end of the number, then the invalid pointer is set to the
+// location of the last invalid underscore. Otherwise it is left unchanged.
+size_t yp_strspn_hexadecimal_number(const uint8_t *string, ptrdiff_t length, const uint8_t **invalid);

 // Returns the number of characters at the start of the string that are regexp
 // options. Disallows searching past the given maximum number of characters.
@ -52,7 +64,11 @@ size_t yp_strspn_regexp_option(const uint8_t *string, ptrdiff_t length);
 // Returns the number of characters at the start of the string that are binary
 // digits or underscores. Disallows searching past the given maximum number of
 // characters.
-size_t yp_strspn_binary_number(const uint8_t *string, ptrdiff_t length);
+//
+// If multiple underscores are found in a row or if an underscore is
+// found at the end of the number, then the invalid pointer is set to the
+// location of the last invalid underscore. Otherwise it is left unchanged.
+size_t yp_strspn_binary_number(const uint8_t *string, ptrdiff_t length, const uint8_t **invalid);

 // Returns true if the given character is a whitespace character.
 bool yp_char_is_whitespace(const uint8_t b);
--- a/yarp/yarp.c
+++ b/yarp/yarp.c
@ -5330,6 +5330,45 @@ context_def_p(yp_parser_t *parser) {
 /* Specific token lexers                                                      */
 /******************************************************************************/

+static void
+yp_strspn_number_validate(yp_parser_t *parser, const uint8_t *invalid) {
+    if (invalid != NULL) {
+        yp_diagnostic_list_append(&parser->error_list, invalid, invalid + 1, YP_ERR_INVALID_NUMBER_UNDERSCORE);
+    }
+}
+
+static size_t
+yp_strspn_binary_number_validate(yp_parser_t *parser, const uint8_t *string) {
+    const uint8_t *invalid = NULL;
+    size_t length = yp_strspn_binary_number(string, parser->end - string, &invalid);
+    yp_strspn_number_validate(parser, invalid);
+    return length;
+}
+
+static size_t
+yp_strspn_octal_number_validate(yp_parser_t *parser, const uint8_t *string) {
+    const uint8_t *invalid = NULL;
+    size_t length = yp_strspn_octal_number(string, parser->end - string, &invalid);
+    yp_strspn_number_validate(parser, invalid);
+    return length;
+}
+
+static size_t
+yp_strspn_decimal_number_validate(yp_parser_t *parser, const uint8_t *string) {
+    const uint8_t *invalid = NULL;
+    size_t length = yp_strspn_decimal_number(string, parser->end - string, &invalid);
+    yp_strspn_number_validate(parser, invalid);
+    return length;
+}
+
+static size_t
+yp_strspn_hexadecimal_number_validate(yp_parser_t *parser, const uint8_t *string) {
+    const uint8_t *invalid = NULL;
+    size_t length = yp_strspn_hexadecimal_number(string, parser->end - string, &invalid);
+    yp_strspn_number_validate(parser, invalid);
+    return length;
+}
+
 static yp_token_type_t
 lex_optional_float_suffix(yp_parser_t *parser) {
    yp_token_type_t type = YP_TOKEN_INTEGER;
@ -5339,7 +5378,7 @@ lex_optional_float_suffix(yp_parser_t *parser) {
    if (peek(parser) == '.') {
        if (yp_char_is_decimal_digit(peek_offset(parser, 1))) {
            parser->current.end += 2;
-            parser->current.end += yp_strspn_decimal_number(parser->current.end, parser->end - parser->current.end);
+            parser->current.end += yp_strspn_decimal_number_validate(parser, parser->current.end);
            type = YP_TOKEN_FLOAT;
        } else {
            // If we had a . and then something else, then it's not a float suffix on
@ -5355,7 +5394,7 @@ lex_optional_float_suffix(yp_parser_t *parser) {

        if (yp_char_is_decimal_digit(*parser->current.end)) {
            parser->current.end++;
-            parser->current.end += yp_strspn_decimal_number(parser->current.end, parser->end - parser->current.end);
+            parser->current.end += yp_strspn_decimal_number_validate(parser, parser->current.end);
            type = YP_TOKEN_FLOAT;
        } else {
            yp_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, YP_ERR_INVALID_FLOAT_EXPONENT);
@ -5377,7 +5416,7 @@ lex_numeric_prefix(yp_parser_t *parser) {
            case 'D':
                parser->current.end++;
                if (yp_char_is_decimal_digit(peek(parser))) {
-                    parser->current.end += yp_strspn_decimal_number(parser->current.end, parser->end - parser->current.end);
+                    parser->current.end += yp_strspn_decimal_number_validate(parser, parser->current.end);
                } else {
                    yp_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, YP_ERR_INVALID_NUMBER_DECIMAL);
                }
@ -5389,7 +5428,7 @@ lex_numeric_prefix(yp_parser_t *parser) {
            case 'B':
                parser->current.end++;
                if (yp_char_is_binary_digit(peek(parser))) {
-                    parser->current.end += yp_strspn_binary_number(parser->current.end, parser->end - parser->current.end);
+                    parser->current.end += yp_strspn_binary_number_validate(parser, parser->current.end);
                } else {
                    yp_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, YP_ERR_INVALID_NUMBER_BINARY);
                }
@ -5402,7 +5441,7 @@ lex_numeric_prefix(yp_parser_t *parser) {
            case 'O':
                parser->current.end++;
                if (yp_char_is_octal_digit(peek(parser))) {
-                    parser->current.end += yp_strspn_octal_number(parser->current.end, parser->end - parser->current.end);
+                    parser->current.end += yp_strspn_octal_number_validate(parser, parser->current.end);
                } else {
                    yp_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, YP_ERR_INVALID_NUMBER_OCTAL);
                }
@ -5420,7 +5459,7 @@ lex_numeric_prefix(yp_parser_t *parser) {
            case '5':
            case '6':
            case '7':
-                parser->current.end += yp_strspn_octal_number(parser->current.end, parser->end - parser->current.end);
+                parser->current.end += yp_strspn_octal_number_validate(parser, parser->current.end);
                parser->integer_base = YP_INTEGER_BASE_FLAGS_OCTAL;
                break;

@ -5429,7 +5468,7 @@ lex_numeric_prefix(yp_parser_t *parser) {
            case 'X':
                parser->current.end++;
                if (yp_char_is_hexadecimal_digit(peek(parser))) {
-                    parser->current.end += yp_strspn_hexadecimal_number(parser->current.end, parser->end - parser->current.end);
+                    parser->current.end += yp_strspn_hexadecimal_number_validate(parser, parser->current.end);
                } else {
                    yp_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, YP_ERR_INVALID_NUMBER_HEXADECIMAL);
                }
@ -5453,18 +5492,12 @@ lex_numeric_prefix(yp_parser_t *parser) {
    } else {
        // If it didn't start with a 0, then we'll lex as far as we can into a
        // decimal number.
-        parser->current.end += yp_strspn_decimal_number(parser->current.end, parser->end - parser->current.end);
+        parser->current.end += yp_strspn_decimal_number_validate(parser, parser->current.end);

        // Afterward, we'll lex as far as we can into an optional float suffix.
        type = lex_optional_float_suffix(parser);
    }

-    // If the last character that we consumed was an underscore, then this is
-    // actually an invalid integer value, and we should return an invalid token.
-    if (peek_offset(parser, -1) == '_') {
-        yp_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, YP_ERR_NUMBER_LITERAL_UNDERSCORE);
-    }
-
    return type;
 }