From 82f18baa21d0df59c30d8a6e60bf3e0991de1114 Mon Sep 17 00:00:00 2001 From: Kevin Newton Date: Mon, 4 Dec 2023 12:51:22 -0500 Subject: [PATCH] [ruby/prism] Provide flags for changing encodings https://github.com/ruby/prism/commit/e838eaff6f --- prism/config.yml | 14 + prism/defines.h | 17 ++ prism/diagnostic.c | 1 + prism/diagnostic.h | 1 + prism/encoding.c | 35 ++- prism/encoding.h | 22 +- prism/extension.c | 10 +- prism/parser.h | 39 ++- prism/prism.c | 255 +++++++++++------- prism/prism.h | 2 +- prism/templates/src/serialize.c.erb | 6 +- prism/util/pm_strpbrk.c | 8 +- prism/util/pm_strpbrk.h | 6 +- test/prism/encoding_test.rb | 213 +++++++++------ test/prism/errors_test.rb | 2 +- test/prism/snapshots/arrays.txt | 3 + test/prism/snapshots/patterns.txt | 12 + test/prism/snapshots/seattlerb/case_in.txt | 1 + .../seattlerb/heredoc_bad_hex_escape.txt | 2 +- .../seattlerb/heredoc_bad_oct_escape.txt | 2 +- .../seattlerb/read_escape_unicode_curlies.txt | 2 +- .../seattlerb/read_escape_unicode_h4.txt | 2 +- .../snapshots/seattlerb/str_evstr_escape.txt | 2 +- .../str_lit_concat_bad_encodings.txt | 4 +- .../unparser/corpus/literal/literal.txt | 4 + .../whitequark/bug_ascii_8bit_in_literal.txt | 2 +- test/prism/snapshots/whitequark/heredoc.txt | 1 + .../snapshots/whitequark/interp_digit_var.txt | 6 + ...ser_slash_slash_n_escaping_in_literals.txt | 3 + .../snapshots/whitequark/xstring_plain.txt | 1 + test/prism/snapshots/xstring.txt | 3 + 31 files changed, 455 insertions(+), 226 deletions(-) diff --git a/prism/config.yml b/prism/config.yml index f7b6751eaa..381e5efcbc 100644 --- a/prism/config.yml +++ b/prism/config.yml @@ -346,6 +346,13 @@ flags: - name: VARIABLE_CALL comment: "a call that could have been a local variable" comment: Flags for call nodes. 
+ - name: EncodingFlags + values: + - name: FORCED_UTF8_ENCODING + comment: "internal bytes forced the encoding to UTF-8" + - name: FORCED_BINARY_ENCODING + comment: "internal bytes forced the encoding to binary" + comment: Flags for nodes that have unescaped content. - name: IntegerBaseFlags values: - name: BINARY @@ -388,6 +395,10 @@ flags: comment: Flags for regular expression and match last line nodes. - name: StringFlags values: + - name: FORCED_UTF8_ENCODING + comment: "internal bytes forced the encoding to UTF-8" + - name: FORCED_BINARY_ENCODING + comment: "internal bytes forced the encoding to binary" - name: FROZEN comment: "frozen by virtue of a `frozen_string_literal` comment" comment: Flags for string nodes. @@ -2576,6 +2587,9 @@ nodes: ^^^^^^^^^^^^^^^^^^^^ - name: XStringNode fields: + - name: flags + type: flags + kind: EncodingFlags - name: opening_loc type: location - name: content_loc diff --git a/prism/defines.h b/prism/defines.h index f89a0bed8e..c9715c4eb0 100644 --- a/prism/defines.h +++ b/prism/defines.h @@ -74,4 +74,21 @@ # define snprintf _snprintf #endif +/** + * A simple utility macro to concatenate two tokens together, necessary when one + * of the tokens is itself a macro. + */ +#define PM_CONCATENATE(left, right) left ## right + +/** + * We want to be able to use static assertions, but they weren't standardized + * until C11. As such, we polyfill it here by making a hacky typedef that will + * fail to compile due to a negative array size if the condition is false. + */ +#if defined(_Static_assert) +# define PM_STATIC_ASSERT(line, condition, message) _Static_assert(condition, message) +#else +# define PM_STATIC_ASSERT(line, condition, message) typedef char PM_CONCATENATE(static_assert_, line)[(condition) ? 
1 : -1] +#endif + #endif diff --git a/prism/diagnostic.c b/prism/diagnostic.c index f9fd95cb06..7cffce7c9f 100644 --- a/prism/diagnostic.c +++ b/prism/diagnostic.c @@ -185,6 +185,7 @@ static const char* const diagnostic_messages[PM_DIAGNOSTIC_ID_LEN] = { [PM_ERR_LIST_W_UPPER_ELEMENT] = "expected a string in a `%W` list", [PM_ERR_LIST_W_UPPER_TERM] = "expected a closing delimiter for the `%W` list", [PM_ERR_MALLOC_FAILED] = "failed to allocate memory", + [PM_ERR_MIXED_ENCODING] = "UTF-8 mixed within %s source", [PM_ERR_MODULE_IN_METHOD] = "unexpected module definition in a method definition", [PM_ERR_MODULE_NAME] = "expected a constant name after `module`", [PM_ERR_MODULE_TERM] = "expected an `end` to close the `module` statement", diff --git a/prism/diagnostic.h b/prism/diagnostic.h index fc408ccbd6..079d409147 100644 --- a/prism/diagnostic.h +++ b/prism/diagnostic.h @@ -177,6 +177,7 @@ typedef enum { PM_ERR_LIST_W_UPPER_ELEMENT, PM_ERR_LIST_W_UPPER_TERM, PM_ERR_MALLOC_FAILED, + PM_ERR_MIXED_ENCODING, PM_ERR_MODULE_IN_METHOD, PM_ERR_MODULE_NAME, PM_ERR_MODULE_TERM, diff --git a/prism/encoding.c b/prism/encoding.c index 4bf6b6a775..3493353b04 100644 --- a/prism/encoding.c +++ b/prism/encoding.c @@ -4212,9 +4212,9 @@ pm_encoding_shift_jis_char_width(const uint8_t *b, ptrdiff_t n) { } /** - * This is the definition of all of the encodings that we support. + * This is the table of all of the encodings that prism supports. 
*/ -static const pm_encoding_t pm_encodings[] = { +const pm_encoding_t pm_encodings[] = { [PM_ENCODING_UTF_8] = { .name = "UTF-8", .char_width = pm_encoding_utf_8_char_width, @@ -4223,14 +4223,6 @@ static const pm_encoding_t pm_encodings[] = { .isupper_char = pm_encoding_utf_8_isupper_char, .multibyte = true }, - [PM_ENCODING_ASCII] = { - .name = "US-ASCII", - .char_width = pm_encoding_ascii_char_width, - .alnum_char = pm_encoding_ascii_alnum_char, - .alpha_char = pm_encoding_ascii_alpha_char, - .isupper_char = pm_encoding_ascii_isupper_char, - .multibyte = false - }, [PM_ENCODING_ASCII_8BIT] = { .name = "ASCII-8BIT", .char_width = pm_encoding_single_char_width, @@ -4815,6 +4807,14 @@ static const pm_encoding_t pm_encodings[] = { .isupper_char = pm_encoding_tis_620_isupper_char, .multibyte = false }, + [PM_ENCODING_US_ASCII] = { + .name = "US-ASCII", + .char_width = pm_encoding_ascii_char_width, + .alnum_char = pm_encoding_ascii_alnum_char, + .alpha_char = pm_encoding_ascii_alpha_char, + .isupper_char = pm_encoding_ascii_isupper_char, + .multibyte = false + }, [PM_ENCODING_UTF8_MAC] = { .name = "UTF8-MAC", .char_width = pm_encoding_utf_8_char_width, @@ -4937,11 +4937,6 @@ static const pm_encoding_t pm_encodings[] = { } }; -/** - * This is the default UTF-8 encoding. We need it to quickly create parsers. - */ -const pm_encoding_t *pm_encoding_utf_8 = pm_encodings; - /** * Parse the given name of an encoding and return a pointer to the corresponding * encoding struct if one can be found, otherwise return NULL. @@ -4961,7 +4956,7 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) { } // Otherwise we'll return the default UTF-8 encoding. 
- return pm_encoding_utf_8; + return PM_ENCODING_UTF_8_ENTRY; } // Next, we're going to loop through each of the encodings that we handle @@ -4972,9 +4967,9 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) { if (width >= 3) { switch (*start) { case 'A': case 'a': - ENCODING1("ASCII", PM_ENCODING_ASCII); + ENCODING1("ASCII", PM_ENCODING_US_ASCII); ENCODING1("ASCII-8BIT", PM_ENCODING_ASCII_8BIT); - ENCODING1("ANSI_X3.4-1968", PM_ENCODING_ASCII); + ENCODING1("ANSI_X3.4-1968", PM_ENCODING_US_ASCII); break; case 'B': case 'b': ENCODING1("BINARY", PM_ENCODING_ASCII_8BIT); @@ -5109,7 +5104,7 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) { ENCODING1("TIS-620", PM_ENCODING_TIS_620); break; case 'U': case 'u': - ENCODING1("US-ASCII", PM_ENCODING_ASCII); + ENCODING1("US-ASCII", PM_ENCODING_US_ASCII); ENCODING2("UTF8-MAC", "UTF-8-HFS", PM_ENCODING_UTF8_MAC); ENCODING1("UTF8-DoCoMo", PM_ENCODING_UTF8_DOCOMO); ENCODING1("UTF8-KDDI", PM_ENCODING_UTF8_KDDI); @@ -5129,7 +5124,7 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) { ENCODING1("Windows-1258", PM_ENCODING_WINDOWS_1258); break; case '6': - ENCODING1("646", PM_ENCODING_ASCII); + ENCODING1("646", PM_ENCODING_US_ASCII); break; } } diff --git a/prism/encoding.h b/prism/encoding.h index 247db600ce..c286338160 100644 --- a/prism/encoding.h +++ b/prism/encoding.h @@ -125,7 +125,6 @@ extern const uint8_t pm_encoding_unicode_table[256]; */ typedef enum { PM_ENCODING_UTF_8 = 0, - PM_ENCODING_ASCII, PM_ENCODING_ASCII_8BIT, PM_ENCODING_BIG5, PM_ENCODING_BIG5_HKSCS, @@ -199,6 +198,7 @@ typedef enum { PM_ENCODING_STATELESS_ISO_2022_JP, PM_ENCODING_STATELESS_ISO_2022_JP_KDDI, PM_ENCODING_TIS_620, + PM_ENCODING_US_ASCII, PM_ENCODING_UTF8_MAC, PM_ENCODING_UTF8_DOCOMO, PM_ENCODING_UTF8_KDDI, @@ -213,13 +213,27 @@ typedef enum { PM_ENCODING_WINDOWS_1257, PM_ENCODING_WINDOWS_1258, PM_ENCODING_WINDOWS_31J, - PM_ENCODING_WINDOWS_874 + PM_ENCODING_WINDOWS_874, + PM_ENCODING_MAXIMUM } pm_encoding_type_t; 
/** - * This is the default UTF-8 encoding. We need it to quickly create parsers. + * This is the table of all of the encodings that prism supports. */ -extern const pm_encoding_t *pm_encoding_utf_8; +extern const pm_encoding_t pm_encodings[PM_ENCODING_MAXIMUM]; + +/** + * This is the default UTF-8 encoding. We need a reference to it to quickly + * create parsers. + */ +#define PM_ENCODING_UTF_8_ENTRY (&pm_encodings[PM_ENCODING_UTF_8]) + +/** + * This is the US-ASCII encoding. We need a reference to it to be able to + * compare against it when a string is being created because it could possibly + * need to fall back to ASCII-8BIT. + */ +#define PM_ENCODING_US_ASCII_ENTRY (&pm_encodings[PM_ENCODING_US_ASCII]) /** * Parse the given name of an encoding and return a pointer to the corresponding diff --git a/prism/extension.c b/prism/extension.c index f6f2b6b195..fb252de3fe 100644 --- a/prism/extension.c +++ b/prism/extension.c @@ -469,7 +469,7 @@ parse_lex_token(void *data, pm_parser_t *parser, pm_token_t *token) { static void parse_lex_encoding_changed_callback(pm_parser_t *parser) { parse_lex_data_t *parse_lex_data = (parse_lex_data_t *) parser->lex_callback->data; - parse_lex_data->encoding = rb_enc_find(parser->encoding.name); + parse_lex_data->encoding = rb_enc_find(parser->encoding->name); // Since the encoding changed, we need to go back and change the encoding of // the tokens that were already lexed. 
This is only going to end up being @@ -599,7 +599,7 @@ parse_input(pm_string_t *input, const pm_options_t *options) { pm_parser_init(&parser, pm_string_source(input), pm_string_length(input), options); pm_node_t *node = pm_parse(&parser); - rb_encoding *encoding = rb_enc_find(parser.encoding.name); + rb_encoding *encoding = rb_enc_find(parser.encoding->name); VALUE source = pm_source_new(&parser, encoding); VALUE result_argv[] = { @@ -693,7 +693,7 @@ parse_input_comments(pm_string_t *input, const pm_options_t *options) { pm_parser_init(&parser, pm_string_source(input), pm_string_length(input), options); pm_node_t *node = pm_parse(&parser); - rb_encoding *encoding = rb_enc_find(parser.encoding.name); + rb_encoding *encoding = rb_enc_find(parser.encoding->name); VALUE source = pm_source_new(&parser, encoding); VALUE comments = parser_comments(&parser, source); @@ -872,7 +872,7 @@ static VALUE named_captures(VALUE self, VALUE source) { pm_string_list_t string_list = { 0 }; - if (!pm_regexp_named_capture_group_names((const uint8_t *) RSTRING_PTR(source), RSTRING_LEN(source), &string_list, false, pm_encoding_utf_8)) { + if (!pm_regexp_named_capture_group_names((const uint8_t *) RSTRING_PTR(source), RSTRING_LEN(source), &string_list, false, PM_ENCODING_UTF_8_ENTRY)) { pm_string_list_free(&string_list); return Qnil; } @@ -962,7 +962,7 @@ inspect_node(VALUE self, VALUE source) { pm_prettyprint(&buffer, &parser, node); - rb_encoding *encoding = rb_enc_find(parser.encoding.name); + rb_encoding *encoding = rb_enc_find(parser.encoding->name); VALUE string = rb_enc_str_new(pm_buffer_value(&buffer), pm_buffer_length(&buffer), encoding); pm_buffer_free(&buffer); diff --git a/prism/parser.h b/prism/parser.h index 98d8c0159b..2c58131b19 100644 --- a/prism/parser.h +++ b/prism/parser.h @@ -523,12 +523,6 @@ struct pm_parser { size_t index; } lex_modes; - /** - * The common_whitespace value from the most-recently-popped heredoc mode of the lexer, so we - * can dedent the heredoc after 
popping the lex mode. - */ - size_t current_string_common_whitespace; - /** The pointer to the start of the source. */ const uint8_t *start; @@ -581,7 +575,7 @@ struct pm_parser { * The encoding functions for the current file is attached to the parser as * it's parsing so that it can change with a magic comment. */ - pm_encoding_t encoding; + const pm_encoding_t *encoding; /** * When the encoding that is being used to parse the source is changed by @@ -637,6 +631,37 @@ struct pm_parser { */ int32_t start_line; + /** + * When a string-like expression is being lexed, any byte or escape sequence + * that resolves to a value whose top bit is set (i.e., >= 0x80) will + * explicitly set the encoding to the same encoding as the source. + * Alternatively, if a unicode escape sequence is used (e.g., \\u{80}) that + * resolves to a value whose top bit is set, then the encoding will be + * explicitly set to UTF-8. + * + * The _next_ time this happens, if the encoding that is about to become the + * explicitly set encoding does not match the previously set explicit + * encoding, a mixed encoding error will be emitted. + * + * When the expression is finished being lexed, the explicit encoding + * controls the encoding of the expression. For the most part this means + * that the expression will either be encoded in the source encoding or + * UTF-8. This holds for all encodings except US-ASCII. If the source is + * US-ASCII and an explicit encoding was set that was _not_ UTF-8, then the + * expression will be encoded as ASCII-8BIT. + * + * Note that if the expression is a list, different elements within the same + * list can have different encodings, so this will get reset between each + * element. Furthermore all of this only applies to lists that support + * interpolation, because otherwise escapes that could change the encoding + * are ignored. 
+ * + * At first glance, it may make more sense for this to live on the lexer + * mode, but we need it here to communicate back to the parser for character + * literals that do not push a new lexer mode. + */ + const pm_encoding_t *explicit_encoding; + /** Whether or not we're at the beginning of a command. */ bool command_start; diff --git a/prism/prism.c b/prism/prism.c index fee14e395f..3ad21f3334 100644 --- a/prism/prism.c +++ b/prism/prism.c @@ -275,6 +275,7 @@ lex_mode_push_list(pm_parser_t *parser, bool interpolation, uint8_t delimiter) { breakpoints[index++] = incrementor; } + parser->explicit_encoding = NULL; return lex_mode_push(parser, lex_mode); } @@ -356,6 +357,7 @@ lex_mode_push_string(pm_parser_t *parser, bool interpolation, bool label_allowed breakpoints[index++] = incrementor; } + parser->explicit_encoding = NULL; return lex_mode_push(parser, lex_mode); } @@ -539,7 +541,7 @@ pm_parser_err_token(pm_parser_t *parser, const pm_token_t *token, pm_diagnostic_ * Append an error to the list of errors on the parser using the location of the * given token and a format string. */ -#define PM_PARSER_ERR_TOKEN_FORMAT(parser, token, diag_id, ...) pm_diagnostic_list_append_format(&parser->error_list, token->start, token->end, diag_id, __VA_ARGS__) +#define PM_PARSER_ERR_TOKEN_FORMAT(parser, token, diag_id, ...) pm_diagnostic_list_append_format(&parser->error_list, (token).start, (token).end, diag_id, __VA_ARGS__) /** * Append a warning to the list of warnings on the parser. 
@@ -5714,6 +5716,7 @@ pm_xstring_node_create_unescaped(pm_parser_t *parser, const pm_token_t *opening, *node = (pm_x_string_node_t) { { .type = PM_X_STRING_NODE, + .flags = PM_STRING_FLAGS_FROZEN, .location = { .start = opening->start, .end = closing->end @@ -5922,12 +5925,12 @@ static inline size_t char_is_identifier_start(pm_parser_t *parser, const uint8_t *b) { if (parser->encoding_changed) { size_t width; - if ((width = parser->encoding.alpha_char(b, parser->end - b)) != 0) { + if ((width = parser->encoding->alpha_char(b, parser->end - b)) != 0) { return width; } else if (*b == '_') { return 1; } else if (*b >= 0x80) { - return parser->encoding.char_width(b, parser->end - b); + return parser->encoding->char_width(b, parser->end - b); } else { return 0; } @@ -5960,12 +5963,12 @@ static inline size_t char_is_identifier(pm_parser_t *parser, const uint8_t *b) { if (parser->encoding_changed) { size_t width; - if ((width = parser->encoding.alnum_char(b, parser->end - b)) != 0) { + if ((width = parser->encoding->alnum_char(b, parser->end - b)) != 0) { return width; } else if (*b == '_') { return 1; } else if (*b >= 0x80) { - return parser->encoding.char_width(b, parser->end - b); + return parser->encoding->char_width(b, parser->end - b); } else { return 0; } @@ -6148,8 +6151,8 @@ parser_lex_magic_comment_encoding_value(pm_parser_t *parser, const uint8_t *star const pm_encoding_t *encoding = pm_encoding_find(start, end); if (encoding != NULL) { - if (encoding != pm_encoding_utf_8) { - parser->encoding = *encoding; + if (encoding != PM_ENCODING_UTF_8_ENTRY) { + parser->encoding = encoding; parser->encoding_changed = true; if (parser->encoding_changed_callback != NULL) parser->encoding_changed_callback(parser); } @@ -6205,7 +6208,7 @@ parser_lex_magic_comment_encoding(pm_parser_t *parser) { } const uint8_t *value_start = cursor; - while ((*cursor == '-' || *cursor == '_' || parser->encoding.alnum_char(cursor, 1)) && ++cursor < end); + while ((*cursor == '-' || *cursor == 
'_' || parser->encoding->alnum_char(cursor, 1)) && ++cursor < end); if (!parser_lex_magic_comment_encoding_value(parser, value_start, cursor)) { // If we were unable to parse the encoding value, then we've got an @@ -6239,7 +6242,7 @@ pm_char_is_magic_comment_key_delimiter(const uint8_t b) { */ static inline const uint8_t * parser_lex_magic_comment_emacs_marker(pm_parser_t *parser, const uint8_t *cursor, const uint8_t *end) { - while ((cursor + 3 <= end) && (cursor = pm_memchr(cursor, '-', (size_t) (end - cursor), parser->encoding_changed, &parser->encoding)) != NULL) { + while ((cursor + 3 <= end) && (cursor = pm_memchr(cursor, '-', (size_t) (end - cursor), parser->encoding_changed, parser->encoding)) != NULL) { if (cursor + 3 <= end && cursor[1] == '*' && cursor[2] == '-') { return cursor; } @@ -6329,7 +6332,7 @@ parser_lex_magic_comment(pm_parser_t *parser, bool semantic_token_seen) { // underscores. We only need to do this if there _is_ a dash in the key. pm_string_t key; const size_t key_length = (size_t) (key_end - key_start); - const uint8_t *dash = pm_memchr(key_start, '-', (size_t) key_length, parser->encoding_changed, &parser->encoding); + const uint8_t *dash = pm_memchr(key_start, '-', (size_t) key_length, parser->encoding_changed, parser->encoding); if (dash == NULL) { pm_string_shared_init(&key, key_start, key_end); @@ -6341,7 +6344,7 @@ parser_lex_magic_comment(pm_parser_t *parser, bool semantic_token_seen) { memcpy(buffer, key_start, width); buffer[dash - key_start] = '_'; - while ((dash = pm_memchr(dash + 1, '-', (size_t) (key_end - dash - 1), parser->encoding_changed, &parser->encoding)) != NULL) { + while ((dash = pm_memchr(dash + 1, '-', (size_t) (key_end - dash - 1), parser->encoding_changed, parser->encoding)) != NULL) { buffer[dash - key_start] = '_'; } @@ -7000,7 +7003,7 @@ lex_identifier(pm_parser_t *parser, bool previous_command_start) { } if (encoding_changed) { - return parser->encoding.isupper_char(current_start, end - current_start) ? 
PM_TOKEN_CONSTANT : PM_TOKEN_IDENTIFIER; + return parser->encoding->isupper_char(current_start, end - current_start) ? PM_TOKEN_CONSTANT : PM_TOKEN_IDENTIFIER; } return pm_encoding_utf_8_isupper_char(current_start, end - current_start) ? PM_TOKEN_CONSTANT : PM_TOKEN_IDENTIFIER; } @@ -7214,7 +7217,18 @@ escape_byte(uint8_t value, const uint8_t flags) { * Write a unicode codepoint to the given buffer. */ static inline void -escape_write_unicode(pm_parser_t *parser, pm_buffer_t *buffer, const uint8_t *start, const uint8_t *end, uint32_t value) { +escape_write_unicode(pm_parser_t *parser, pm_buffer_t *buffer, const uint8_t flags, const uint8_t *start, const uint8_t *end, uint32_t value) { + // \u escape sequences in string-like structures implicitly change the + // encoding to UTF-8 if they are >= 0x80 or if they are used in a character + // literal. + if (value >= 0x80 || flags & PM_ESCAPE_FLAG_SINGLE) { + if (parser->explicit_encoding != NULL && parser->explicit_encoding != PM_ENCODING_UTF_8_ENTRY) { + PM_PARSER_ERR_FORMAT(parser, start, end, PM_ERR_MIXED_ENCODING, parser->explicit_encoding->name); + } + + parser->explicit_encoding = PM_ENCODING_UTF_8_ENTRY; + } + if (value <= 0x7F) { // 0xxxxxxx pm_buffer_append_byte(buffer, (uint8_t) value); } else if (value <= 0x7FF) { // 110xxxxx 10xxxxxx @@ -7237,6 +7251,23 @@ escape_write_unicode(pm_parser_t *parser, pm_buffer_t *buffer, const uint8_t *st } } +/** + * When you're writing a byte to the unescape buffer, if the byte is non-ASCII + * (i.e., the top bit is set) then it locks in the encoding. 
+ */ +static inline void +escape_write_byte_encoded(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t byte) { + if (byte >= 0x80) { + if (parser->explicit_encoding != NULL && parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY && parser->encoding != PM_ENCODING_UTF_8_ENTRY) { + PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_MIXED_ENCODING, parser->encoding->name); + } + + parser->explicit_encoding = parser->encoding; + } + + pm_buffer_append_byte(buffer, byte); +} + /** * The regular expression engine doesn't support the same escape sequences as * Ruby does. So first we have to read the escape sequence, and then we have to @@ -7253,7 +7284,7 @@ escape_write_unicode(pm_parser_t *parser, pm_buffer_t *buffer, const uint8_t *st * source so that the regular expression engine will perform its own unescaping. */ static inline void -escape_write_byte(pm_buffer_t *buffer, uint8_t flags, uint8_t byte) { +escape_write_byte(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags, uint8_t byte) { if (flags & PM_ESCAPE_FLAG_REGEXP) { pm_buffer_append_bytes(buffer, (const uint8_t *) "\\x", 2); @@ -7272,7 +7303,7 @@ escape_write_byte(pm_buffer_t *buffer, uint8_t flags, uint8_t byte) { pm_buffer_append_byte(buffer, (uint8_t) (byte2 + '0')); } } else { - pm_buffer_append_byte(buffer, byte); + escape_write_byte_encoded(parser, buffer, byte); } } @@ -7351,7 +7382,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) { } } - pm_buffer_append_byte(buffer, value); + escape_write_byte_encoded(parser, buffer, value); return; } case 'x': { @@ -7373,7 +7404,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) { if (flags & PM_ESCAPE_FLAG_REGEXP) { pm_buffer_append_bytes(buffer, start, (size_t) (parser->current.end - start)); } else { - pm_buffer_append_byte(buffer, value); + escape_write_byte_encoded(parser, buffer, value); } } else { pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_HEXADECIMAL); @@ -7397,7 +7428,7 @@ 
escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) { if (flags & PM_ESCAPE_FLAG_REGEXP) { pm_buffer_append_bytes(buffer, start, (size_t) (parser->current.end + 4 - start)); } else { - escape_write_unicode(parser, buffer, start, parser->current.end + 4, value); + escape_write_unicode(parser, buffer, flags, start, parser->current.end + 4, value); } parser->current.end += 4; @@ -7431,13 +7462,14 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) { if (!(flags & PM_ESCAPE_FLAG_REGEXP)) { uint32_t value = escape_unicode(unicode_start, hexadecimal_length); - escape_write_unicode(parser, buffer, unicode_start, parser->current.end, value); + escape_write_unicode(parser, buffer, flags, unicode_start, parser->current.end, value); } parser->current.end += pm_strspn_whitespace(parser->current.end, parser->end - parser->current.end); } - // ?\u{nnnn} character literal should contain only one codepoint and cannot be like ?\u{nnnn mmmm} + // ?\u{nnnn} character literal should contain only one codepoint + // and cannot be like ?\u{nnnn mmmm}. 
if (flags & PM_ESCAPE_FLAG_SINGLE && codepoints_count > 1) { pm_parser_err(parser, extra_codepoints_start, parser->current.end - 1, PM_ERR_ESCAPE_INVALID_UNICODE_LITERAL); } @@ -7468,7 +7500,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) { switch (peeked) { case '?': { parser->current.end++; - escape_write_byte(buffer, flags, escape_byte(0x7f, flags)); + escape_write_byte(parser, buffer, flags, escape_byte(0x7f, flags)); return; } case '\\': @@ -7486,7 +7518,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) { } parser->current.end++; - escape_write_byte(buffer, flags, escape_byte(peeked, flags | PM_ESCAPE_FLAG_CONTROL)); + escape_write_byte(parser, buffer, flags, escape_byte(peeked, flags | PM_ESCAPE_FLAG_CONTROL)); return; } } @@ -7508,7 +7540,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) { switch (peeked) { case '?': { parser->current.end++; - escape_write_byte(buffer, flags, escape_byte(0x7f, flags)); + escape_write_byte(parser, buffer, flags, escape_byte(0x7f, flags)); return; } case '\\': @@ -7526,7 +7558,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) { } parser->current.end++; - escape_write_byte(buffer, flags, escape_byte(peeked, flags | PM_ESCAPE_FLAG_CONTROL)); + escape_write_byte(parser, buffer, flags, escape_byte(peeked, flags | PM_ESCAPE_FLAG_CONTROL)); return; } } @@ -7561,7 +7593,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) { } parser->current.end++; - escape_write_byte(buffer, flags, escape_byte(peeked, flags | PM_ESCAPE_FLAG_META)); + escape_write_byte(parser, buffer, flags, escape_byte(peeked, flags | PM_ESCAPE_FLAG_META)); return; } case '\r': { @@ -7574,7 +7606,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) { /* fallthrough */ default: { if (parser->current.end < parser->end) { - pm_buffer_append_byte(buffer, *parser->current.end++); + escape_write_byte_encoded(parser, buffer, 
*parser->current.end++); } return; } @@ -7637,13 +7669,12 @@ lex_question_mark(pm_parser_t *parser) { return PM_TOKEN_CHARACTER_LITERAL; } else { - size_t encoding_width = parser->encoding.char_width(parser->current.end, parser->end - parser->current.end); + size_t encoding_width = parser->encoding->char_width(parser->current.end, parser->end - parser->current.end); - // Ternary operators can have a ? immediately followed by an identifier which starts with - // an underscore. We check for this case + // Ternary operators can have a ? immediately followed by an identifier + // which starts with an underscore. We check for this case here. if ( - !(parser->encoding.alnum_char(parser->current.end, parser->end - parser->current.end) || - peek(parser) == '_') || + !(parser->encoding->alnum_char(parser->current.end, parser->end - parser->current.end) || peek(parser) == '_') || ( (parser->current.end + encoding_width >= parser->end) || !char_is_identifier(parser, parser->current.end + encoding_width) @@ -8491,6 +8522,7 @@ parser_lex(pm_parser_t *parser) { // TODO: handle unterminated heredoc } + parser->explicit_encoding = NULL; lex_mode_push(parser, (pm_lex_mode_t) { .mode = PM_LEX_HEREDOC, .as.heredoc = { @@ -8897,7 +8929,7 @@ parser_lex(pm_parser_t *parser) { (lex_state_p(parser, PM_LEX_STATE_FITEM) && (peek(parser) == 's')) || lex_state_spcarg_p(parser, space_seen) ) { - if (!parser->encoding.alnum_char(parser->current.end, parser->end - parser->current.end)) { + if (!parser->encoding->alnum_char(parser->current.end, parser->end - parser->current.end)) { if (*parser->current.end >= 0x80) { pm_parser_err_current(parser, PM_ERR_INVALID_PERCENT); } @@ -8920,7 +8952,7 @@ parser_lex(pm_parser_t *parser) { // Delimiters for %-literals cannot be alphanumeric. We // validate that here. 
uint8_t delimiter = peek_offset(parser, 1); - if (delimiter >= 0x80 || parser->encoding.alnum_char(&delimiter, 1)) { + if (delimiter >= 0x80 || parser->encoding->alnum_char(&delimiter, 1)) { pm_parser_err_current(parser, PM_ERR_INVALID_PERCENT); goto lex_next_token; } @@ -9766,7 +9798,6 @@ parser_lex(pm_parser_t *parser) { if (current_token_starts_line(parser)) { const uint8_t *start = parser->current.start; if (start + ident_length <= parser->end) { - bool at_end = false; const uint8_t *newline = next_newline(start, parser->end - start); const uint8_t *ident_end = newline; const uint8_t *terminator_end = newline; @@ -9774,7 +9805,6 @@ parser_lex(pm_parser_t *parser) { if (newline == NULL) { terminator_end = parser->end; ident_end = parser->end; - at_end = true; } else { terminator_end++; if (newline[-1] == '\r') { @@ -9801,6 +9831,7 @@ parser_lex(pm_parser_t *parser) { if (newline != NULL) { pm_newline_list_append(&parser->newline_list, newline); } + parser->current.end = terminator_end; if (*lex_mode->as.heredoc.next_start == '\\') { parser->next_start = NULL; @@ -9809,14 +9840,11 @@ parser_lex(pm_parser_t *parser) { parser->heredoc_end = parser->current.end; } - parser->current_string_common_whitespace = parser->lex_modes.current->as.heredoc.common_whitespace; - lex_mode_pop(parser); - if (!at_end) { - lex_state_set(parser, PM_LEX_STATE_END); - } + lex_state_set(parser, PM_LEX_STATE_END); LEX(PM_TOKEN_HEREDOC_END); } } + size_t whitespace = pm_heredoc_strspn_inline_whitespace(parser, &start, lex_mode->as.heredoc.indent); if ( lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_TILDE && @@ -10588,7 +10616,7 @@ parse_target(pm_parser_t *parser, pm_node_t *target) { return target; } - if (*call->message_loc.start == '_' || parser->encoding.alnum_char(call->message_loc.start, call->message_loc.end - call->message_loc.start)) { + if (*call->message_loc.start == '_' || parser->encoding->alnum_char(call->message_loc.start, call->message_loc.end - call->message_loc.start)) 
{ parse_write_name(parser, &call->name); return (pm_node_t *) call; } @@ -10735,7 +10763,7 @@ parse_write(pm_parser_t *parser, pm_node_t *target, pm_token_t *operator, pm_nod return target; } - if (*call->message_loc.start == '_' || parser->encoding.alnum_char(call->message_loc.start, call->message_loc.end - call->message_loc.start)) { + if (*call->message_loc.start == '_' || parser->encoding->alnum_char(call->message_loc.start, call->message_loc.end - call->message_loc.start)) { // When we get here, we have a method call, because it was // previously marked as a method call but now we have an =. This // looks like: @@ -10970,7 +10998,7 @@ parse_assocs(pm_parser_t *parser, pm_node_t *node) { if (token_begins_expression_p(parser->current.type)) { value = parse_value_expression(parser, PM_BINDING_POWER_DEFINED, false, PM_ERR_HASH_EXPRESSION_AFTER_LABEL); } else { - if (parser->encoding.isupper_char(label.start, (label.end - 1) - label.start)) { + if (parser->encoding->isupper_char(label.start, (label.end - 1) - label.start)) { pm_token_t constant = { .type = PM_TOKEN_CONSTANT, .start = label.start, .end = label.end - 1 }; value = (pm_node_t *) pm_constant_read_node_create(parser, &constant); } else { @@ -12239,6 +12267,26 @@ parse_conditional(pm_parser_t *parser, pm_context_t context) { case PM_INSTANCE_VARIABLE_READ_NODE: case PM_MULTI_TARGET_NODE: case PM_BACK_REFERENCE_READ_NODE: \ case PM_NUMBERED_REFERENCE_READ_NODE +// Assert here that the flags are the same so that we can safely switch the type +// of the node without having to move the flags. +PM_STATIC_ASSERT(__LINE__, ((int) PM_STRING_FLAGS_FORCED_UTF8_ENCODING) == ((int) PM_ENCODING_FLAGS_FORCED_UTF8_ENCODING), "Expected the flags to match."); + +/** + * If the encoding was explicitly set through the lexing process, then we need + * to potentially mark the string's flags to indicate how to encode it. 
+ */ +static inline pm_node_flags_t +parse_unescaped_encoding(const pm_parser_t *parser) { + if (parser->explicit_encoding != NULL) { + if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) { + return PM_STRING_FLAGS_FORCED_UTF8_ENCODING; + } else if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY) { + return PM_STRING_FLAGS_FORCED_BINARY_ENCODING; + } + } + return 0; +} + /** * Parse a node that is part of a string. If the subsequent tokens cannot be * parsed as a string part, then NULL is returned. @@ -12255,7 +12303,9 @@ parse_string_part(pm_parser_t *parser) { case PM_TOKEN_STRING_CONTENT: { pm_token_t opening = not_provided(parser); pm_token_t closing = not_provided(parser); + pm_node_t *node = (pm_node_t *) pm_string_node_create_current_string(parser, &opening, &parser->current, &closing); + node->flags |= parse_unescaped_encoding(parser); parser_lex(parser); return node; @@ -13459,8 +13509,9 @@ parse_strings(pm_parser_t *parser, pm_node_t *current) { // Here we have found a string literal. We'll parse it and add it to // the list of strings. 
- assert(parser->lex_modes.current->mode == PM_LEX_STRING); - bool lex_interpolation = parser->lex_modes.current->as.string.interpolation; + const pm_lex_mode_t *lex_mode = parser->lex_modes.current; + assert(lex_mode->mode == PM_LEX_STRING); + bool lex_interpolation = lex_mode->as.string.interpolation; pm_token_t opening = parser->current; parser_lex(parser); @@ -13544,6 +13595,7 @@ parse_strings(pm_parser_t *parser, pm_node_t *current) { if (match1(parser, PM_TOKEN_STRING_END)) { node = (pm_node_t *) pm_string_node_create_unescaped(parser, &opening, &content, &parser->current, &unescaped); + node->flags |= parse_unescaped_encoding(parser); parser_lex(parser); } else if (accept1(parser, PM_TOKEN_LABEL_END)) { node = (pm_node_t *) pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped); @@ -13555,6 +13607,7 @@ parse_strings(pm_parser_t *parser, pm_node_t *current) { pm_token_t string_closing = not_provided(parser); pm_node_t *part = (pm_node_t *) pm_string_node_create_unescaped(parser, &string_opening, &parser->previous, &string_closing, &unescaped); + part->flags |= parse_unescaped_encoding(parser); pm_node_list_append(&parts, part); while (!match3(parser, PM_TOKEN_STRING_END, PM_TOKEN_LABEL_END, PM_TOKEN_EOF)) { @@ -13888,6 +13941,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b pm_token_t closing = not_provided(parser); pm_node_t *node = (pm_node_t *) pm_string_node_create_current_string(parser, &opening, &content, &closing); + node->flags |= parse_unescaped_encoding(parser); // Characters can be followed by strings in which case they are // automatically concatenated. @@ -14074,7 +14128,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b if (match2(parser, PM_TOKEN_HEREDOC_END, PM_TOKEN_EOF)) { // If we get here, then we have an empty heredoc. We'll create // an empty content token and return an empty string node. 
- lex_state_set(parser, PM_LEX_STATE_END); + lex_mode_pop(parser); expect1(parser, PM_TOKEN_HEREDOC_END, PM_ERR_HEREDOC_TERM); pm_token_t content = parse_strings_empty_content(parser->previous.start); @@ -14095,6 +14149,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b // content and we're at the end of the heredoc, so we can return // just a string node with the heredoc opening and closing as // its opening and closing. + part->flags |= parse_unescaped_encoding(parser); pm_string_node_t *cast = (pm_string_node_t *) part; cast->opening_loc = PM_LOCATION_TOKEN_VALUE(&opening); @@ -14106,13 +14161,13 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b cast->base.type = PM_X_STRING_NODE; } - size_t common_whitespace = parser->current_string_common_whitespace; + size_t common_whitespace = lex_mode->as.heredoc.common_whitespace; if (indent == PM_HEREDOC_INDENT_TILDE && (common_whitespace != (size_t) -1) && (common_whitespace != 0)) { parse_heredoc_dedent_string(&cast->unescaped, common_whitespace); } node = (pm_node_t *) cast; - lex_state_set(parser, PM_LEX_STATE_END); + lex_mode_pop(parser); expect1(parser, PM_TOKEN_HEREDOC_END, PM_ERR_HEREDOC_TERM); } else { // If we get here, then we have multiple parts in the heredoc, @@ -14127,13 +14182,15 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b } } + size_t common_whitespace = lex_mode->as.heredoc.common_whitespace; + // Now that we have all of the parts, create the correct type of // interpolated node. 
if (quote == PM_HEREDOC_QUOTE_BACKTICK) { pm_interpolated_x_string_node_t *cast = pm_interpolated_xstring_node_create(parser, &opening, &opening); cast->parts = parts; - lex_state_set(parser, PM_LEX_STATE_END); + lex_mode_pop(parser); expect1(parser, PM_TOKEN_HEREDOC_END, PM_ERR_HEREDOC_TERM); pm_interpolated_xstring_node_closing_set(cast, &parser->previous); @@ -14142,7 +14199,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b } else { pm_interpolated_string_node_t *cast = pm_interpolated_string_node_create(parser, &opening, &parts, &opening); - lex_state_set(parser, PM_LEX_STATE_END); + lex_mode_pop(parser); expect1(parser, PM_TOKEN_HEREDOC_END, PM_ERR_HEREDOC_TERM); pm_interpolated_string_node_closing_set(cast, &parser->previous); @@ -14152,7 +14209,6 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b // If this is a heredoc that is indented with a ~, then we need // to dedent each line by the common leading whitespace. - size_t common_whitespace = parser->current_string_common_whitespace; if (indent == PM_HEREDOC_INDENT_TILDE && (common_whitespace != (size_t) -1) && (common_whitespace != 0)) { pm_node_list_t *nodes; if (quote == PM_HEREDOC_QUOTE_BACKTICK) { @@ -15409,8 +15465,8 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b } else { expect1(parser, PM_TOKEN_STRING_END, PM_ERR_LIST_W_LOWER_TERM); } - pm_array_node_close_set(array, &closing); + pm_array_node_close_set(array, &closing); return (pm_node_t *) array; } case PM_TOKEN_PERCENT_UPPER_W: { @@ -15418,19 +15474,24 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b pm_token_t opening = parser->previous; pm_array_node_t *array = pm_array_node_create(parser, &opening); - // This is the current node that we are parsing that will be added to the - // list of elements. + // This is the current node that we are parsing that will be added + // to the list of elements. 
pm_node_t *current = NULL; while (!match2(parser, PM_TOKEN_STRING_END, PM_TOKEN_EOF)) { switch (parser->current.type) { case PM_TOKEN_WORDS_SEP: { + // Reset the explicit encoding if we hit a separator + // since each element can have its own encoding. + parser->explicit_encoding = NULL; + if (current == NULL) { - // If we hit a separator before we have any content, then we don't - // need to do anything. + // If we hit a separator before we have any content, + // then we don't need to do anything. } else { - // If we hit a separator after we've hit content, then we need to - // append that content to the list and reset the current node. + // If we hit a separator after we've hit content, + // then we need to append that content to the list + // and reset the current node. pm_array_node_elements_append(array, current); current = NULL; } @@ -15443,22 +15504,25 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b pm_token_t closing = not_provided(parser); pm_node_t *string = (pm_node_t *) pm_string_node_create_current_string(parser, &opening, &parser->current, &closing); + string->flags |= parse_unescaped_encoding(parser); parser_lex(parser); if (current == NULL) { - // If we hit content and the current node is NULL, then this is - // the first string content we've seen. In that case we're going - // to create a new string node and set that to the current. + // If we hit content and the current node is NULL, + // then this is the first string content we've seen. + // In that case we're going to create a new string + // node and set that to the current. current = string; } else if (PM_NODE_TYPE_P(current, PM_INTERPOLATED_STRING_NODE)) { - // If we hit string content and the current node is an - // interpolated string, then we need to append the string content - // to the list of child nodes. 
+ // If we hit string content and the current node is + // an interpolated string, then we need to append + // the string content to the list of child nodes. pm_interpolated_string_node_append((pm_interpolated_string_node_t *) current, string); } else if (PM_NODE_TYPE_P(current, PM_STRING_NODE)) { - // If we hit string content and the current node is a string node, - // then we need to convert the current node into an interpolated - // string and add the string content to the list of child nodes. + // If we hit string content and the current node is + // a string node, then we need to convert the + // current node into an interpolated string and add + // the string content to the list of child nodes. pm_interpolated_string_node_t *interpolated = pm_interpolated_string_node_create(parser, &opening, NULL, &closing); pm_interpolated_string_node_append(interpolated, current); pm_interpolated_string_node_append(interpolated, string); @@ -15471,24 +15535,27 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b } case PM_TOKEN_EMBVAR: { if (current == NULL) { - // If we hit an embedded variable and the current node is NULL, - // then this is the start of a new string. We'll set the current - // node to a new interpolated string. + // If we hit an embedded variable and the current + // node is NULL, then this is the start of a new + // string. We'll set the current node to a new + // interpolated string. pm_token_t opening = not_provided(parser); pm_token_t closing = not_provided(parser); current = (pm_node_t *) pm_interpolated_string_node_create(parser, &opening, NULL, &closing); } else if (PM_NODE_TYPE_P(current, PM_STRING_NODE)) { - // If we hit an embedded variable and the current node is a string - // node, then we'll convert the current into an interpolated - // string and add the string node to the list of parts. 
+ // If we hit an embedded variable and the current + // node is a string node, then we'll convert the + // current into an interpolated string and add the + // string node to the list of parts. pm_token_t opening = not_provided(parser); pm_token_t closing = not_provided(parser); pm_interpolated_string_node_t *interpolated = pm_interpolated_string_node_create(parser, &opening, NULL, &closing); pm_interpolated_string_node_append(interpolated, current); current = (pm_node_t *) interpolated; } else { - // If we hit an embedded variable and the current node is an - // interpolated string, then we'll just add the embedded variable. + // If we hit an embedded variable and the current + // node is an interpolated string, then we'll just + // add the embedded variable. } pm_node_t *part = parse_string_part(parser); @@ -15497,25 +15564,27 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b } case PM_TOKEN_EMBEXPR_BEGIN: { if (current == NULL) { - // If we hit an embedded expression and the current node is NULL, - // then this is the start of a new string. We'll set the current - // node to a new interpolated string. + // If we hit an embedded expression and the current + // node is NULL, then this is the start of a new + // string. We'll set the current node to a new + // interpolated string. pm_token_t opening = not_provided(parser); pm_token_t closing = not_provided(parser); current = (pm_node_t *) pm_interpolated_string_node_create(parser, &opening, NULL, &closing); } else if (PM_NODE_TYPE_P(current, PM_STRING_NODE)) { - // If we hit an embedded expression and the current node is a - // string node, then we'll convert the current into an - // interpolated string and add the string node to the list of - // parts. + // If we hit an embedded expression and the current + // node is a string node, then we'll convert the + // current into an interpolated string and add the + // string node to the list of parts. 
pm_token_t opening = not_provided(parser); pm_token_t closing = not_provided(parser); pm_interpolated_string_node_t *interpolated = pm_interpolated_string_node_create(parser, &opening, NULL, &closing); pm_interpolated_string_node_append(interpolated, current); current = (pm_node_t *) interpolated; } else if (PM_NODE_TYPE_P(current, PM_INTERPOLATED_STRING_NODE)) { - // If we hit an embedded expression and the current node is an - // interpolated string, then we'll just continue on. + // If we hit an embedded expression and the current + // node is an interpolated string, then we'll just + // continue on. } else { assert(false && "unreachable"); } @@ -15543,8 +15612,8 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b } else { expect1(parser, PM_TOKEN_STRING_END, PM_ERR_LIST_W_UPPER_TERM); } - pm_array_node_close_set(array, &closing); + pm_array_node_close_set(array, &closing); return (pm_node_t *) array; } case PM_TOKEN_REGEXP_BEGIN: { @@ -15652,8 +15721,11 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b pm_token_t content = parser->current; parser_lex(parser); - if (accept1(parser, PM_TOKEN_STRING_END)) { - return (pm_node_t *) pm_xstring_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped); + if (match1(parser, PM_TOKEN_STRING_END)) { + pm_node_t *node = (pm_node_t *) pm_xstring_node_create_unescaped(parser, &opening, &content, &parser->current, &unescaped); + node->flags |= parse_unescaped_encoding(parser); + parser_lex(parser); + return node; } // If we get here, then we have interpolation so we'll need to @@ -15662,7 +15734,9 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b pm_token_t opening = not_provided(parser); pm_token_t closing = not_provided(parser); + pm_node_t *part = (pm_node_t *) pm_string_node_create_unescaped(parser, &opening, &parser->previous, &closing, &unescaped); + part->flags |= parse_unescaped_encoding(parser); 
pm_interpolated_xstring_node_append(node, part); } else { @@ -15986,7 +16060,7 @@ parse_regular_expression_named_captures(pm_parser_t *parser, const pm_string_t * pm_string_list_t named_captures = { 0 }; pm_node_t *result; - if (pm_regexp_named_capture_group_names(pm_string_source(content), pm_string_length(content), &named_captures, parser->encoding_changed, &parser->encoding) && (named_captures.length > 0)) { + if (pm_regexp_named_capture_group_names(pm_string_source(content), pm_string_length(content), &named_captures, parser->encoding_changed, parser->encoding) && (named_captures.length > 0)) { // Since we should not create a MatchWriteNode when all capture names // are invalid, creating a MatchWriteNode is delayed here. pm_match_write_node_t *match = NULL; @@ -17004,7 +17078,7 @@ pm_parser_init(pm_parser_t *parser, const uint8_t *source, size_t size, const pm .error_list = { 0 }, .current_scope = NULL, .current_context = NULL, - .encoding = *pm_encoding_utf_8, + .encoding = PM_ENCODING_UTF_8_ENTRY, .encoding_changed_callback = NULL, .encoding_comment_start = source, .lex_callback = NULL, @@ -17014,6 +17088,7 @@ pm_parser_init(pm_parser_t *parser, const uint8_t *source, size_t size, const pm .integer_base = 0, .current_string = PM_STRING_EMPTY, .start_line = 1, + .explicit_encoding = NULL, .command_start = true, .recovering = false, .encoding_changed = false, @@ -17240,7 +17315,7 @@ pm_serialize_parse_comments(pm_buffer_t *buffer, const uint8_t *source, size_t s pm_node_t *node = pm_parse(&parser); pm_serialize_header(buffer); - pm_serialize_encoding(&parser.encoding, buffer); + pm_serialize_encoding(parser.encoding, buffer); pm_buffer_append_varsint(buffer, parser.start_line); pm_serialize_comment_list(&parser, &parser.comment_list, buffer); diff --git a/prism/prism.h b/prism/prism.h index 590cd74016..40ba5e554d 100644 --- a/prism/prism.h +++ b/prism/prism.h @@ -91,7 +91,7 @@ void pm_serialize_comment_list(pm_parser_t *parser, pm_list_t *list, pm_buffer_t * 
@param encoding The encoding to serialize. * @param buffer The buffer to serialize to. */ -void pm_serialize_encoding(pm_encoding_t *encoding, pm_buffer_t *buffer); +void pm_serialize_encoding(const pm_encoding_t *encoding, pm_buffer_t *buffer); /** * Serialize the encoding, metadata, nodes, and constant pool. diff --git a/prism/templates/src/serialize.c.erb b/prism/templates/src/serialize.c.erb index 92fe9188f9..e9cdd1e82c 100644 --- a/prism/templates/src/serialize.c.erb +++ b/prism/templates/src/serialize.c.erb @@ -206,7 +206,7 @@ pm_serialize_diagnostic_list(pm_parser_t *parser, pm_list_t *list, pm_buffer_t * * Serialize the name of the encoding to the buffer. */ void -pm_serialize_encoding(pm_encoding_t *encoding, pm_buffer_t *buffer) { +pm_serialize_encoding(const pm_encoding_t *encoding, pm_buffer_t *buffer) { size_t encoding_length = strlen(encoding->name); pm_buffer_append_varuint(buffer, pm_sizet_to_u32(encoding_length)); pm_buffer_append_string(buffer, encoding->name, encoding_length); @@ -218,7 +218,7 @@ pm_serialize_encoding(pm_encoding_t *encoding, pm_buffer_t *buffer) { */ void pm_serialize_content(pm_parser_t *parser, pm_node_t *node, pm_buffer_t *buffer) { - pm_serialize_encoding(&parser->encoding, buffer); + pm_serialize_encoding(parser->encoding, buffer); pm_buffer_append_varsint(buffer, parser->start_line); <%- unless Prism::SERIALIZE_ONLY_SEMANTICS_FIELDS -%> pm_serialize_comment_list(parser, &parser->comment_list, buffer); @@ -317,7 +317,7 @@ pm_serialize_lex(pm_buffer_t *buffer, const uint8_t *source, size_t size, const // Append 0 to mark end of tokens. 
pm_buffer_append_byte(buffer, 0); - pm_serialize_encoding(&parser.encoding, buffer); + pm_serialize_encoding(parser.encoding, buffer); pm_buffer_append_varsint(buffer, parser.start_line); pm_serialize_comment_list(&parser, &parser.comment_list, buffer); pm_serialize_magic_comment_list(&parser, &parser.magic_comment_list, buffer); diff --git a/prism/util/pm_strpbrk.c b/prism/util/pm_strpbrk.c index ce1f36910b..115eba1fd2 100644 --- a/prism/util/pm_strpbrk.c +++ b/prism/util/pm_strpbrk.c @@ -4,7 +4,7 @@ * This is the slow path that does care about the encoding. */ static inline const uint8_t * -pm_strpbrk_multi_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum) { +pm_strpbrk_multi_byte(const pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum) { size_t index = 0; while (index < maximum) { @@ -12,7 +12,7 @@ pm_strpbrk_multi_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t return source + index; } - size_t width = parser->encoding.char_width(source + index, (ptrdiff_t) (maximum - index)); + size_t width = parser->encoding->char_width(source + index, (ptrdiff_t) (maximum - index)); if (width == 0) { return NULL; } @@ -61,10 +61,10 @@ pm_strpbrk_single_byte(const uint8_t *source, const uint8_t *charset, size_t max * need to take a slower path and iterate one multi-byte character at a time. 
*/ const uint8_t * -pm_strpbrk(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, ptrdiff_t length) { +pm_strpbrk(const pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, ptrdiff_t length) { if (length <= 0) { return NULL; - } else if (parser->encoding_changed && parser->encoding.multibyte) { + } else if (parser->encoding_changed && parser->encoding->multibyte) { return pm_strpbrk_multi_byte(parser, source, charset, (size_t) length); } else { return pm_strpbrk_single_byte(source, charset, (size_t) length); diff --git a/prism/util/pm_strpbrk.h b/prism/util/pm_strpbrk.h index 61a443e51a..c1cf0d54db 100644 --- a/prism/util/pm_strpbrk.h +++ b/prism/util/pm_strpbrk.h @@ -32,12 +32,12 @@ * need to take a slower path and iterate one multi-byte character at a time. * * @param parser The parser. - * @param source The source string. + * @param source The source to search. * @param charset The charset to search for. - * @param length The maximum length to search. + * @param length The maximum number of bytes to search. * @return A pointer to the first character in the source string that is in the * charset, or NULL if no such character exists. 
*/ -const uint8_t * pm_strpbrk(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, ptrdiff_t length); +const uint8_t * pm_strpbrk(const pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, ptrdiff_t length); #endif diff --git a/test/prism/encoding_test.rb b/test/prism/encoding_test.rb index 94ba3a6c2a..e4678c6f82 100644 --- a/test/prism/encoding_test.rb +++ b/test/prism/encoding_test.rb @@ -7,90 +7,16 @@ require_relative "test_helper" module Prism class EncodingTest < TestCase codepoints_1byte = 0...0x100 - codepoints_2bytes = 0...0x10000 - encodings = { - Encoding::ASCII => codepoints_1byte, - Encoding::ASCII_8BIT => codepoints_1byte, - Encoding::CP850 => codepoints_1byte, - Encoding::CP852 => codepoints_1byte, - Encoding::CP855 => codepoints_1byte, - Encoding::GB1988 => codepoints_1byte, - Encoding::IBM437 => codepoints_1byte, - Encoding::IBM720 => codepoints_1byte, - Encoding::IBM737 => codepoints_1byte, - Encoding::IBM775 => codepoints_1byte, - Encoding::IBM852 => codepoints_1byte, - Encoding::IBM855 => codepoints_1byte, - Encoding::IBM857 => codepoints_1byte, - Encoding::IBM860 => codepoints_1byte, - Encoding::IBM861 => codepoints_1byte, - Encoding::IBM862 => codepoints_1byte, - Encoding::IBM863 => codepoints_1byte, - Encoding::IBM864 => codepoints_1byte, - Encoding::IBM865 => codepoints_1byte, - Encoding::IBM866 => codepoints_1byte, - Encoding::IBM869 => codepoints_1byte, - Encoding::ISO_8859_1 => codepoints_1byte, - Encoding::ISO_8859_2 => codepoints_1byte, - Encoding::ISO_8859_3 => codepoints_1byte, - Encoding::ISO_8859_4 => codepoints_1byte, - Encoding::ISO_8859_5 => codepoints_1byte, - Encoding::ISO_8859_6 => codepoints_1byte, - Encoding::ISO_8859_7 => codepoints_1byte, - Encoding::ISO_8859_8 => codepoints_1byte, - Encoding::ISO_8859_9 => codepoints_1byte, - Encoding::ISO_8859_10 => codepoints_1byte, - Encoding::ISO_8859_11 => codepoints_1byte, - Encoding::ISO_8859_13 => codepoints_1byte, - Encoding::ISO_8859_14 => 
codepoints_1byte, - Encoding::ISO_8859_15 => codepoints_1byte, - Encoding::ISO_8859_16 => codepoints_1byte, - Encoding::KOI8_R => codepoints_1byte, - Encoding::KOI8_U => codepoints_1byte, - Encoding::MACCENTEURO => codepoints_1byte, - Encoding::MACCROATIAN => codepoints_1byte, - Encoding::MACCYRILLIC => codepoints_1byte, - Encoding::MACGREEK => codepoints_1byte, - Encoding::MACICELAND => codepoints_1byte, - Encoding::MACROMAN => codepoints_1byte, - Encoding::MACROMANIA => codepoints_1byte, - Encoding::MACTHAI => codepoints_1byte, - Encoding::MACTURKISH => codepoints_1byte, - Encoding::MACUKRAINE => codepoints_1byte, - Encoding::TIS_620 => codepoints_1byte, - Encoding::Windows_1250 => codepoints_1byte, - Encoding::Windows_1251 => codepoints_1byte, - Encoding::Windows_1252 => codepoints_1byte, - Encoding::Windows_1253 => codepoints_1byte, - Encoding::Windows_1254 => codepoints_1byte, - Encoding::Windows_1255 => codepoints_1byte, - Encoding::Windows_1256 => codepoints_1byte, - Encoding::Windows_1257 => codepoints_1byte, - Encoding::Windows_1258 => codepoints_1byte, - Encoding::Windows_874 => codepoints_1byte, - Encoding::Big5 => codepoints_2bytes, - Encoding::Big5_HKSCS => codepoints_2bytes, - Encoding::Big5_UAO => codepoints_2bytes, - Encoding::CP949 => codepoints_2bytes, - Encoding::CP950 => codepoints_2bytes, - Encoding::CP951 => codepoints_2bytes, - Encoding::EUC_KR => codepoints_2bytes, - Encoding::GBK => codepoints_2bytes, - Encoding::GB12345 => codepoints_2bytes, - Encoding::GB2312 => codepoints_2bytes, - Encoding::MACJAPANESE => codepoints_2bytes, - Encoding::Shift_JIS => codepoints_2bytes, - Encoding::SJIS_DoCoMo => codepoints_2bytes, - Encoding::SJIS_KDDI => codepoints_2bytes, - Encoding::SJIS_SoftBank => codepoints_2bytes, - Encoding::Windows_31J => codepoints_2bytes + Encoding::ASCII_8BIT => codepoints_1byte, + Encoding::US_ASCII => codepoints_1byte, + Encoding::Windows_1253 => codepoints_1byte } - # By default we don't test every codepoint in these 
encodings because they - # are 3 and 4 byte representations so it can drastically slow down the test - # suite. + # By default we don't test every codepoint in these encodings because it + # takes a very long time. if ENV["PRISM_TEST_ALL_ENCODINGS"] + codepoints_2bytes = 0...0x10000 codepoints_unicode = (0...0x110000) codepoints_eucjp = [ @@ -118,6 +44,78 @@ module Prism ] encodings.merge!( + Encoding::CP850 => codepoints_1byte, + Encoding::CP852 => codepoints_1byte, + Encoding::CP855 => codepoints_1byte, + Encoding::GB1988 => codepoints_1byte, + Encoding::IBM437 => codepoints_1byte, + Encoding::IBM720 => codepoints_1byte, + Encoding::IBM737 => codepoints_1byte, + Encoding::IBM775 => codepoints_1byte, + Encoding::IBM852 => codepoints_1byte, + Encoding::IBM855 => codepoints_1byte, + Encoding::IBM857 => codepoints_1byte, + Encoding::IBM860 => codepoints_1byte, + Encoding::IBM861 => codepoints_1byte, + Encoding::IBM862 => codepoints_1byte, + Encoding::IBM863 => codepoints_1byte, + Encoding::IBM864 => codepoints_1byte, + Encoding::IBM865 => codepoints_1byte, + Encoding::IBM866 => codepoints_1byte, + Encoding::IBM869 => codepoints_1byte, + Encoding::ISO_8859_1 => codepoints_1byte, + Encoding::ISO_8859_2 => codepoints_1byte, + Encoding::ISO_8859_3 => codepoints_1byte, + Encoding::ISO_8859_4 => codepoints_1byte, + Encoding::ISO_8859_5 => codepoints_1byte, + Encoding::ISO_8859_6 => codepoints_1byte, + Encoding::ISO_8859_7 => codepoints_1byte, + Encoding::ISO_8859_8 => codepoints_1byte, + Encoding::ISO_8859_9 => codepoints_1byte, + Encoding::ISO_8859_10 => codepoints_1byte, + Encoding::ISO_8859_11 => codepoints_1byte, + Encoding::ISO_8859_13 => codepoints_1byte, + Encoding::ISO_8859_14 => codepoints_1byte, + Encoding::ISO_8859_15 => codepoints_1byte, + Encoding::ISO_8859_16 => codepoints_1byte, + Encoding::KOI8_R => codepoints_1byte, + Encoding::KOI8_U => codepoints_1byte, + Encoding::MACCENTEURO => codepoints_1byte, + Encoding::MACCROATIAN => codepoints_1byte, + 
Encoding::MACCYRILLIC => codepoints_1byte, + Encoding::MACGREEK => codepoints_1byte, + Encoding::MACICELAND => codepoints_1byte, + Encoding::MACROMAN => codepoints_1byte, + Encoding::MACROMANIA => codepoints_1byte, + Encoding::MACTHAI => codepoints_1byte, + Encoding::MACTURKISH => codepoints_1byte, + Encoding::MACUKRAINE => codepoints_1byte, + Encoding::TIS_620 => codepoints_1byte, + Encoding::Windows_1250 => codepoints_1byte, + Encoding::Windows_1251 => codepoints_1byte, + Encoding::Windows_1252 => codepoints_1byte, + Encoding::Windows_1254 => codepoints_1byte, + Encoding::Windows_1255 => codepoints_1byte, + Encoding::Windows_1256 => codepoints_1byte, + Encoding::Windows_1257 => codepoints_1byte, + Encoding::Windows_1258 => codepoints_1byte, + Encoding::Windows_874 => codepoints_1byte, + Encoding::Big5 => codepoints_2bytes, + Encoding::Big5_HKSCS => codepoints_2bytes, + Encoding::Big5_UAO => codepoints_2bytes, + Encoding::CP949 => codepoints_2bytes, + Encoding::CP950 => codepoints_2bytes, + Encoding::CP951 => codepoints_2bytes, + Encoding::EUC_KR => codepoints_2bytes, + Encoding::GBK => codepoints_2bytes, + Encoding::GB12345 => codepoints_2bytes, + Encoding::GB2312 => codepoints_2bytes, + Encoding::MACJAPANESE => codepoints_2bytes, + Encoding::Shift_JIS => codepoints_2bytes, + Encoding::SJIS_DoCoMo => codepoints_2bytes, + Encoding::SJIS_KDDI => codepoints_2bytes, + Encoding::SJIS_SoftBank => codepoints_2bytes, + Encoding::Windows_31J => codepoints_2bytes, Encoding::UTF_8 => codepoints_unicode, Encoding::UTF8_MAC => codepoints_unicode, Encoding::UTF8_DoCoMo => codepoints_unicode, @@ -136,6 +134,8 @@ module Prism ) end + # These test that we're correctly parsing codepoints for each alias of each + # encoding that prism supports. encodings.each do |encoding, range| encoding.names.each do |name| next if name == "locale" @@ -146,6 +146,17 @@ module Prism end end + # These test that we're correctly setting the flags on strings for each + # encoding that prism supports. 
+ escapes = ["\\x00", "\\x7F", "\\x80", "\\xFF", "\\u{00}", "\\u{7F}", "\\u{80}", "\\M-\\C-?"] + escapes = escapes.concat(escapes.product(escapes).map(&:join)) + + encodings.each_key do |encoding| + define_method(:"test_encoding_flags_#{encoding.name}") do + assert_encoding_flags(encoding, escapes) + end + end + def test_coding result = Prism.parse("# coding: utf-8\n'string'") actual = result.value.statements.body.first.unescaped.encoding @@ -292,5 +303,47 @@ module Prism refute Prism.parse(source).success? end end + + def assert_encoding_flags(encoding, escapes) + escapes.each do |escaped| + source = "# encoding: #{encoding.name}\n\"#{escaped}\"" + + expected = + begin + eval(source).encoding + rescue SyntaxError => error + if error.message.include?("UTF-8 mixed within") + error.message[/: (.+?)\n/, 1] + else + raise + end + end + + actual = + Prism.parse(source).then do |result| + if result.success? + string = result.value.statements.body.first + + if string.forced_utf8_encoding? + Encoding::UTF_8 + elsif string.forced_binary_encoding? 
+ Encoding::ASCII_8BIT + else + encoding + end + else + error = result.errors.first + + if error.message.include?("mixed") + error.message + else + raise error.message + end + end + end + + assert_equal expected, actual + end + end end end diff --git a/test/prism/errors_test.rb b/test/prism/errors_test.rb index 54b710b146..58bb2e3218 100644 --- a/test/prism/errors_test.rb +++ b/test/prism/errors_test.rb @@ -659,7 +659,7 @@ module Prism end def test_do_not_allow_multiple_codepoints_in_a_single_character_literal - expected = StringNode(0, Location(), Location(), nil, "\u0001\u0002") + expected = StringNode(StringFlags::FORCED_UTF8_ENCODING, Location(), Location(), nil, "\u0001\u0002") assert_errors expected, '?\u{0001 0002}', [ ["invalid Unicode escape sequence; multiple codepoints are not allowed in a character literal", 9..12] diff --git a/test/prism/snapshots/arrays.txt b/test/prism/snapshots/arrays.txt index a9adea9627..c3d4ba1e6c 100644 --- a/test/prism/snapshots/arrays.txt +++ b/test/prism/snapshots/arrays.txt @@ -796,6 +796,7 @@ │ ├── closing_loc: (64,16)-(64,17) = "#" │ └── flags: ∅ ├── @ XStringNode (location: (66,0)-(66,17)) + │ ├── flags: ∅ │ ├── opening_loc: (66,0)-(66,3) = "%x#" │ ├── content_loc: (66,3)-(66,16) = "one two three" │ ├── closing_loc: (66,16)-(66,17) = "#" @@ -844,6 +845,7 @@ │ ├── closing_loc: (71,16)-(71,17) = "@" │ └── flags: ∅ ├── @ XStringNode (location: (73,0)-(73,17)) + │ ├── flags: ∅ │ ├── opening_loc: (73,0)-(73,3) = "%x@" │ ├── content_loc: (73,3)-(73,16) = "one two three" │ ├── closing_loc: (73,16)-(73,17) = "@" @@ -892,6 +894,7 @@ │ ├── closing_loc: (78,16)-(78,17) = "}" │ └── flags: ∅ ├── @ XStringNode (location: (80,0)-(80,17)) + │ ├── flags: ∅ │ ├── opening_loc: (80,0)-(80,3) = "%x{" │ ├── content_loc: (80,3)-(80,16) = "one two three" │ ├── closing_loc: (80,16)-(80,17) = "}" diff --git a/test/prism/snapshots/patterns.txt b/test/prism/snapshots/patterns.txt index ee18448206..05f558d7cb 100644 --- 
a/test/prism/snapshots/patterns.txt +++ b/test/prism/snapshots/patterns.txt @@ -178,6 +178,7 @@ │ │ └── flags: variable_call │ ├── pattern: │ │ @ XStringNode (location: (10,7)-(10,12)) + │ │ ├── flags: ∅ │ │ ├── opening_loc: (10,7)-(10,8) = "`" │ │ ├── content_loc: (10,8)-(10,11) = "foo" │ │ ├── closing_loc: (10,11)-(10,12) = "`" @@ -197,6 +198,7 @@ │ │ └── flags: variable_call │ ├── pattern: │ │ @ XStringNode (location: (11,7)-(11,14)) + │ │ ├── flags: ∅ │ │ ├── opening_loc: (11,7)-(11,10) = "%x[" │ │ ├── content_loc: (11,10)-(11,13) = "foo" │ │ ├── closing_loc: (11,13)-(11,14) = "]" @@ -725,12 +727,14 @@ │ │ @ RangeNode (location: (36,7)-(36,21)) │ │ ├── left: │ │ │ @ XStringNode (location: (36,7)-(36,12)) + │ │ │ ├── flags: ∅ │ │ │ ├── opening_loc: (36,7)-(36,8) = "`" │ │ │ ├── content_loc: (36,8)-(36,11) = "foo" │ │ │ ├── closing_loc: (36,11)-(36,12) = "`" │ │ │ └── unescaped: "foo" │ │ ├── right: │ │ │ @ XStringNode (location: (36,16)-(36,21)) + │ │ │ ├── flags: ∅ │ │ │ ├── opening_loc: (36,16)-(36,17) = "`" │ │ │ ├── content_loc: (36,17)-(36,20) = "foo" │ │ │ ├── closing_loc: (36,20)-(36,21) = "`" @@ -754,12 +758,14 @@ │ │ @ RangeNode (location: (37,7)-(37,25)) │ │ ├── left: │ │ │ @ XStringNode (location: (37,7)-(37,14)) + │ │ │ ├── flags: ∅ │ │ │ ├── opening_loc: (37,7)-(37,10) = "%x[" │ │ │ ├── content_loc: (37,10)-(37,13) = "foo" │ │ │ ├── closing_loc: (37,13)-(37,14) = "]" │ │ │ └── unescaped: "foo" │ │ ├── right: │ │ │ @ XStringNode (location: (37,18)-(37,25)) + │ │ │ ├── flags: ∅ │ │ │ ├── opening_loc: (37,18)-(37,21) = "%x[" │ │ │ ├── content_loc: (37,21)-(37,24) = "foo" │ │ │ ├── closing_loc: (37,24)-(37,25) = "]" @@ -2483,6 +2489,7 @@ │ │ └── flags: variable_call │ ├── pattern: │ │ @ XStringNode (location: (109,7)-(109,12)) + │ │ ├── flags: ∅ │ │ ├── opening_loc: (109,7)-(109,8) = "`" │ │ ├── content_loc: (109,8)-(109,11) = "foo" │ │ ├── closing_loc: (109,11)-(109,12) = "`" @@ -2502,6 +2509,7 @@ │ │ └── flags: variable_call │ ├── pattern: │ │ @ 
XStringNode (location: (110,7)-(110,14)) + │ │ ├── flags: ∅ │ │ ├── opening_loc: (110,7)-(110,10) = "%x[" │ │ ├── content_loc: (110,10)-(110,13) = "foo" │ │ ├── closing_loc: (110,13)-(110,14) = "]" @@ -3038,6 +3046,7 @@ │ │ └── @ InNode (location: (136,10)-(136,23)) │ │ ├── pattern: │ │ │ @ XStringNode (location: (136,13)-(136,18)) + │ │ │ ├── flags: ∅ │ │ │ ├── opening_loc: (136,13)-(136,14) = "`" │ │ │ ├── content_loc: (136,14)-(136,17) = "foo" │ │ │ ├── closing_loc: (136,17)-(136,18) = "`" @@ -3064,6 +3073,7 @@ │ │ └── @ InNode (location: (137,10)-(137,25)) │ │ ├── pattern: │ │ │ @ XStringNode (location: (137,13)-(137,20)) + │ │ │ ├── flags: ∅ │ │ │ ├── opening_loc: (137,13)-(137,16) = "%x[" │ │ │ ├── content_loc: (137,16)-(137,19) = "foo" │ │ │ ├── closing_loc: (137,19)-(137,20) = "]" @@ -3828,6 +3838,7 @@ │ │ │ │ @ StatementsNode (location: (163,13)-(163,18)) │ │ │ │ └── body: (length: 1) │ │ │ │ └── @ XStringNode (location: (163,13)-(163,18)) + │ │ │ │ ├── flags: ∅ │ │ │ │ ├── opening_loc: (163,13)-(163,14) = "`" │ │ │ │ ├── content_loc: (163,14)-(163,17) = "foo" │ │ │ │ ├── closing_loc: (163,17)-(163,18) = "`" @@ -3866,6 +3877,7 @@ │ │ │ │ @ StatementsNode (location: (164,13)-(164,20)) │ │ │ │ └── body: (length: 1) │ │ │ │ └── @ XStringNode (location: (164,13)-(164,20)) + │ │ │ │ ├── flags: ∅ │ │ │ │ ├── opening_loc: (164,13)-(164,16) = "%x[" │ │ │ │ ├── content_loc: (164,16)-(164,19) = "foo" │ │ │ │ ├── closing_loc: (164,19)-(164,20) = "]" diff --git a/test/prism/snapshots/seattlerb/case_in.txt b/test/prism/snapshots/seattlerb/case_in.txt index 9134e2cb52..e66b4597b2 100644 --- a/test/prism/snapshots/seattlerb/case_in.txt +++ b/test/prism/snapshots/seattlerb/case_in.txt @@ -806,6 +806,7 @@ │ │ └── @ InNode (location: (98,0)-(98,12)) │ │ ├── pattern: │ │ │ @ XStringNode (location: (98,3)-(98,12)) + │ │ │ ├── flags: ∅ │ │ │ ├── opening_loc: (98,3)-(98,4) = "`" │ │ │ ├── content_loc: (98,4)-(98,11) = "echo hi" │ │ │ ├── closing_loc: (98,11)-(98,12) = "`" diff 
--git a/test/prism/snapshots/seattlerb/heredoc_bad_hex_escape.txt b/test/prism/snapshots/seattlerb/heredoc_bad_hex_escape.txt index e97c1fd7f4..2b1d776404 100644 --- a/test/prism/snapshots/seattlerb/heredoc_bad_hex_escape.txt +++ b/test/prism/snapshots/seattlerb/heredoc_bad_hex_escape.txt @@ -9,7 +9,7 @@ ├── name_loc: (1,0)-(1,1) = "s" ├── value: │ @ StringNode (location: (1,4)-(1,9)) - │ ├── flags: ∅ + │ ├── flags: forced_utf8_encoding │ ├── opening_loc: (1,4)-(1,9) = "<