[ruby/prism] Provide flags for changing encodings

https://github.com/ruby/prism/commit/e838eaff6f
This commit is contained in:
Kevin Newton 2023-12-04 12:51:22 -05:00
Родитель 9620ca6789
Коммит 82f18baa21
31 изменённых файлов: 455 добавлений и 226 удалений

Просмотреть файл

@ -346,6 +346,13 @@ flags:
- name: VARIABLE_CALL - name: VARIABLE_CALL
comment: "a call that could have been a local variable" comment: "a call that could have been a local variable"
comment: Flags for call nodes. comment: Flags for call nodes.
- name: EncodingFlags
values:
- name: FORCED_UTF8_ENCODING
comment: "internal bytes forced the encoding to UTF-8"
- name: FORCED_BINARY_ENCODING
comment: "internal bytes forced the encoding to binary"
comment: Flags for nodes that have unescaped content.
- name: IntegerBaseFlags - name: IntegerBaseFlags
values: values:
- name: BINARY - name: BINARY
@ -388,6 +395,10 @@ flags:
comment: Flags for regular expression and match last line nodes. comment: Flags for regular expression and match last line nodes.
- name: StringFlags - name: StringFlags
values: values:
- name: FORCED_UTF8_ENCODING
comment: "internal bytes forced the encoding to UTF-8"
- name: FORCED_BINARY_ENCODING
comment: "internal bytes forced the encoding to binary"
- name: FROZEN - name: FROZEN
comment: "frozen by virtue of a `frozen_string_literal` comment" comment: "frozen by virtue of a `frozen_string_literal` comment"
comment: Flags for string nodes. comment: Flags for string nodes.
@ -2576,6 +2587,9 @@ nodes:
^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^
- name: XStringNode - name: XStringNode
fields: fields:
- name: flags
type: flags
kind: EncodingFlags
- name: opening_loc - name: opening_loc
type: location type: location
- name: content_loc - name: content_loc

Просмотреть файл

@ -74,4 +74,21 @@
# define snprintf _snprintf # define snprintf _snprintf
#endif #endif
/**
* A simple utility macro to concatenate two tokens together, necessary when one
* of the tokens is itself a macro.
*
* Note that the ## operator pastes its operands without macro-expanding them
* first. A macro such as __LINE__ is therefore only expanded if it reaches this
* macro through the parameter of an enclosing macro, which is how
* PM_STATIC_ASSERT forwards its line argument.
*/
#define PM_CONCATENATE(left, right) left ## right
/**
* We want to be able to use static assertions, but they weren't standardized
* until C11. When the compiler reports C11 or later through __STDC_VERSION__ we
* use the _Static_assert keyword directly. Otherwise we polyfill it by making a
* hacky typedef that will fail to compile due to a negative array size if the
* condition is false.
*
* _Static_assert is a keyword rather than a macro, so its availability cannot
* be detected with defined(); the language version has to be checked instead.
*
* line: a token that is unique per use (typically __LINE__). It is only used by
*     the fallback, to give each typedef a distinct name.
* condition: an integer constant expression that must be non-zero.
* message: a string literal describing the failure. It is ignored by the
*     fallback.
*/
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
# define PM_STATIC_ASSERT(line, condition, message) _Static_assert(condition, message)
#else
# define PM_STATIC_ASSERT(line, condition, message) typedef char PM_CONCATENATE(static_assert_, line)[(condition) ? 1 : -1]
#endif
#endif #endif

Просмотреть файл

@ -185,6 +185,7 @@ static const char* const diagnostic_messages[PM_DIAGNOSTIC_ID_LEN] = {
[PM_ERR_LIST_W_UPPER_ELEMENT] = "expected a string in a `%W` list", [PM_ERR_LIST_W_UPPER_ELEMENT] = "expected a string in a `%W` list",
[PM_ERR_LIST_W_UPPER_TERM] = "expected a closing delimiter for the `%W` list", [PM_ERR_LIST_W_UPPER_TERM] = "expected a closing delimiter for the `%W` list",
[PM_ERR_MALLOC_FAILED] = "failed to allocate memory", [PM_ERR_MALLOC_FAILED] = "failed to allocate memory",
[PM_ERR_MIXED_ENCODING] = "UTF-8 mixed within %s source",
[PM_ERR_MODULE_IN_METHOD] = "unexpected module definition in a method definition", [PM_ERR_MODULE_IN_METHOD] = "unexpected module definition in a method definition",
[PM_ERR_MODULE_NAME] = "expected a constant name after `module`", [PM_ERR_MODULE_NAME] = "expected a constant name after `module`",
[PM_ERR_MODULE_TERM] = "expected an `end` to close the `module` statement", [PM_ERR_MODULE_TERM] = "expected an `end` to close the `module` statement",

Просмотреть файл

@ -177,6 +177,7 @@ typedef enum {
PM_ERR_LIST_W_UPPER_ELEMENT, PM_ERR_LIST_W_UPPER_ELEMENT,
PM_ERR_LIST_W_UPPER_TERM, PM_ERR_LIST_W_UPPER_TERM,
PM_ERR_MALLOC_FAILED, PM_ERR_MALLOC_FAILED,
PM_ERR_MIXED_ENCODING,
PM_ERR_MODULE_IN_METHOD, PM_ERR_MODULE_IN_METHOD,
PM_ERR_MODULE_NAME, PM_ERR_MODULE_NAME,
PM_ERR_MODULE_TERM, PM_ERR_MODULE_TERM,

Просмотреть файл

@ -4212,9 +4212,9 @@ pm_encoding_shift_jis_char_width(const uint8_t *b, ptrdiff_t n) {
} }
/** /**
* This is the definition of all of the encodings that we support. * This is the table of all of the encodings that prism supports.
*/ */
static const pm_encoding_t pm_encodings[] = { const pm_encoding_t pm_encodings[] = {
[PM_ENCODING_UTF_8] = { [PM_ENCODING_UTF_8] = {
.name = "UTF-8", .name = "UTF-8",
.char_width = pm_encoding_utf_8_char_width, .char_width = pm_encoding_utf_8_char_width,
@ -4223,14 +4223,6 @@ static const pm_encoding_t pm_encodings[] = {
.isupper_char = pm_encoding_utf_8_isupper_char, .isupper_char = pm_encoding_utf_8_isupper_char,
.multibyte = true .multibyte = true
}, },
[PM_ENCODING_ASCII] = {
.name = "US-ASCII",
.char_width = pm_encoding_ascii_char_width,
.alnum_char = pm_encoding_ascii_alnum_char,
.alpha_char = pm_encoding_ascii_alpha_char,
.isupper_char = pm_encoding_ascii_isupper_char,
.multibyte = false
},
[PM_ENCODING_ASCII_8BIT] = { [PM_ENCODING_ASCII_8BIT] = {
.name = "ASCII-8BIT", .name = "ASCII-8BIT",
.char_width = pm_encoding_single_char_width, .char_width = pm_encoding_single_char_width,
@ -4815,6 +4807,14 @@ static const pm_encoding_t pm_encodings[] = {
.isupper_char = pm_encoding_tis_620_isupper_char, .isupper_char = pm_encoding_tis_620_isupper_char,
.multibyte = false .multibyte = false
}, },
[PM_ENCODING_US_ASCII] = {
.name = "US-ASCII",
.char_width = pm_encoding_ascii_char_width,
.alnum_char = pm_encoding_ascii_alnum_char,
.alpha_char = pm_encoding_ascii_alpha_char,
.isupper_char = pm_encoding_ascii_isupper_char,
.multibyte = false
},
[PM_ENCODING_UTF8_MAC] = { [PM_ENCODING_UTF8_MAC] = {
.name = "UTF8-MAC", .name = "UTF8-MAC",
.char_width = pm_encoding_utf_8_char_width, .char_width = pm_encoding_utf_8_char_width,
@ -4937,11 +4937,6 @@ static const pm_encoding_t pm_encodings[] = {
} }
}; };
/**
* This is the default UTF-8 encoding. We need it to quickly create parsers.
*/
const pm_encoding_t *pm_encoding_utf_8 = pm_encodings;
/** /**
* Parse the given name of an encoding and return a pointer to the corresponding * Parse the given name of an encoding and return a pointer to the corresponding
* encoding struct if one can be found, otherwise return NULL. * encoding struct if one can be found, otherwise return NULL.
@ -4961,7 +4956,7 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
} }
// Otherwise we'll return the default UTF-8 encoding. // Otherwise we'll return the default UTF-8 encoding.
return pm_encoding_utf_8; return PM_ENCODING_UTF_8_ENTRY;
} }
// Next, we're going to loop through each of the encodings that we handle // Next, we're going to loop through each of the encodings that we handle
@ -4972,9 +4967,9 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
if (width >= 3) { if (width >= 3) {
switch (*start) { switch (*start) {
case 'A': case 'a': case 'A': case 'a':
ENCODING1("ASCII", PM_ENCODING_ASCII); ENCODING1("ASCII", PM_ENCODING_US_ASCII);
ENCODING1("ASCII-8BIT", PM_ENCODING_ASCII_8BIT); ENCODING1("ASCII-8BIT", PM_ENCODING_ASCII_8BIT);
ENCODING1("ANSI_X3.4-1968", PM_ENCODING_ASCII); ENCODING1("ANSI_X3.4-1968", PM_ENCODING_US_ASCII);
break; break;
case 'B': case 'b': case 'B': case 'b':
ENCODING1("BINARY", PM_ENCODING_ASCII_8BIT); ENCODING1("BINARY", PM_ENCODING_ASCII_8BIT);
@ -5109,7 +5104,7 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
ENCODING1("TIS-620", PM_ENCODING_TIS_620); ENCODING1("TIS-620", PM_ENCODING_TIS_620);
break; break;
case 'U': case 'u': case 'U': case 'u':
ENCODING1("US-ASCII", PM_ENCODING_ASCII); ENCODING1("US-ASCII", PM_ENCODING_US_ASCII);
ENCODING2("UTF8-MAC", "UTF-8-HFS", PM_ENCODING_UTF8_MAC); ENCODING2("UTF8-MAC", "UTF-8-HFS", PM_ENCODING_UTF8_MAC);
ENCODING1("UTF8-DoCoMo", PM_ENCODING_UTF8_DOCOMO); ENCODING1("UTF8-DoCoMo", PM_ENCODING_UTF8_DOCOMO);
ENCODING1("UTF8-KDDI", PM_ENCODING_UTF8_KDDI); ENCODING1("UTF8-KDDI", PM_ENCODING_UTF8_KDDI);
@ -5129,7 +5124,7 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
ENCODING1("Windows-1258", PM_ENCODING_WINDOWS_1258); ENCODING1("Windows-1258", PM_ENCODING_WINDOWS_1258);
break; break;
case '6': case '6':
ENCODING1("646", PM_ENCODING_ASCII); ENCODING1("646", PM_ENCODING_US_ASCII);
break; break;
} }
} }

Просмотреть файл

@ -125,7 +125,6 @@ extern const uint8_t pm_encoding_unicode_table[256];
*/ */
typedef enum { typedef enum {
PM_ENCODING_UTF_8 = 0, PM_ENCODING_UTF_8 = 0,
PM_ENCODING_ASCII,
PM_ENCODING_ASCII_8BIT, PM_ENCODING_ASCII_8BIT,
PM_ENCODING_BIG5, PM_ENCODING_BIG5,
PM_ENCODING_BIG5_HKSCS, PM_ENCODING_BIG5_HKSCS,
@ -199,6 +198,7 @@ typedef enum {
PM_ENCODING_STATELESS_ISO_2022_JP, PM_ENCODING_STATELESS_ISO_2022_JP,
PM_ENCODING_STATELESS_ISO_2022_JP_KDDI, PM_ENCODING_STATELESS_ISO_2022_JP_KDDI,
PM_ENCODING_TIS_620, PM_ENCODING_TIS_620,
PM_ENCODING_US_ASCII,
PM_ENCODING_UTF8_MAC, PM_ENCODING_UTF8_MAC,
PM_ENCODING_UTF8_DOCOMO, PM_ENCODING_UTF8_DOCOMO,
PM_ENCODING_UTF8_KDDI, PM_ENCODING_UTF8_KDDI,
@ -213,13 +213,27 @@ typedef enum {
PM_ENCODING_WINDOWS_1257, PM_ENCODING_WINDOWS_1257,
PM_ENCODING_WINDOWS_1258, PM_ENCODING_WINDOWS_1258,
PM_ENCODING_WINDOWS_31J, PM_ENCODING_WINDOWS_31J,
PM_ENCODING_WINDOWS_874 PM_ENCODING_WINDOWS_874,
PM_ENCODING_MAXIMUM
} pm_encoding_type_t; } pm_encoding_type_t;
/** /**
* This is the default UTF-8 encoding. We need it to quickly create parsers. * This is the table of all of the encodings that prism supports.
*/ */
extern const pm_encoding_t *pm_encoding_utf_8; extern const pm_encoding_t pm_encodings[PM_ENCODING_MAXIMUM];
/**
* This is the default UTF-8 encoding. We need a reference to it to quickly
* create parsers.
*
* Expands to the address of the PM_ENCODING_UTF_8 slot of the pm_encodings
* table, so the result can be compared against other encoding pointers.
*/
#define PM_ENCODING_UTF_8_ENTRY (&pm_encodings[PM_ENCODING_UTF_8])
/**
* This is the US-ASCII encoding. We need a reference to it to be able to
* compare against it when a string is being created because it could possibly
* need to fall back to ASCII-8BIT.
*
* Expands to the address of the PM_ENCODING_US_ASCII slot of the pm_encodings
* table.
*/
#define PM_ENCODING_US_ASCII_ENTRY (&pm_encodings[PM_ENCODING_US_ASCII])
/** /**
* Parse the given name of an encoding and return a pointer to the corresponding * Parse the given name of an encoding and return a pointer to the corresponding

Просмотреть файл

@ -469,7 +469,7 @@ parse_lex_token(void *data, pm_parser_t *parser, pm_token_t *token) {
static void static void
parse_lex_encoding_changed_callback(pm_parser_t *parser) { parse_lex_encoding_changed_callback(pm_parser_t *parser) {
parse_lex_data_t *parse_lex_data = (parse_lex_data_t *) parser->lex_callback->data; parse_lex_data_t *parse_lex_data = (parse_lex_data_t *) parser->lex_callback->data;
parse_lex_data->encoding = rb_enc_find(parser->encoding.name); parse_lex_data->encoding = rb_enc_find(parser->encoding->name);
// Since the encoding changed, we need to go back and change the encoding of // Since the encoding changed, we need to go back and change the encoding of
// the tokens that were already lexed. This is only going to end up being // the tokens that were already lexed. This is only going to end up being
@ -599,7 +599,7 @@ parse_input(pm_string_t *input, const pm_options_t *options) {
pm_parser_init(&parser, pm_string_source(input), pm_string_length(input), options); pm_parser_init(&parser, pm_string_source(input), pm_string_length(input), options);
pm_node_t *node = pm_parse(&parser); pm_node_t *node = pm_parse(&parser);
rb_encoding *encoding = rb_enc_find(parser.encoding.name); rb_encoding *encoding = rb_enc_find(parser.encoding->name);
VALUE source = pm_source_new(&parser, encoding); VALUE source = pm_source_new(&parser, encoding);
VALUE result_argv[] = { VALUE result_argv[] = {
@ -693,7 +693,7 @@ parse_input_comments(pm_string_t *input, const pm_options_t *options) {
pm_parser_init(&parser, pm_string_source(input), pm_string_length(input), options); pm_parser_init(&parser, pm_string_source(input), pm_string_length(input), options);
pm_node_t *node = pm_parse(&parser); pm_node_t *node = pm_parse(&parser);
rb_encoding *encoding = rb_enc_find(parser.encoding.name); rb_encoding *encoding = rb_enc_find(parser.encoding->name);
VALUE source = pm_source_new(&parser, encoding); VALUE source = pm_source_new(&parser, encoding);
VALUE comments = parser_comments(&parser, source); VALUE comments = parser_comments(&parser, source);
@ -872,7 +872,7 @@ static VALUE
named_captures(VALUE self, VALUE source) { named_captures(VALUE self, VALUE source) {
pm_string_list_t string_list = { 0 }; pm_string_list_t string_list = { 0 };
if (!pm_regexp_named_capture_group_names((const uint8_t *) RSTRING_PTR(source), RSTRING_LEN(source), &string_list, false, pm_encoding_utf_8)) { if (!pm_regexp_named_capture_group_names((const uint8_t *) RSTRING_PTR(source), RSTRING_LEN(source), &string_list, false, PM_ENCODING_UTF_8_ENTRY)) {
pm_string_list_free(&string_list); pm_string_list_free(&string_list);
return Qnil; return Qnil;
} }
@ -962,7 +962,7 @@ inspect_node(VALUE self, VALUE source) {
pm_prettyprint(&buffer, &parser, node); pm_prettyprint(&buffer, &parser, node);
rb_encoding *encoding = rb_enc_find(parser.encoding.name); rb_encoding *encoding = rb_enc_find(parser.encoding->name);
VALUE string = rb_enc_str_new(pm_buffer_value(&buffer), pm_buffer_length(&buffer), encoding); VALUE string = rb_enc_str_new(pm_buffer_value(&buffer), pm_buffer_length(&buffer), encoding);
pm_buffer_free(&buffer); pm_buffer_free(&buffer);

Просмотреть файл

@ -523,12 +523,6 @@ struct pm_parser {
size_t index; size_t index;
} lex_modes; } lex_modes;
/**
* The common_whitespace value from the most-recently-popped heredoc mode of the lexer, so we
* can dedent the heredoc after popping the lex mode.
*/
size_t current_string_common_whitespace;
/** The pointer to the start of the source. */ /** The pointer to the start of the source. */
const uint8_t *start; const uint8_t *start;
@ -581,7 +575,7 @@ struct pm_parser {
* The encoding functions for the current file are attached to the parser as * The encoding functions for the current file are attached to the parser as
* it's parsing so that it can change with a magic comment. * it's parsing so that it can change with a magic comment.
*/ */
pm_encoding_t encoding; const pm_encoding_t *encoding;
/** /**
* When the encoding that is being used to parse the source is changed by * When the encoding that is being used to parse the source is changed by
@ -637,6 +631,37 @@ struct pm_parser {
*/ */
int32_t start_line; int32_t start_line;
/**
* When a string-like expression is being lexed, any byte or escape sequence
* that resolves to a value whose top bit is set (i.e., >= 0x80) will
* explicitly set the encoding to the same encoding as the source.
* Alternatively, if a unicode escape sequence is used (e.g., \\u{80}) that
* resolves to a value whose top bit is set, then the encoding will be
* explicitly set to UTF-8.
*
* The _next_ time this happens, if the encoding that is about to become the
* explicitly set encoding does not match the previously set explicit
* encoding, a mixed encoding error will be emitted.
*
* When the expression is finished being lexed, the explicit encoding
* controls the encoding of the expression. For the most part this means
* that the expression will either be encoded in the source encoding or
* UTF-8. This holds for all encodings except US-ASCII. If the source is
* US-ASCII and an explicit encoding was set that was _not_ UTF-8, then the
* expression will be encoded as ASCII-8BIT.
*
* Note that if the expression is a list, different elements within the same
* list can have different encodings, so this will get reset between each
* element. Furthermore all of this only applies to lists that support
* interpolation, because otherwise escapes that could change the encoding
* are ignored.
*
* At first glance, it may make more sense for this to live on the lexer
* mode, but we need it here to communicate back to the parser for character
* literals that do not push a new lexer mode.
*/
const pm_encoding_t *explicit_encoding;
/** Whether or not we're at the beginning of a command. */ /** Whether or not we're at the beginning of a command. */
bool command_start; bool command_start;

Просмотреть файл

@ -275,6 +275,7 @@ lex_mode_push_list(pm_parser_t *parser, bool interpolation, uint8_t delimiter) {
breakpoints[index++] = incrementor; breakpoints[index++] = incrementor;
} }
parser->explicit_encoding = NULL;
return lex_mode_push(parser, lex_mode); return lex_mode_push(parser, lex_mode);
} }
@ -356,6 +357,7 @@ lex_mode_push_string(pm_parser_t *parser, bool interpolation, bool label_allowed
breakpoints[index++] = incrementor; breakpoints[index++] = incrementor;
} }
parser->explicit_encoding = NULL;
return lex_mode_push(parser, lex_mode); return lex_mode_push(parser, lex_mode);
} }
@ -539,7 +541,7 @@ pm_parser_err_token(pm_parser_t *parser, const pm_token_t *token, pm_diagnostic_
* Append an error to the list of errors on the parser using the location of the * Append an error to the list of errors on the parser using the location of the
* given token and a format string. * given token and a format string.
*/ */
#define PM_PARSER_ERR_TOKEN_FORMAT(parser, token, diag_id, ...) pm_diagnostic_list_append_format(&parser->error_list, token->start, token->end, diag_id, __VA_ARGS__) #define PM_PARSER_ERR_TOKEN_FORMAT(parser, token, diag_id, ...) pm_diagnostic_list_append_format(&parser->error_list, (token).start, (token).end, diag_id, __VA_ARGS__)
/** /**
* Append a warning to the list of warnings on the parser. * Append a warning to the list of warnings on the parser.
@ -5714,6 +5716,7 @@ pm_xstring_node_create_unescaped(pm_parser_t *parser, const pm_token_t *opening,
*node = (pm_x_string_node_t) { *node = (pm_x_string_node_t) {
{ {
.type = PM_X_STRING_NODE, .type = PM_X_STRING_NODE,
.flags = PM_STRING_FLAGS_FROZEN,
.location = { .location = {
.start = opening->start, .start = opening->start,
.end = closing->end .end = closing->end
@ -5922,12 +5925,12 @@ static inline size_t
char_is_identifier_start(pm_parser_t *parser, const uint8_t *b) { char_is_identifier_start(pm_parser_t *parser, const uint8_t *b) {
if (parser->encoding_changed) { if (parser->encoding_changed) {
size_t width; size_t width;
if ((width = parser->encoding.alpha_char(b, parser->end - b)) != 0) { if ((width = parser->encoding->alpha_char(b, parser->end - b)) != 0) {
return width; return width;
} else if (*b == '_') { } else if (*b == '_') {
return 1; return 1;
} else if (*b >= 0x80) { } else if (*b >= 0x80) {
return parser->encoding.char_width(b, parser->end - b); return parser->encoding->char_width(b, parser->end - b);
} else { } else {
return 0; return 0;
} }
@ -5960,12 +5963,12 @@ static inline size_t
char_is_identifier(pm_parser_t *parser, const uint8_t *b) { char_is_identifier(pm_parser_t *parser, const uint8_t *b) {
if (parser->encoding_changed) { if (parser->encoding_changed) {
size_t width; size_t width;
if ((width = parser->encoding.alnum_char(b, parser->end - b)) != 0) { if ((width = parser->encoding->alnum_char(b, parser->end - b)) != 0) {
return width; return width;
} else if (*b == '_') { } else if (*b == '_') {
return 1; return 1;
} else if (*b >= 0x80) { } else if (*b >= 0x80) {
return parser->encoding.char_width(b, parser->end - b); return parser->encoding->char_width(b, parser->end - b);
} else { } else {
return 0; return 0;
} }
@ -6148,8 +6151,8 @@ parser_lex_magic_comment_encoding_value(pm_parser_t *parser, const uint8_t *star
const pm_encoding_t *encoding = pm_encoding_find(start, end); const pm_encoding_t *encoding = pm_encoding_find(start, end);
if (encoding != NULL) { if (encoding != NULL) {
if (encoding != pm_encoding_utf_8) { if (encoding != PM_ENCODING_UTF_8_ENTRY) {
parser->encoding = *encoding; parser->encoding = encoding;
parser->encoding_changed = true; parser->encoding_changed = true;
if (parser->encoding_changed_callback != NULL) parser->encoding_changed_callback(parser); if (parser->encoding_changed_callback != NULL) parser->encoding_changed_callback(parser);
} }
@ -6205,7 +6208,7 @@ parser_lex_magic_comment_encoding(pm_parser_t *parser) {
} }
const uint8_t *value_start = cursor; const uint8_t *value_start = cursor;
while ((*cursor == '-' || *cursor == '_' || parser->encoding.alnum_char(cursor, 1)) && ++cursor < end); while ((*cursor == '-' || *cursor == '_' || parser->encoding->alnum_char(cursor, 1)) && ++cursor < end);
if (!parser_lex_magic_comment_encoding_value(parser, value_start, cursor)) { if (!parser_lex_magic_comment_encoding_value(parser, value_start, cursor)) {
// If we were unable to parse the encoding value, then we've got an // If we were unable to parse the encoding value, then we've got an
@ -6239,7 +6242,7 @@ pm_char_is_magic_comment_key_delimiter(const uint8_t b) {
*/ */
static inline const uint8_t * static inline const uint8_t *
parser_lex_magic_comment_emacs_marker(pm_parser_t *parser, const uint8_t *cursor, const uint8_t *end) { parser_lex_magic_comment_emacs_marker(pm_parser_t *parser, const uint8_t *cursor, const uint8_t *end) {
while ((cursor + 3 <= end) && (cursor = pm_memchr(cursor, '-', (size_t) (end - cursor), parser->encoding_changed, &parser->encoding)) != NULL) { while ((cursor + 3 <= end) && (cursor = pm_memchr(cursor, '-', (size_t) (end - cursor), parser->encoding_changed, parser->encoding)) != NULL) {
if (cursor + 3 <= end && cursor[1] == '*' && cursor[2] == '-') { if (cursor + 3 <= end && cursor[1] == '*' && cursor[2] == '-') {
return cursor; return cursor;
} }
@ -6329,7 +6332,7 @@ parser_lex_magic_comment(pm_parser_t *parser, bool semantic_token_seen) {
// underscores. We only need to do this if there _is_ a dash in the key. // underscores. We only need to do this if there _is_ a dash in the key.
pm_string_t key; pm_string_t key;
const size_t key_length = (size_t) (key_end - key_start); const size_t key_length = (size_t) (key_end - key_start);
const uint8_t *dash = pm_memchr(key_start, '-', (size_t) key_length, parser->encoding_changed, &parser->encoding); const uint8_t *dash = pm_memchr(key_start, '-', (size_t) key_length, parser->encoding_changed, parser->encoding);
if (dash == NULL) { if (dash == NULL) {
pm_string_shared_init(&key, key_start, key_end); pm_string_shared_init(&key, key_start, key_end);
@ -6341,7 +6344,7 @@ parser_lex_magic_comment(pm_parser_t *parser, bool semantic_token_seen) {
memcpy(buffer, key_start, width); memcpy(buffer, key_start, width);
buffer[dash - key_start] = '_'; buffer[dash - key_start] = '_';
while ((dash = pm_memchr(dash + 1, '-', (size_t) (key_end - dash - 1), parser->encoding_changed, &parser->encoding)) != NULL) { while ((dash = pm_memchr(dash + 1, '-', (size_t) (key_end - dash - 1), parser->encoding_changed, parser->encoding)) != NULL) {
buffer[dash - key_start] = '_'; buffer[dash - key_start] = '_';
} }
@ -7000,7 +7003,7 @@ lex_identifier(pm_parser_t *parser, bool previous_command_start) {
} }
if (encoding_changed) { if (encoding_changed) {
return parser->encoding.isupper_char(current_start, end - current_start) ? PM_TOKEN_CONSTANT : PM_TOKEN_IDENTIFIER; return parser->encoding->isupper_char(current_start, end - current_start) ? PM_TOKEN_CONSTANT : PM_TOKEN_IDENTIFIER;
} }
return pm_encoding_utf_8_isupper_char(current_start, end - current_start) ? PM_TOKEN_CONSTANT : PM_TOKEN_IDENTIFIER; return pm_encoding_utf_8_isupper_char(current_start, end - current_start) ? PM_TOKEN_CONSTANT : PM_TOKEN_IDENTIFIER;
} }
@ -7214,7 +7217,18 @@ escape_byte(uint8_t value, const uint8_t flags) {
* Write a unicode codepoint to the given buffer. * Write a unicode codepoint to the given buffer.
*/ */
static inline void static inline void
escape_write_unicode(pm_parser_t *parser, pm_buffer_t *buffer, const uint8_t *start, const uint8_t *end, uint32_t value) { escape_write_unicode(pm_parser_t *parser, pm_buffer_t *buffer, const uint8_t flags, const uint8_t *start, const uint8_t *end, uint32_t value) {
// \u escape sequences in string-like structures implicitly change the
// encoding to UTF-8 if they are >= 0x80 or if they are used in a character
// literal.
if (value >= 0x80 || flags & PM_ESCAPE_FLAG_SINGLE) {
if (parser->explicit_encoding != NULL && parser->explicit_encoding != PM_ENCODING_UTF_8_ENTRY) {
PM_PARSER_ERR_FORMAT(parser, start, end, PM_ERR_MIXED_ENCODING, parser->explicit_encoding->name);
}
parser->explicit_encoding = PM_ENCODING_UTF_8_ENTRY;
}
if (value <= 0x7F) { // 0xxxxxxx if (value <= 0x7F) { // 0xxxxxxx
pm_buffer_append_byte(buffer, (uint8_t) value); pm_buffer_append_byte(buffer, (uint8_t) value);
} else if (value <= 0x7FF) { // 110xxxxx 10xxxxxx } else if (value <= 0x7FF) { // 110xxxxx 10xxxxxx
@ -7237,6 +7251,23 @@ escape_write_unicode(pm_parser_t *parser, pm_buffer_t *buffer, const uint8_t *st
} }
} }
/**
* Append a single byte to the unescape buffer. A non-ASCII byte (one whose top
* bit is set) pins the explicit encoding of the parser to the encoding of the
* source. If the explicit encoding had previously been set to UTF-8 while the
* source itself is in some other encoding, a mixed encoding error naming the
* source encoding is reported before the explicit encoding is replaced.
*/
static inline void
escape_write_byte_encoded(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t byte) {
    if (byte & 0x80) {
        const pm_encoding_t *utf8 = PM_ENCODING_UTF_8_ENTRY;

        // The UTF-8 entry is the address of a table element and so is never
        // NULL, which means a single equality check also covers the case of
        // no explicit encoding having been set yet.
        if (parser->explicit_encoding == utf8 && parser->encoding != utf8) {
            PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_MIXED_ENCODING, parser->encoding->name);
        }

        parser->explicit_encoding = parser->encoding;
    }

    pm_buffer_append_byte(buffer, byte);
}
/** /**
* The regular expression engine doesn't support the same escape sequences as * The regular expression engine doesn't support the same escape sequences as
* Ruby does. So first we have to read the escape sequence, and then we have to * Ruby does. So first we have to read the escape sequence, and then we have to
@ -7253,7 +7284,7 @@ escape_write_unicode(pm_parser_t *parser, pm_buffer_t *buffer, const uint8_t *st
* source so that the regular expression engine will perform its own unescaping. * source so that the regular expression engine will perform its own unescaping.
*/ */
static inline void static inline void
escape_write_byte(pm_buffer_t *buffer, uint8_t flags, uint8_t byte) { escape_write_byte(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags, uint8_t byte) {
if (flags & PM_ESCAPE_FLAG_REGEXP) { if (flags & PM_ESCAPE_FLAG_REGEXP) {
pm_buffer_append_bytes(buffer, (const uint8_t *) "\\x", 2); pm_buffer_append_bytes(buffer, (const uint8_t *) "\\x", 2);
@ -7272,7 +7303,7 @@ escape_write_byte(pm_buffer_t *buffer, uint8_t flags, uint8_t byte) {
pm_buffer_append_byte(buffer, (uint8_t) (byte2 + '0')); pm_buffer_append_byte(buffer, (uint8_t) (byte2 + '0'));
} }
} else { } else {
pm_buffer_append_byte(buffer, byte); escape_write_byte_encoded(parser, buffer, byte);
} }
} }
@ -7351,7 +7382,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) {
} }
} }
pm_buffer_append_byte(buffer, value); escape_write_byte_encoded(parser, buffer, value);
return; return;
} }
case 'x': { case 'x': {
@ -7373,7 +7404,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) {
if (flags & PM_ESCAPE_FLAG_REGEXP) { if (flags & PM_ESCAPE_FLAG_REGEXP) {
pm_buffer_append_bytes(buffer, start, (size_t) (parser->current.end - start)); pm_buffer_append_bytes(buffer, start, (size_t) (parser->current.end - start));
} else { } else {
pm_buffer_append_byte(buffer, value); escape_write_byte_encoded(parser, buffer, value);
} }
} else { } else {
pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_HEXADECIMAL); pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_HEXADECIMAL);
@ -7397,7 +7428,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) {
if (flags & PM_ESCAPE_FLAG_REGEXP) { if (flags & PM_ESCAPE_FLAG_REGEXP) {
pm_buffer_append_bytes(buffer, start, (size_t) (parser->current.end + 4 - start)); pm_buffer_append_bytes(buffer, start, (size_t) (parser->current.end + 4 - start));
} else { } else {
escape_write_unicode(parser, buffer, start, parser->current.end + 4, value); escape_write_unicode(parser, buffer, flags, start, parser->current.end + 4, value);
} }
parser->current.end += 4; parser->current.end += 4;
@ -7431,13 +7462,14 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) {
if (!(flags & PM_ESCAPE_FLAG_REGEXP)) { if (!(flags & PM_ESCAPE_FLAG_REGEXP)) {
uint32_t value = escape_unicode(unicode_start, hexadecimal_length); uint32_t value = escape_unicode(unicode_start, hexadecimal_length);
escape_write_unicode(parser, buffer, unicode_start, parser->current.end, value); escape_write_unicode(parser, buffer, flags, unicode_start, parser->current.end, value);
} }
parser->current.end += pm_strspn_whitespace(parser->current.end, parser->end - parser->current.end); parser->current.end += pm_strspn_whitespace(parser->current.end, parser->end - parser->current.end);
} }
// ?\u{nnnn} character literal should contain only one codepoint and cannot be like ?\u{nnnn mmmm} // ?\u{nnnn} character literal should contain only one codepoint
// and cannot be like ?\u{nnnn mmmm}.
if (flags & PM_ESCAPE_FLAG_SINGLE && codepoints_count > 1) { if (flags & PM_ESCAPE_FLAG_SINGLE && codepoints_count > 1) {
pm_parser_err(parser, extra_codepoints_start, parser->current.end - 1, PM_ERR_ESCAPE_INVALID_UNICODE_LITERAL); pm_parser_err(parser, extra_codepoints_start, parser->current.end - 1, PM_ERR_ESCAPE_INVALID_UNICODE_LITERAL);
} }
@ -7468,7 +7500,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) {
switch (peeked) { switch (peeked) {
case '?': { case '?': {
parser->current.end++; parser->current.end++;
escape_write_byte(buffer, flags, escape_byte(0x7f, flags)); escape_write_byte(parser, buffer, flags, escape_byte(0x7f, flags));
return; return;
} }
case '\\': case '\\':
@ -7486,7 +7518,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) {
} }
parser->current.end++; parser->current.end++;
escape_write_byte(buffer, flags, escape_byte(peeked, flags | PM_ESCAPE_FLAG_CONTROL)); escape_write_byte(parser, buffer, flags, escape_byte(peeked, flags | PM_ESCAPE_FLAG_CONTROL));
return; return;
} }
} }
@ -7508,7 +7540,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) {
switch (peeked) { switch (peeked) {
case '?': { case '?': {
parser->current.end++; parser->current.end++;
escape_write_byte(buffer, flags, escape_byte(0x7f, flags)); escape_write_byte(parser, buffer, flags, escape_byte(0x7f, flags));
return; return;
} }
case '\\': case '\\':
@ -7526,7 +7558,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) {
} }
parser->current.end++; parser->current.end++;
escape_write_byte(buffer, flags, escape_byte(peeked, flags | PM_ESCAPE_FLAG_CONTROL)); escape_write_byte(parser, buffer, flags, escape_byte(peeked, flags | PM_ESCAPE_FLAG_CONTROL));
return; return;
} }
} }
@ -7561,7 +7593,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) {
} }
parser->current.end++; parser->current.end++;
escape_write_byte(buffer, flags, escape_byte(peeked, flags | PM_ESCAPE_FLAG_META)); escape_write_byte(parser, buffer, flags, escape_byte(peeked, flags | PM_ESCAPE_FLAG_META));
return; return;
} }
case '\r': { case '\r': {
@ -7574,7 +7606,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) {
/* fallthrough */ /* fallthrough */
default: { default: {
if (parser->current.end < parser->end) { if (parser->current.end < parser->end) {
pm_buffer_append_byte(buffer, *parser->current.end++); escape_write_byte_encoded(parser, buffer, *parser->current.end++);
} }
return; return;
} }
@ -7637,13 +7669,12 @@ lex_question_mark(pm_parser_t *parser) {
return PM_TOKEN_CHARACTER_LITERAL; return PM_TOKEN_CHARACTER_LITERAL;
} else { } else {
size_t encoding_width = parser->encoding.char_width(parser->current.end, parser->end - parser->current.end); size_t encoding_width = parser->encoding->char_width(parser->current.end, parser->end - parser->current.end);
// Ternary operators can have a ? immediately followed by an identifier which starts with // Ternary operators can have a ? immediately followed by an identifier
// an underscore. We check for this case // which starts with an underscore. We check for this case here.
if ( if (
!(parser->encoding.alnum_char(parser->current.end, parser->end - parser->current.end) || !(parser->encoding->alnum_char(parser->current.end, parser->end - parser->current.end) || peek(parser) == '_') ||
peek(parser) == '_') ||
( (
(parser->current.end + encoding_width >= parser->end) || (parser->current.end + encoding_width >= parser->end) ||
!char_is_identifier(parser, parser->current.end + encoding_width) !char_is_identifier(parser, parser->current.end + encoding_width)
@ -8491,6 +8522,7 @@ parser_lex(pm_parser_t *parser) {
// TODO: handle unterminated heredoc // TODO: handle unterminated heredoc
} }
parser->explicit_encoding = NULL;
lex_mode_push(parser, (pm_lex_mode_t) { lex_mode_push(parser, (pm_lex_mode_t) {
.mode = PM_LEX_HEREDOC, .mode = PM_LEX_HEREDOC,
.as.heredoc = { .as.heredoc = {
@ -8897,7 +8929,7 @@ parser_lex(pm_parser_t *parser) {
(lex_state_p(parser, PM_LEX_STATE_FITEM) && (peek(parser) == 's')) || (lex_state_p(parser, PM_LEX_STATE_FITEM) && (peek(parser) == 's')) ||
lex_state_spcarg_p(parser, space_seen) lex_state_spcarg_p(parser, space_seen)
) { ) {
if (!parser->encoding.alnum_char(parser->current.end, parser->end - parser->current.end)) { if (!parser->encoding->alnum_char(parser->current.end, parser->end - parser->current.end)) {
if (*parser->current.end >= 0x80) { if (*parser->current.end >= 0x80) {
pm_parser_err_current(parser, PM_ERR_INVALID_PERCENT); pm_parser_err_current(parser, PM_ERR_INVALID_PERCENT);
} }
@ -8920,7 +8952,7 @@ parser_lex(pm_parser_t *parser) {
// Delimiters for %-literals cannot be alphanumeric. We // Delimiters for %-literals cannot be alphanumeric. We
// validate that here. // validate that here.
uint8_t delimiter = peek_offset(parser, 1); uint8_t delimiter = peek_offset(parser, 1);
if (delimiter >= 0x80 || parser->encoding.alnum_char(&delimiter, 1)) { if (delimiter >= 0x80 || parser->encoding->alnum_char(&delimiter, 1)) {
pm_parser_err_current(parser, PM_ERR_INVALID_PERCENT); pm_parser_err_current(parser, PM_ERR_INVALID_PERCENT);
goto lex_next_token; goto lex_next_token;
} }
@ -9766,7 +9798,6 @@ parser_lex(pm_parser_t *parser) {
if (current_token_starts_line(parser)) { if (current_token_starts_line(parser)) {
const uint8_t *start = parser->current.start; const uint8_t *start = parser->current.start;
if (start + ident_length <= parser->end) { if (start + ident_length <= parser->end) {
bool at_end = false;
const uint8_t *newline = next_newline(start, parser->end - start); const uint8_t *newline = next_newline(start, parser->end - start);
const uint8_t *ident_end = newline; const uint8_t *ident_end = newline;
const uint8_t *terminator_end = newline; const uint8_t *terminator_end = newline;
@ -9774,7 +9805,6 @@ parser_lex(pm_parser_t *parser) {
if (newline == NULL) { if (newline == NULL) {
terminator_end = parser->end; terminator_end = parser->end;
ident_end = parser->end; ident_end = parser->end;
at_end = true;
} else { } else {
terminator_end++; terminator_end++;
if (newline[-1] == '\r') { if (newline[-1] == '\r') {
@ -9801,6 +9831,7 @@ parser_lex(pm_parser_t *parser) {
if (newline != NULL) { if (newline != NULL) {
pm_newline_list_append(&parser->newline_list, newline); pm_newline_list_append(&parser->newline_list, newline);
} }
parser->current.end = terminator_end; parser->current.end = terminator_end;
if (*lex_mode->as.heredoc.next_start == '\\') { if (*lex_mode->as.heredoc.next_start == '\\') {
parser->next_start = NULL; parser->next_start = NULL;
@ -9809,14 +9840,11 @@ parser_lex(pm_parser_t *parser) {
parser->heredoc_end = parser->current.end; parser->heredoc_end = parser->current.end;
} }
parser->current_string_common_whitespace = parser->lex_modes.current->as.heredoc.common_whitespace; lex_state_set(parser, PM_LEX_STATE_END);
lex_mode_pop(parser);
if (!at_end) {
lex_state_set(parser, PM_LEX_STATE_END);
}
LEX(PM_TOKEN_HEREDOC_END); LEX(PM_TOKEN_HEREDOC_END);
} }
} }
size_t whitespace = pm_heredoc_strspn_inline_whitespace(parser, &start, lex_mode->as.heredoc.indent); size_t whitespace = pm_heredoc_strspn_inline_whitespace(parser, &start, lex_mode->as.heredoc.indent);
if ( if (
lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_TILDE && lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_TILDE &&
@ -10588,7 +10616,7 @@ parse_target(pm_parser_t *parser, pm_node_t *target) {
return target; return target;
} }
if (*call->message_loc.start == '_' || parser->encoding.alnum_char(call->message_loc.start, call->message_loc.end - call->message_loc.start)) { if (*call->message_loc.start == '_' || parser->encoding->alnum_char(call->message_loc.start, call->message_loc.end - call->message_loc.start)) {
parse_write_name(parser, &call->name); parse_write_name(parser, &call->name);
return (pm_node_t *) call; return (pm_node_t *) call;
} }
@ -10735,7 +10763,7 @@ parse_write(pm_parser_t *parser, pm_node_t *target, pm_token_t *operator, pm_nod
return target; return target;
} }
if (*call->message_loc.start == '_' || parser->encoding.alnum_char(call->message_loc.start, call->message_loc.end - call->message_loc.start)) { if (*call->message_loc.start == '_' || parser->encoding->alnum_char(call->message_loc.start, call->message_loc.end - call->message_loc.start)) {
// When we get here, we have a method call, because it was // When we get here, we have a method call, because it was
// previously marked as a method call but now we have an =. This // previously marked as a method call but now we have an =. This
// looks like: // looks like:
@ -10970,7 +10998,7 @@ parse_assocs(pm_parser_t *parser, pm_node_t *node) {
if (token_begins_expression_p(parser->current.type)) { if (token_begins_expression_p(parser->current.type)) {
value = parse_value_expression(parser, PM_BINDING_POWER_DEFINED, false, PM_ERR_HASH_EXPRESSION_AFTER_LABEL); value = parse_value_expression(parser, PM_BINDING_POWER_DEFINED, false, PM_ERR_HASH_EXPRESSION_AFTER_LABEL);
} else { } else {
if (parser->encoding.isupper_char(label.start, (label.end - 1) - label.start)) { if (parser->encoding->isupper_char(label.start, (label.end - 1) - label.start)) {
pm_token_t constant = { .type = PM_TOKEN_CONSTANT, .start = label.start, .end = label.end - 1 }; pm_token_t constant = { .type = PM_TOKEN_CONSTANT, .start = label.start, .end = label.end - 1 };
value = (pm_node_t *) pm_constant_read_node_create(parser, &constant); value = (pm_node_t *) pm_constant_read_node_create(parser, &constant);
} else { } else {
@ -12239,6 +12267,26 @@ parse_conditional(pm_parser_t *parser, pm_context_t context) {
case PM_INSTANCE_VARIABLE_READ_NODE: case PM_MULTI_TARGET_NODE: case PM_BACK_REFERENCE_READ_NODE: \ case PM_INSTANCE_VARIABLE_READ_NODE: case PM_MULTI_TARGET_NODE: case PM_BACK_REFERENCE_READ_NODE: \
case PM_NUMBERED_REFERENCE_READ_NODE case PM_NUMBERED_REFERENCE_READ_NODE
// Assert here that the flags are the same so that we can safely switch the type
// of the node without having to move the flags.
PM_STATIC_ASSERT(__LINE__, ((int) PM_STRING_FLAGS_FORCED_UTF8_ENCODING) == ((int) PM_ENCODING_FLAGS_FORCED_UTF8_ENCODING), "Expected the flags to match.");
/**
 * Compute the node flags that describe how the unescaped content of a string
 * should be encoded, based on the explicit encoding (if any) that was recorded
 * on the parser while lexing.
 *
 * @param parser The parser whose explicit_encoding and encoding fields are
 *     inspected. It is not modified.
 * @return PM_STRING_FLAGS_FORCED_UTF8_ENCODING when the explicit encoding is
 *     the UTF-8 entry, PM_STRING_FLAGS_FORCED_BINARY_ENCODING when an explicit
 *     encoding is set and the parser's encoding is the US-ASCII entry, and 0
 *     otherwise.
 */
static inline pm_node_flags_t
parse_unescaped_encoding(const pm_parser_t *parser) {
    // Nothing was explicitly set during lexing, so there is nothing to flag.
    if (parser->explicit_encoding == NULL) {
        return 0;
    }

    if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) {
        return PM_STRING_FLAGS_FORCED_UTF8_ENCODING;
    }

    // NOTE(review): this branch tests the parser's own encoding rather than
    // explicit_encoding; presumably intentional -- confirm against the lexer.
    if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY) {
        return PM_STRING_FLAGS_FORCED_BINARY_ENCODING;
    }

    return 0;
}
/** /**
* Parse a node that is part of a string. If the subsequent tokens cannot be * Parse a node that is part of a string. If the subsequent tokens cannot be
* parsed as a string part, then NULL is returned. * parsed as a string part, then NULL is returned.
@ -12255,7 +12303,9 @@ parse_string_part(pm_parser_t *parser) {
case PM_TOKEN_STRING_CONTENT: { case PM_TOKEN_STRING_CONTENT: {
pm_token_t opening = not_provided(parser); pm_token_t opening = not_provided(parser);
pm_token_t closing = not_provided(parser); pm_token_t closing = not_provided(parser);
pm_node_t *node = (pm_node_t *) pm_string_node_create_current_string(parser, &opening, &parser->current, &closing); pm_node_t *node = (pm_node_t *) pm_string_node_create_current_string(parser, &opening, &parser->current, &closing);
node->flags |= parse_unescaped_encoding(parser);
parser_lex(parser); parser_lex(parser);
return node; return node;
@ -13459,8 +13509,9 @@ parse_strings(pm_parser_t *parser, pm_node_t *current) {
// Here we have found a string literal. We'll parse it and add it to // Here we have found a string literal. We'll parse it and add it to
// the list of strings. // the list of strings.
assert(parser->lex_modes.current->mode == PM_LEX_STRING); const pm_lex_mode_t *lex_mode = parser->lex_modes.current;
bool lex_interpolation = parser->lex_modes.current->as.string.interpolation; assert(lex_mode->mode == PM_LEX_STRING);
bool lex_interpolation = lex_mode->as.string.interpolation;
pm_token_t opening = parser->current; pm_token_t opening = parser->current;
parser_lex(parser); parser_lex(parser);
@ -13544,6 +13595,7 @@ parse_strings(pm_parser_t *parser, pm_node_t *current) {
if (match1(parser, PM_TOKEN_STRING_END)) { if (match1(parser, PM_TOKEN_STRING_END)) {
node = (pm_node_t *) pm_string_node_create_unescaped(parser, &opening, &content, &parser->current, &unescaped); node = (pm_node_t *) pm_string_node_create_unescaped(parser, &opening, &content, &parser->current, &unescaped);
node->flags |= parse_unescaped_encoding(parser);
parser_lex(parser); parser_lex(parser);
} else if (accept1(parser, PM_TOKEN_LABEL_END)) { } else if (accept1(parser, PM_TOKEN_LABEL_END)) {
node = (pm_node_t *) pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped); node = (pm_node_t *) pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped);
@ -13555,6 +13607,7 @@ parse_strings(pm_parser_t *parser, pm_node_t *current) {
pm_token_t string_closing = not_provided(parser); pm_token_t string_closing = not_provided(parser);
pm_node_t *part = (pm_node_t *) pm_string_node_create_unescaped(parser, &string_opening, &parser->previous, &string_closing, &unescaped); pm_node_t *part = (pm_node_t *) pm_string_node_create_unescaped(parser, &string_opening, &parser->previous, &string_closing, &unescaped);
part->flags |= parse_unescaped_encoding(parser);
pm_node_list_append(&parts, part); pm_node_list_append(&parts, part);
while (!match3(parser, PM_TOKEN_STRING_END, PM_TOKEN_LABEL_END, PM_TOKEN_EOF)) { while (!match3(parser, PM_TOKEN_STRING_END, PM_TOKEN_LABEL_END, PM_TOKEN_EOF)) {
@ -13888,6 +13941,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
pm_token_t closing = not_provided(parser); pm_token_t closing = not_provided(parser);
pm_node_t *node = (pm_node_t *) pm_string_node_create_current_string(parser, &opening, &content, &closing); pm_node_t *node = (pm_node_t *) pm_string_node_create_current_string(parser, &opening, &content, &closing);
node->flags |= parse_unescaped_encoding(parser);
// Characters can be followed by strings in which case they are // Characters can be followed by strings in which case they are
// automatically concatenated. // automatically concatenated.
@ -14074,7 +14128,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
if (match2(parser, PM_TOKEN_HEREDOC_END, PM_TOKEN_EOF)) { if (match2(parser, PM_TOKEN_HEREDOC_END, PM_TOKEN_EOF)) {
// If we get here, then we have an empty heredoc. We'll create // If we get here, then we have an empty heredoc. We'll create
// an empty content token and return an empty string node. // an empty content token and return an empty string node.
lex_state_set(parser, PM_LEX_STATE_END); lex_mode_pop(parser);
expect1(parser, PM_TOKEN_HEREDOC_END, PM_ERR_HEREDOC_TERM); expect1(parser, PM_TOKEN_HEREDOC_END, PM_ERR_HEREDOC_TERM);
pm_token_t content = parse_strings_empty_content(parser->previous.start); pm_token_t content = parse_strings_empty_content(parser->previous.start);
@ -14095,6 +14149,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
// content and we're at the end of the heredoc, so we can return // content and we're at the end of the heredoc, so we can return
// just a string node with the heredoc opening and closing as // just a string node with the heredoc opening and closing as
// its opening and closing. // its opening and closing.
part->flags |= parse_unescaped_encoding(parser);
pm_string_node_t *cast = (pm_string_node_t *) part; pm_string_node_t *cast = (pm_string_node_t *) part;
cast->opening_loc = PM_LOCATION_TOKEN_VALUE(&opening); cast->opening_loc = PM_LOCATION_TOKEN_VALUE(&opening);
@ -14106,13 +14161,13 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
cast->base.type = PM_X_STRING_NODE; cast->base.type = PM_X_STRING_NODE;
} }
size_t common_whitespace = parser->current_string_common_whitespace; size_t common_whitespace = lex_mode->as.heredoc.common_whitespace;
if (indent == PM_HEREDOC_INDENT_TILDE && (common_whitespace != (size_t) -1) && (common_whitespace != 0)) { if (indent == PM_HEREDOC_INDENT_TILDE && (common_whitespace != (size_t) -1) && (common_whitespace != 0)) {
parse_heredoc_dedent_string(&cast->unescaped, common_whitespace); parse_heredoc_dedent_string(&cast->unescaped, common_whitespace);
} }
node = (pm_node_t *) cast; node = (pm_node_t *) cast;
lex_state_set(parser, PM_LEX_STATE_END); lex_mode_pop(parser);
expect1(parser, PM_TOKEN_HEREDOC_END, PM_ERR_HEREDOC_TERM); expect1(parser, PM_TOKEN_HEREDOC_END, PM_ERR_HEREDOC_TERM);
} else { } else {
// If we get here, then we have multiple parts in the heredoc, // If we get here, then we have multiple parts in the heredoc,
@ -14127,13 +14182,15 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
} }
} }
size_t common_whitespace = lex_mode->as.heredoc.common_whitespace;
// Now that we have all of the parts, create the correct type of // Now that we have all of the parts, create the correct type of
// interpolated node. // interpolated node.
if (quote == PM_HEREDOC_QUOTE_BACKTICK) { if (quote == PM_HEREDOC_QUOTE_BACKTICK) {
pm_interpolated_x_string_node_t *cast = pm_interpolated_xstring_node_create(parser, &opening, &opening); pm_interpolated_x_string_node_t *cast = pm_interpolated_xstring_node_create(parser, &opening, &opening);
cast->parts = parts; cast->parts = parts;
lex_state_set(parser, PM_LEX_STATE_END); lex_mode_pop(parser);
expect1(parser, PM_TOKEN_HEREDOC_END, PM_ERR_HEREDOC_TERM); expect1(parser, PM_TOKEN_HEREDOC_END, PM_ERR_HEREDOC_TERM);
pm_interpolated_xstring_node_closing_set(cast, &parser->previous); pm_interpolated_xstring_node_closing_set(cast, &parser->previous);
@ -14142,7 +14199,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
} else { } else {
pm_interpolated_string_node_t *cast = pm_interpolated_string_node_create(parser, &opening, &parts, &opening); pm_interpolated_string_node_t *cast = pm_interpolated_string_node_create(parser, &opening, &parts, &opening);
lex_state_set(parser, PM_LEX_STATE_END); lex_mode_pop(parser);
expect1(parser, PM_TOKEN_HEREDOC_END, PM_ERR_HEREDOC_TERM); expect1(parser, PM_TOKEN_HEREDOC_END, PM_ERR_HEREDOC_TERM);
pm_interpolated_string_node_closing_set(cast, &parser->previous); pm_interpolated_string_node_closing_set(cast, &parser->previous);
@ -14152,7 +14209,6 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
// If this is a heredoc that is indented with a ~, then we need // If this is a heredoc that is indented with a ~, then we need
// to dedent each line by the common leading whitespace. // to dedent each line by the common leading whitespace.
size_t common_whitespace = parser->current_string_common_whitespace;
if (indent == PM_HEREDOC_INDENT_TILDE && (common_whitespace != (size_t) -1) && (common_whitespace != 0)) { if (indent == PM_HEREDOC_INDENT_TILDE && (common_whitespace != (size_t) -1) && (common_whitespace != 0)) {
pm_node_list_t *nodes; pm_node_list_t *nodes;
if (quote == PM_HEREDOC_QUOTE_BACKTICK) { if (quote == PM_HEREDOC_QUOTE_BACKTICK) {
@ -15409,8 +15465,8 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
} else { } else {
expect1(parser, PM_TOKEN_STRING_END, PM_ERR_LIST_W_LOWER_TERM); expect1(parser, PM_TOKEN_STRING_END, PM_ERR_LIST_W_LOWER_TERM);
} }
pm_array_node_close_set(array, &closing);
pm_array_node_close_set(array, &closing);
return (pm_node_t *) array; return (pm_node_t *) array;
} }
case PM_TOKEN_PERCENT_UPPER_W: { case PM_TOKEN_PERCENT_UPPER_W: {
@ -15418,19 +15474,24 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
pm_token_t opening = parser->previous; pm_token_t opening = parser->previous;
pm_array_node_t *array = pm_array_node_create(parser, &opening); pm_array_node_t *array = pm_array_node_create(parser, &opening);
// This is the current node that we are parsing that will be added to the // This is the current node that we are parsing that will be added
// list of elements. // to the list of elements.
pm_node_t *current = NULL; pm_node_t *current = NULL;
while (!match2(parser, PM_TOKEN_STRING_END, PM_TOKEN_EOF)) { while (!match2(parser, PM_TOKEN_STRING_END, PM_TOKEN_EOF)) {
switch (parser->current.type) { switch (parser->current.type) {
case PM_TOKEN_WORDS_SEP: { case PM_TOKEN_WORDS_SEP: {
// Reset the explicit encoding if we hit a separator
// since each element can have its own encoding.
parser->explicit_encoding = NULL;
if (current == NULL) { if (current == NULL) {
// If we hit a separator before we have any content, then we don't // If we hit a separator before we have any content,
// need to do anything. // then we don't need to do anything.
} else { } else {
// If we hit a separator after we've hit content, then we need to // If we hit a separator after we've hit content,
// append that content to the list and reset the current node. // then we need to append that content to the list
// and reset the current node.
pm_array_node_elements_append(array, current); pm_array_node_elements_append(array, current);
current = NULL; current = NULL;
} }
@ -15443,22 +15504,25 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
pm_token_t closing = not_provided(parser); pm_token_t closing = not_provided(parser);
pm_node_t *string = (pm_node_t *) pm_string_node_create_current_string(parser, &opening, &parser->current, &closing); pm_node_t *string = (pm_node_t *) pm_string_node_create_current_string(parser, &opening, &parser->current, &closing);
string->flags |= parse_unescaped_encoding(parser);
parser_lex(parser); parser_lex(parser);
if (current == NULL) { if (current == NULL) {
// If we hit content and the current node is NULL, then this is // If we hit content and the current node is NULL,
// the first string content we've seen. In that case we're going // then this is the first string content we've seen.
// to create a new string node and set that to the current. // In that case we're going to create a new string
// node and set that to the current.
current = string; current = string;
} else if (PM_NODE_TYPE_P(current, PM_INTERPOLATED_STRING_NODE)) { } else if (PM_NODE_TYPE_P(current, PM_INTERPOLATED_STRING_NODE)) {
// If we hit string content and the current node is an // If we hit string content and the current node is
// interpolated string, then we need to append the string content // an interpolated string, then we need to append
// to the list of child nodes. // the string content to the list of child nodes.
pm_interpolated_string_node_append((pm_interpolated_string_node_t *) current, string); pm_interpolated_string_node_append((pm_interpolated_string_node_t *) current, string);
} else if (PM_NODE_TYPE_P(current, PM_STRING_NODE)) { } else if (PM_NODE_TYPE_P(current, PM_STRING_NODE)) {
// If we hit string content and the current node is a string node, // If we hit string content and the current node is
// then we need to convert the current node into an interpolated // a string node, then we need to convert the
// string and add the string content to the list of child nodes. // current node into an interpolated string and add
// the string content to the list of child nodes.
pm_interpolated_string_node_t *interpolated = pm_interpolated_string_node_create(parser, &opening, NULL, &closing); pm_interpolated_string_node_t *interpolated = pm_interpolated_string_node_create(parser, &opening, NULL, &closing);
pm_interpolated_string_node_append(interpolated, current); pm_interpolated_string_node_append(interpolated, current);
pm_interpolated_string_node_append(interpolated, string); pm_interpolated_string_node_append(interpolated, string);
@ -15471,24 +15535,27 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
} }
case PM_TOKEN_EMBVAR: { case PM_TOKEN_EMBVAR: {
if (current == NULL) { if (current == NULL) {
// If we hit an embedded variable and the current node is NULL, // If we hit an embedded variable and the current
// then this is the start of a new string. We'll set the current // node is NULL, then this is the start of a new
// node to a new interpolated string. // string. We'll set the current node to a new
// interpolated string.
pm_token_t opening = not_provided(parser); pm_token_t opening = not_provided(parser);
pm_token_t closing = not_provided(parser); pm_token_t closing = not_provided(parser);
current = (pm_node_t *) pm_interpolated_string_node_create(parser, &opening, NULL, &closing); current = (pm_node_t *) pm_interpolated_string_node_create(parser, &opening, NULL, &closing);
} else if (PM_NODE_TYPE_P(current, PM_STRING_NODE)) { } else if (PM_NODE_TYPE_P(current, PM_STRING_NODE)) {
// If we hit an embedded variable and the current node is a string // If we hit an embedded variable and the current
// node, then we'll convert the current into an interpolated // node is a string node, then we'll convert the
// string and add the string node to the list of parts. // current into an interpolated string and add the
// string node to the list of parts.
pm_token_t opening = not_provided(parser); pm_token_t opening = not_provided(parser);
pm_token_t closing = not_provided(parser); pm_token_t closing = not_provided(parser);
pm_interpolated_string_node_t *interpolated = pm_interpolated_string_node_create(parser, &opening, NULL, &closing); pm_interpolated_string_node_t *interpolated = pm_interpolated_string_node_create(parser, &opening, NULL, &closing);
pm_interpolated_string_node_append(interpolated, current); pm_interpolated_string_node_append(interpolated, current);
current = (pm_node_t *) interpolated; current = (pm_node_t *) interpolated;
} else { } else {
// If we hit an embedded variable and the current node is an // If we hit an embedded variable and the current
// interpolated string, then we'll just add the embedded variable. // node is an interpolated string, then we'll just
// add the embedded variable.
} }
pm_node_t *part = parse_string_part(parser); pm_node_t *part = parse_string_part(parser);
@ -15497,25 +15564,27 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
} }
case PM_TOKEN_EMBEXPR_BEGIN: { case PM_TOKEN_EMBEXPR_BEGIN: {
if (current == NULL) { if (current == NULL) {
// If we hit an embedded expression and the current node is NULL, // If we hit an embedded expression and the current
// then this is the start of a new string. We'll set the current // node is NULL, then this is the start of a new
// node to a new interpolated string. // string. We'll set the current node to a new
// interpolated string.
pm_token_t opening = not_provided(parser); pm_token_t opening = not_provided(parser);
pm_token_t closing = not_provided(parser); pm_token_t closing = not_provided(parser);
current = (pm_node_t *) pm_interpolated_string_node_create(parser, &opening, NULL, &closing); current = (pm_node_t *) pm_interpolated_string_node_create(parser, &opening, NULL, &closing);
} else if (PM_NODE_TYPE_P(current, PM_STRING_NODE)) { } else if (PM_NODE_TYPE_P(current, PM_STRING_NODE)) {
// If we hit an embedded expression and the current node is a // If we hit an embedded expression and the current
// string node, then we'll convert the current into an // node is a string node, then we'll convert the
// interpolated string and add the string node to the list of // current into an interpolated string and add the
// parts. // string node to the list of parts.
pm_token_t opening = not_provided(parser); pm_token_t opening = not_provided(parser);
pm_token_t closing = not_provided(parser); pm_token_t closing = not_provided(parser);
pm_interpolated_string_node_t *interpolated = pm_interpolated_string_node_create(parser, &opening, NULL, &closing); pm_interpolated_string_node_t *interpolated = pm_interpolated_string_node_create(parser, &opening, NULL, &closing);
pm_interpolated_string_node_append(interpolated, current); pm_interpolated_string_node_append(interpolated, current);
current = (pm_node_t *) interpolated; current = (pm_node_t *) interpolated;
} else if (PM_NODE_TYPE_P(current, PM_INTERPOLATED_STRING_NODE)) { } else if (PM_NODE_TYPE_P(current, PM_INTERPOLATED_STRING_NODE)) {
// If we hit an embedded expression and the current node is an // If we hit an embedded expression and the current
// interpolated string, then we'll just continue on. // node is an interpolated string, then we'll just
// continue on.
} else { } else {
assert(false && "unreachable"); assert(false && "unreachable");
} }
@ -15543,8 +15612,8 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
} else { } else {
expect1(parser, PM_TOKEN_STRING_END, PM_ERR_LIST_W_UPPER_TERM); expect1(parser, PM_TOKEN_STRING_END, PM_ERR_LIST_W_UPPER_TERM);
} }
pm_array_node_close_set(array, &closing);
pm_array_node_close_set(array, &closing);
return (pm_node_t *) array; return (pm_node_t *) array;
} }
case PM_TOKEN_REGEXP_BEGIN: { case PM_TOKEN_REGEXP_BEGIN: {
@ -15652,8 +15721,11 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
pm_token_t content = parser->current; pm_token_t content = parser->current;
parser_lex(parser); parser_lex(parser);
if (accept1(parser, PM_TOKEN_STRING_END)) { if (match1(parser, PM_TOKEN_STRING_END)) {
return (pm_node_t *) pm_xstring_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped); pm_node_t *node = (pm_node_t *) pm_xstring_node_create_unescaped(parser, &opening, &content, &parser->current, &unescaped);
node->flags |= parse_unescaped_encoding(parser);
parser_lex(parser);
return node;
} }
// If we get here, then we have interpolation so we'll need to // If we get here, then we have interpolation so we'll need to
@ -15662,7 +15734,9 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
pm_token_t opening = not_provided(parser); pm_token_t opening = not_provided(parser);
pm_token_t closing = not_provided(parser); pm_token_t closing = not_provided(parser);
pm_node_t *part = (pm_node_t *) pm_string_node_create_unescaped(parser, &opening, &parser->previous, &closing, &unescaped); pm_node_t *part = (pm_node_t *) pm_string_node_create_unescaped(parser, &opening, &parser->previous, &closing, &unescaped);
part->flags |= parse_unescaped_encoding(parser);
pm_interpolated_xstring_node_append(node, part); pm_interpolated_xstring_node_append(node, part);
} else { } else {
@ -15986,7 +16060,7 @@ parse_regular_expression_named_captures(pm_parser_t *parser, const pm_string_t *
pm_string_list_t named_captures = { 0 }; pm_string_list_t named_captures = { 0 };
pm_node_t *result; pm_node_t *result;
if (pm_regexp_named_capture_group_names(pm_string_source(content), pm_string_length(content), &named_captures, parser->encoding_changed, &parser->encoding) && (named_captures.length > 0)) { if (pm_regexp_named_capture_group_names(pm_string_source(content), pm_string_length(content), &named_captures, parser->encoding_changed, parser->encoding) && (named_captures.length > 0)) {
// Since we should not create a MatchWriteNode when all capture names // Since we should not create a MatchWriteNode when all capture names
// are invalid, creating a MatchWriteNode is delayed here. // are invalid, creating a MatchWriteNode is delayed here.
pm_match_write_node_t *match = NULL; pm_match_write_node_t *match = NULL;
@ -17004,7 +17078,7 @@ pm_parser_init(pm_parser_t *parser, const uint8_t *source, size_t size, const pm
.error_list = { 0 }, .error_list = { 0 },
.current_scope = NULL, .current_scope = NULL,
.current_context = NULL, .current_context = NULL,
.encoding = *pm_encoding_utf_8, .encoding = PM_ENCODING_UTF_8_ENTRY,
.encoding_changed_callback = NULL, .encoding_changed_callback = NULL,
.encoding_comment_start = source, .encoding_comment_start = source,
.lex_callback = NULL, .lex_callback = NULL,
@ -17014,6 +17088,7 @@ pm_parser_init(pm_parser_t *parser, const uint8_t *source, size_t size, const pm
.integer_base = 0, .integer_base = 0,
.current_string = PM_STRING_EMPTY, .current_string = PM_STRING_EMPTY,
.start_line = 1, .start_line = 1,
.explicit_encoding = NULL,
.command_start = true, .command_start = true,
.recovering = false, .recovering = false,
.encoding_changed = false, .encoding_changed = false,
@ -17240,7 +17315,7 @@ pm_serialize_parse_comments(pm_buffer_t *buffer, const uint8_t *source, size_t s
pm_node_t *node = pm_parse(&parser); pm_node_t *node = pm_parse(&parser);
pm_serialize_header(buffer); pm_serialize_header(buffer);
pm_serialize_encoding(&parser.encoding, buffer); pm_serialize_encoding(parser.encoding, buffer);
pm_buffer_append_varsint(buffer, parser.start_line); pm_buffer_append_varsint(buffer, parser.start_line);
pm_serialize_comment_list(&parser, &parser.comment_list, buffer); pm_serialize_comment_list(&parser, &parser.comment_list, buffer);

Просмотреть файл

@ -91,7 +91,7 @@ void pm_serialize_comment_list(pm_parser_t *parser, pm_list_t *list, pm_buffer_t
* @param encoding The encoding to serialize. * @param encoding The encoding to serialize.
* @param buffer The buffer to serialize to. * @param buffer The buffer to serialize to.
*/ */
void pm_serialize_encoding(pm_encoding_t *encoding, pm_buffer_t *buffer); void pm_serialize_encoding(const pm_encoding_t *encoding, pm_buffer_t *buffer);
/** /**
* Serialize the encoding, metadata, nodes, and constant pool. * Serialize the encoding, metadata, nodes, and constant pool.

Просмотреть файл

@ -206,7 +206,7 @@ pm_serialize_diagnostic_list(pm_parser_t *parser, pm_list_t *list, pm_buffer_t *
* Serialize the name of the encoding to the buffer. * Serialize the name of the encoding to the buffer.
*/ */
void void
pm_serialize_encoding(pm_encoding_t *encoding, pm_buffer_t *buffer) { pm_serialize_encoding(const pm_encoding_t *encoding, pm_buffer_t *buffer) {
size_t encoding_length = strlen(encoding->name); size_t encoding_length = strlen(encoding->name);
pm_buffer_append_varuint(buffer, pm_sizet_to_u32(encoding_length)); pm_buffer_append_varuint(buffer, pm_sizet_to_u32(encoding_length));
pm_buffer_append_string(buffer, encoding->name, encoding_length); pm_buffer_append_string(buffer, encoding->name, encoding_length);
@ -218,7 +218,7 @@ pm_serialize_encoding(pm_encoding_t *encoding, pm_buffer_t *buffer) {
*/ */
void void
pm_serialize_content(pm_parser_t *parser, pm_node_t *node, pm_buffer_t *buffer) { pm_serialize_content(pm_parser_t *parser, pm_node_t *node, pm_buffer_t *buffer) {
pm_serialize_encoding(&parser->encoding, buffer); pm_serialize_encoding(parser->encoding, buffer);
pm_buffer_append_varsint(buffer, parser->start_line); pm_buffer_append_varsint(buffer, parser->start_line);
<%- unless Prism::SERIALIZE_ONLY_SEMANTICS_FIELDS -%> <%- unless Prism::SERIALIZE_ONLY_SEMANTICS_FIELDS -%>
pm_serialize_comment_list(parser, &parser->comment_list, buffer); pm_serialize_comment_list(parser, &parser->comment_list, buffer);
@ -317,7 +317,7 @@ pm_serialize_lex(pm_buffer_t *buffer, const uint8_t *source, size_t size, const
// Append 0 to mark end of tokens. // Append 0 to mark end of tokens.
pm_buffer_append_byte(buffer, 0); pm_buffer_append_byte(buffer, 0);
pm_serialize_encoding(&parser.encoding, buffer); pm_serialize_encoding(parser.encoding, buffer);
pm_buffer_append_varsint(buffer, parser.start_line); pm_buffer_append_varsint(buffer, parser.start_line);
pm_serialize_comment_list(&parser, &parser.comment_list, buffer); pm_serialize_comment_list(&parser, &parser.comment_list, buffer);
pm_serialize_magic_comment_list(&parser, &parser.magic_comment_list, buffer); pm_serialize_magic_comment_list(&parser, &parser.magic_comment_list, buffer);

Просмотреть файл

@ -4,7 +4,7 @@
* This is the slow path that does care about the encoding. * This is the slow path that does care about the encoding.
*/ */
static inline const uint8_t * static inline const uint8_t *
pm_strpbrk_multi_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum) { pm_strpbrk_multi_byte(const pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum) {
size_t index = 0; size_t index = 0;
while (index < maximum) { while (index < maximum) {
@ -12,7 +12,7 @@ pm_strpbrk_multi_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t
return source + index; return source + index;
} }
size_t width = parser->encoding.char_width(source + index, (ptrdiff_t) (maximum - index)); size_t width = parser->encoding->char_width(source + index, (ptrdiff_t) (maximum - index));
if (width == 0) { if (width == 0) {
return NULL; return NULL;
} }
@ -61,10 +61,10 @@ pm_strpbrk_single_byte(const uint8_t *source, const uint8_t *charset, size_t max
* need to take a slower path and iterate one multi-byte character at a time. * need to take a slower path and iterate one multi-byte character at a time.
*/ */
const uint8_t * const uint8_t *
pm_strpbrk(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, ptrdiff_t length) { pm_strpbrk(const pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, ptrdiff_t length) {
if (length <= 0) { if (length <= 0) {
return NULL; return NULL;
} else if (parser->encoding_changed && parser->encoding.multibyte) { } else if (parser->encoding_changed && parser->encoding->multibyte) {
return pm_strpbrk_multi_byte(parser, source, charset, (size_t) length); return pm_strpbrk_multi_byte(parser, source, charset, (size_t) length);
} else { } else {
return pm_strpbrk_single_byte(source, charset, (size_t) length); return pm_strpbrk_single_byte(source, charset, (size_t) length);

Просмотреть файл

@ -32,12 +32,12 @@
* need to take a slower path and iterate one multi-byte character at a time. * need to take a slower path and iterate one multi-byte character at a time.
* *
* @param parser The parser. * @param parser The parser.
* @param source The source string. * @param source The source to search.
* @param charset The charset to search for. * @param charset The charset to search for.
* @param length The maximum length to search. * @param length The maximum number of bytes to search.
* @return A pointer to the first character in the source string that is in the * @return A pointer to the first character in the source string that is in the
* charset, or NULL if no such character exists. * charset, or NULL if no such character exists.
*/ */
const uint8_t * pm_strpbrk(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, ptrdiff_t length); const uint8_t * pm_strpbrk(const pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, ptrdiff_t length);
#endif #endif

Просмотреть файл

@ -7,90 +7,16 @@ require_relative "test_helper"
module Prism module Prism
class EncodingTest < TestCase class EncodingTest < TestCase
codepoints_1byte = 0...0x100 codepoints_1byte = 0...0x100
codepoints_2bytes = 0...0x10000
encodings = { encodings = {
Encoding::ASCII => codepoints_1byte, Encoding::ASCII_8BIT => codepoints_1byte,
Encoding::ASCII_8BIT => codepoints_1byte, Encoding::US_ASCII => codepoints_1byte,
Encoding::CP850 => codepoints_1byte, Encoding::Windows_1253 => codepoints_1byte
Encoding::CP852 => codepoints_1byte,
Encoding::CP855 => codepoints_1byte,
Encoding::GB1988 => codepoints_1byte,
Encoding::IBM437 => codepoints_1byte,
Encoding::IBM720 => codepoints_1byte,
Encoding::IBM737 => codepoints_1byte,
Encoding::IBM775 => codepoints_1byte,
Encoding::IBM852 => codepoints_1byte,
Encoding::IBM855 => codepoints_1byte,
Encoding::IBM857 => codepoints_1byte,
Encoding::IBM860 => codepoints_1byte,
Encoding::IBM861 => codepoints_1byte,
Encoding::IBM862 => codepoints_1byte,
Encoding::IBM863 => codepoints_1byte,
Encoding::IBM864 => codepoints_1byte,
Encoding::IBM865 => codepoints_1byte,
Encoding::IBM866 => codepoints_1byte,
Encoding::IBM869 => codepoints_1byte,
Encoding::ISO_8859_1 => codepoints_1byte,
Encoding::ISO_8859_2 => codepoints_1byte,
Encoding::ISO_8859_3 => codepoints_1byte,
Encoding::ISO_8859_4 => codepoints_1byte,
Encoding::ISO_8859_5 => codepoints_1byte,
Encoding::ISO_8859_6 => codepoints_1byte,
Encoding::ISO_8859_7 => codepoints_1byte,
Encoding::ISO_8859_8 => codepoints_1byte,
Encoding::ISO_8859_9 => codepoints_1byte,
Encoding::ISO_8859_10 => codepoints_1byte,
Encoding::ISO_8859_11 => codepoints_1byte,
Encoding::ISO_8859_13 => codepoints_1byte,
Encoding::ISO_8859_14 => codepoints_1byte,
Encoding::ISO_8859_15 => codepoints_1byte,
Encoding::ISO_8859_16 => codepoints_1byte,
Encoding::KOI8_R => codepoints_1byte,
Encoding::KOI8_U => codepoints_1byte,
Encoding::MACCENTEURO => codepoints_1byte,
Encoding::MACCROATIAN => codepoints_1byte,
Encoding::MACCYRILLIC => codepoints_1byte,
Encoding::MACGREEK => codepoints_1byte,
Encoding::MACICELAND => codepoints_1byte,
Encoding::MACROMAN => codepoints_1byte,
Encoding::MACROMANIA => codepoints_1byte,
Encoding::MACTHAI => codepoints_1byte,
Encoding::MACTURKISH => codepoints_1byte,
Encoding::MACUKRAINE => codepoints_1byte,
Encoding::TIS_620 => codepoints_1byte,
Encoding::Windows_1250 => codepoints_1byte,
Encoding::Windows_1251 => codepoints_1byte,
Encoding::Windows_1252 => codepoints_1byte,
Encoding::Windows_1253 => codepoints_1byte,
Encoding::Windows_1254 => codepoints_1byte,
Encoding::Windows_1255 => codepoints_1byte,
Encoding::Windows_1256 => codepoints_1byte,
Encoding::Windows_1257 => codepoints_1byte,
Encoding::Windows_1258 => codepoints_1byte,
Encoding::Windows_874 => codepoints_1byte,
Encoding::Big5 => codepoints_2bytes,
Encoding::Big5_HKSCS => codepoints_2bytes,
Encoding::Big5_UAO => codepoints_2bytes,
Encoding::CP949 => codepoints_2bytes,
Encoding::CP950 => codepoints_2bytes,
Encoding::CP951 => codepoints_2bytes,
Encoding::EUC_KR => codepoints_2bytes,
Encoding::GBK => codepoints_2bytes,
Encoding::GB12345 => codepoints_2bytes,
Encoding::GB2312 => codepoints_2bytes,
Encoding::MACJAPANESE => codepoints_2bytes,
Encoding::Shift_JIS => codepoints_2bytes,
Encoding::SJIS_DoCoMo => codepoints_2bytes,
Encoding::SJIS_KDDI => codepoints_2bytes,
Encoding::SJIS_SoftBank => codepoints_2bytes,
Encoding::Windows_31J => codepoints_2bytes
} }
# By default we don't test every codepoint in these encodings because they # By default we don't test every codepoint in these encodings because it
# are 3 and 4 byte representations so it can drastically slow down the test # takes a very long time.
# suite.
if ENV["PRISM_TEST_ALL_ENCODINGS"] if ENV["PRISM_TEST_ALL_ENCODINGS"]
codepoints_2bytes = 0...0x10000
codepoints_unicode = (0...0x110000) codepoints_unicode = (0...0x110000)
codepoints_eucjp = [ codepoints_eucjp = [
@ -118,6 +44,78 @@ module Prism
] ]
encodings.merge!( encodings.merge!(
Encoding::CP850 => codepoints_1byte,
Encoding::CP852 => codepoints_1byte,
Encoding::CP855 => codepoints_1byte,
Encoding::GB1988 => codepoints_1byte,
Encoding::IBM437 => codepoints_1byte,
Encoding::IBM720 => codepoints_1byte,
Encoding::IBM737 => codepoints_1byte,
Encoding::IBM775 => codepoints_1byte,
Encoding::IBM852 => codepoints_1byte,
Encoding::IBM855 => codepoints_1byte,
Encoding::IBM857 => codepoints_1byte,
Encoding::IBM860 => codepoints_1byte,
Encoding::IBM861 => codepoints_1byte,
Encoding::IBM862 => codepoints_1byte,
Encoding::IBM863 => codepoints_1byte,
Encoding::IBM864 => codepoints_1byte,
Encoding::IBM865 => codepoints_1byte,
Encoding::IBM866 => codepoints_1byte,
Encoding::IBM869 => codepoints_1byte,
Encoding::ISO_8859_1 => codepoints_1byte,
Encoding::ISO_8859_2 => codepoints_1byte,
Encoding::ISO_8859_3 => codepoints_1byte,
Encoding::ISO_8859_4 => codepoints_1byte,
Encoding::ISO_8859_5 => codepoints_1byte,
Encoding::ISO_8859_6 => codepoints_1byte,
Encoding::ISO_8859_7 => codepoints_1byte,
Encoding::ISO_8859_8 => codepoints_1byte,
Encoding::ISO_8859_9 => codepoints_1byte,
Encoding::ISO_8859_10 => codepoints_1byte,
Encoding::ISO_8859_11 => codepoints_1byte,
Encoding::ISO_8859_13 => codepoints_1byte,
Encoding::ISO_8859_14 => codepoints_1byte,
Encoding::ISO_8859_15 => codepoints_1byte,
Encoding::ISO_8859_16 => codepoints_1byte,
Encoding::KOI8_R => codepoints_1byte,
Encoding::KOI8_U => codepoints_1byte,
Encoding::MACCENTEURO => codepoints_1byte,
Encoding::MACCROATIAN => codepoints_1byte,
Encoding::MACCYRILLIC => codepoints_1byte,
Encoding::MACGREEK => codepoints_1byte,
Encoding::MACICELAND => codepoints_1byte,
Encoding::MACROMAN => codepoints_1byte,
Encoding::MACROMANIA => codepoints_1byte,
Encoding::MACTHAI => codepoints_1byte,
Encoding::MACTURKISH => codepoints_1byte,
Encoding::MACUKRAINE => codepoints_1byte,
Encoding::TIS_620 => codepoints_1byte,
Encoding::Windows_1250 => codepoints_1byte,
Encoding::Windows_1251 => codepoints_1byte,
Encoding::Windows_1252 => codepoints_1byte,
Encoding::Windows_1254 => codepoints_1byte,
Encoding::Windows_1255 => codepoints_1byte,
Encoding::Windows_1256 => codepoints_1byte,
Encoding::Windows_1257 => codepoints_1byte,
Encoding::Windows_1258 => codepoints_1byte,
Encoding::Windows_874 => codepoints_1byte,
Encoding::Big5 => codepoints_2bytes,
Encoding::Big5_HKSCS => codepoints_2bytes,
Encoding::Big5_UAO => codepoints_2bytes,
Encoding::CP949 => codepoints_2bytes,
Encoding::CP950 => codepoints_2bytes,
Encoding::CP951 => codepoints_2bytes,
Encoding::EUC_KR => codepoints_2bytes,
Encoding::GBK => codepoints_2bytes,
Encoding::GB12345 => codepoints_2bytes,
Encoding::GB2312 => codepoints_2bytes,
Encoding::MACJAPANESE => codepoints_2bytes,
Encoding::Shift_JIS => codepoints_2bytes,
Encoding::SJIS_DoCoMo => codepoints_2bytes,
Encoding::SJIS_KDDI => codepoints_2bytes,
Encoding::SJIS_SoftBank => codepoints_2bytes,
Encoding::Windows_31J => codepoints_2bytes,
Encoding::UTF_8 => codepoints_unicode, Encoding::UTF_8 => codepoints_unicode,
Encoding::UTF8_MAC => codepoints_unicode, Encoding::UTF8_MAC => codepoints_unicode,
Encoding::UTF8_DoCoMo => codepoints_unicode, Encoding::UTF8_DoCoMo => codepoints_unicode,
@ -136,6 +134,8 @@ module Prism
) )
end end
# These test that we're correctly parsing codepoints for each alias of each
# encoding that prism supports.
encodings.each do |encoding, range| encodings.each do |encoding, range|
encoding.names.each do |name| encoding.names.each do |name|
next if name == "locale" next if name == "locale"
@ -146,6 +146,17 @@ module Prism
end end
end end
# Generate one test per encoding in the table above to check the encoding
# flags that prism sets on string nodes. Each test is given every escape
# sequence below on its own, plus every ordered pair of them joined together.
single_escapes = ["\\x00", "\\x7F", "\\x80", "\\xFF", "\\u{00}", "\\u{7F}", "\\u{80}", "\\M-\\C-?"]
escapes = single_escapes + single_escapes.product(single_escapes).map(&:join)

encodings.each_key do |encoding|
  define_method(:"test_encoding_flags_#{encoding.name}") { assert_encoding_flags(encoding, escapes) }
end
end
def test_coding def test_coding
result = Prism.parse("# coding: utf-8\n'string'") result = Prism.parse("# coding: utf-8\n'string'")
actual = result.value.statements.body.first.unescaped.encoding actual = result.value.statements.body.first.unescaped.encoding
@ -292,5 +303,47 @@ module Prism
refute Prism.parse(source).success? refute Prism.parse(source).success?
end end
end end
# For each escape sequence in +escapes+, builds a source file made of an
# "# encoding:" magic comment naming +encoding+ followed by a double-quoted
# string literal containing the escape, then asserts that prism and the
# running Ruby agree about it.
#
# The expected value comes from evaluating the source: either the encoding of
# the resulting string, or (when evaluation raises a SyntaxError mentioning
# "UTF-8 mixed within") the portion of that message captured after ": " on
# its first line. Any other SyntaxError is re-raised.
#
# The actual value comes from Prism.parse: on success it is UTF-8 or
# ASCII-8BIT when the string node reports the matching forced-encoding flag,
# and +encoding+ otherwise; on failure it is the first error's message when
# that message mentions "mixed", and any other parse error is raised.
def assert_encoding_flags(encoding, escapes)
  escapes.each do |escaped|
    source = "# encoding: #{encoding.name}\n\"#{escaped}\""

    expected =
      begin
        eval(source).encoding
      rescue SyntaxError => error
        # Only the mixed-encoding error is an expected outcome.
        raise unless error.message.include?("UTF-8 mixed within")
        error.message[/: (.+?)\n/, 1]
      end

    result = Prism.parse(source)

    actual =
      if result.success?
        node = result.value.statements.body.first

        if node.forced_utf8_encoding?
          Encoding::UTF_8
        elsif node.forced_binary_encoding?
          Encoding::ASCII_8BIT
        else
          encoding
        end
      else
        first_error = result.errors.first
        raise first_error.message unless first_error.message.include?("mixed")
        first_error.message
      end

    assert_equal expected, actual
  end
end
end end
end end

Просмотреть файл

@ -659,7 +659,7 @@ module Prism
end end
def test_do_not_allow_multiple_codepoints_in_a_single_character_literal def test_do_not_allow_multiple_codepoints_in_a_single_character_literal
expected = StringNode(0, Location(), Location(), nil, "\u0001\u0002") expected = StringNode(StringFlags::FORCED_UTF8_ENCODING, Location(), Location(), nil, "\u0001\u0002")
assert_errors expected, '?\u{0001 0002}', [ assert_errors expected, '?\u{0001 0002}', [
["invalid Unicode escape sequence; multiple codepoints are not allowed in a character literal", 9..12] ["invalid Unicode escape sequence; multiple codepoints are not allowed in a character literal", 9..12]

Просмотреть файл

@ -796,6 +796,7 @@
│ ├── closing_loc: (64,16)-(64,17) = "#" │ ├── closing_loc: (64,16)-(64,17) = "#"
│ └── flags: ∅ │ └── flags: ∅
├── @ XStringNode (location: (66,0)-(66,17)) ├── @ XStringNode (location: (66,0)-(66,17))
│ ├── flags: ∅
│ ├── opening_loc: (66,0)-(66,3) = "%x#" │ ├── opening_loc: (66,0)-(66,3) = "%x#"
│ ├── content_loc: (66,3)-(66,16) = "one two three" │ ├── content_loc: (66,3)-(66,16) = "one two three"
│ ├── closing_loc: (66,16)-(66,17) = "#" │ ├── closing_loc: (66,16)-(66,17) = "#"
@ -844,6 +845,7 @@
│ ├── closing_loc: (71,16)-(71,17) = "@" │ ├── closing_loc: (71,16)-(71,17) = "@"
│ └── flags: ∅ │ └── flags: ∅
├── @ XStringNode (location: (73,0)-(73,17)) ├── @ XStringNode (location: (73,0)-(73,17))
│ ├── flags: ∅
│ ├── opening_loc: (73,0)-(73,3) = "%x@" │ ├── opening_loc: (73,0)-(73,3) = "%x@"
│ ├── content_loc: (73,3)-(73,16) = "one two three" │ ├── content_loc: (73,3)-(73,16) = "one two three"
│ ├── closing_loc: (73,16)-(73,17) = "@" │ ├── closing_loc: (73,16)-(73,17) = "@"
@ -892,6 +894,7 @@
│ ├── closing_loc: (78,16)-(78,17) = "}" │ ├── closing_loc: (78,16)-(78,17) = "}"
│ └── flags: ∅ │ └── flags: ∅
├── @ XStringNode (location: (80,0)-(80,17)) ├── @ XStringNode (location: (80,0)-(80,17))
│ ├── flags: ∅
│ ├── opening_loc: (80,0)-(80,3) = "%x{" │ ├── opening_loc: (80,0)-(80,3) = "%x{"
│ ├── content_loc: (80,3)-(80,16) = "one two three" │ ├── content_loc: (80,3)-(80,16) = "one two three"
│ ├── closing_loc: (80,16)-(80,17) = "}" │ ├── closing_loc: (80,16)-(80,17) = "}"

Просмотреть файл

@ -178,6 +178,7 @@
│ │ └── flags: variable_call │ │ └── flags: variable_call
│ ├── pattern: │ ├── pattern:
│ │ @ XStringNode (location: (10,7)-(10,12)) │ │ @ XStringNode (location: (10,7)-(10,12))
│ │ ├── flags: ∅
│ │ ├── opening_loc: (10,7)-(10,8) = "`" │ │ ├── opening_loc: (10,7)-(10,8) = "`"
│ │ ├── content_loc: (10,8)-(10,11) = "foo" │ │ ├── content_loc: (10,8)-(10,11) = "foo"
│ │ ├── closing_loc: (10,11)-(10,12) = "`" │ │ ├── closing_loc: (10,11)-(10,12) = "`"
@ -197,6 +198,7 @@
│ │ └── flags: variable_call │ │ └── flags: variable_call
│ ├── pattern: │ ├── pattern:
│ │ @ XStringNode (location: (11,7)-(11,14)) │ │ @ XStringNode (location: (11,7)-(11,14))
│ │ ├── flags: ∅
│ │ ├── opening_loc: (11,7)-(11,10) = "%x[" │ │ ├── opening_loc: (11,7)-(11,10) = "%x["
│ │ ├── content_loc: (11,10)-(11,13) = "foo" │ │ ├── content_loc: (11,10)-(11,13) = "foo"
│ │ ├── closing_loc: (11,13)-(11,14) = "]" │ │ ├── closing_loc: (11,13)-(11,14) = "]"
@ -725,12 +727,14 @@
│ │ @ RangeNode (location: (36,7)-(36,21)) │ │ @ RangeNode (location: (36,7)-(36,21))
│ │ ├── left: │ │ ├── left:
│ │ │ @ XStringNode (location: (36,7)-(36,12)) │ │ │ @ XStringNode (location: (36,7)-(36,12))
│ │ │ ├── flags: ∅
│ │ │ ├── opening_loc: (36,7)-(36,8) = "`" │ │ │ ├── opening_loc: (36,7)-(36,8) = "`"
│ │ │ ├── content_loc: (36,8)-(36,11) = "foo" │ │ │ ├── content_loc: (36,8)-(36,11) = "foo"
│ │ │ ├── closing_loc: (36,11)-(36,12) = "`" │ │ │ ├── closing_loc: (36,11)-(36,12) = "`"
│ │ │ └── unescaped: "foo" │ │ │ └── unescaped: "foo"
│ │ ├── right: │ │ ├── right:
│ │ │ @ XStringNode (location: (36,16)-(36,21)) │ │ │ @ XStringNode (location: (36,16)-(36,21))
│ │ │ ├── flags: ∅
│ │ │ ├── opening_loc: (36,16)-(36,17) = "`" │ │ │ ├── opening_loc: (36,16)-(36,17) = "`"
│ │ │ ├── content_loc: (36,17)-(36,20) = "foo" │ │ │ ├── content_loc: (36,17)-(36,20) = "foo"
│ │ │ ├── closing_loc: (36,20)-(36,21) = "`" │ │ │ ├── closing_loc: (36,20)-(36,21) = "`"
@ -754,12 +758,14 @@
│ │ @ RangeNode (location: (37,7)-(37,25)) │ │ @ RangeNode (location: (37,7)-(37,25))
│ │ ├── left: │ │ ├── left:
│ │ │ @ XStringNode (location: (37,7)-(37,14)) │ │ │ @ XStringNode (location: (37,7)-(37,14))
│ │ │ ├── flags: ∅
│ │ │ ├── opening_loc: (37,7)-(37,10) = "%x[" │ │ │ ├── opening_loc: (37,7)-(37,10) = "%x["
│ │ │ ├── content_loc: (37,10)-(37,13) = "foo" │ │ │ ├── content_loc: (37,10)-(37,13) = "foo"
│ │ │ ├── closing_loc: (37,13)-(37,14) = "]" │ │ │ ├── closing_loc: (37,13)-(37,14) = "]"
│ │ │ └── unescaped: "foo" │ │ │ └── unescaped: "foo"
│ │ ├── right: │ │ ├── right:
│ │ │ @ XStringNode (location: (37,18)-(37,25)) │ │ │ @ XStringNode (location: (37,18)-(37,25))
│ │ │ ├── flags: ∅
│ │ │ ├── opening_loc: (37,18)-(37,21) = "%x[" │ │ │ ├── opening_loc: (37,18)-(37,21) = "%x["
│ │ │ ├── content_loc: (37,21)-(37,24) = "foo" │ │ │ ├── content_loc: (37,21)-(37,24) = "foo"
│ │ │ ├── closing_loc: (37,24)-(37,25) = "]" │ │ │ ├── closing_loc: (37,24)-(37,25) = "]"
@ -2483,6 +2489,7 @@
│ │ └── flags: variable_call │ │ └── flags: variable_call
│ ├── pattern: │ ├── pattern:
│ │ @ XStringNode (location: (109,7)-(109,12)) │ │ @ XStringNode (location: (109,7)-(109,12))
│ │ ├── flags: ∅
│ │ ├── opening_loc: (109,7)-(109,8) = "`" │ │ ├── opening_loc: (109,7)-(109,8) = "`"
│ │ ├── content_loc: (109,8)-(109,11) = "foo" │ │ ├── content_loc: (109,8)-(109,11) = "foo"
│ │ ├── closing_loc: (109,11)-(109,12) = "`" │ │ ├── closing_loc: (109,11)-(109,12) = "`"
@ -2502,6 +2509,7 @@
│ │ └── flags: variable_call │ │ └── flags: variable_call
│ ├── pattern: │ ├── pattern:
│ │ @ XStringNode (location: (110,7)-(110,14)) │ │ @ XStringNode (location: (110,7)-(110,14))
│ │ ├── flags: ∅
│ │ ├── opening_loc: (110,7)-(110,10) = "%x[" │ │ ├── opening_loc: (110,7)-(110,10) = "%x["
│ │ ├── content_loc: (110,10)-(110,13) = "foo" │ │ ├── content_loc: (110,10)-(110,13) = "foo"
│ │ ├── closing_loc: (110,13)-(110,14) = "]" │ │ ├── closing_loc: (110,13)-(110,14) = "]"
@ -3038,6 +3046,7 @@
│ │ └── @ InNode (location: (136,10)-(136,23)) │ │ └── @ InNode (location: (136,10)-(136,23))
│ │ ├── pattern: │ │ ├── pattern:
│ │ │ @ XStringNode (location: (136,13)-(136,18)) │ │ │ @ XStringNode (location: (136,13)-(136,18))
│ │ │ ├── flags: ∅
│ │ │ ├── opening_loc: (136,13)-(136,14) = "`" │ │ │ ├── opening_loc: (136,13)-(136,14) = "`"
│ │ │ ├── content_loc: (136,14)-(136,17) = "foo" │ │ │ ├── content_loc: (136,14)-(136,17) = "foo"
│ │ │ ├── closing_loc: (136,17)-(136,18) = "`" │ │ │ ├── closing_loc: (136,17)-(136,18) = "`"
@ -3064,6 +3073,7 @@
│ │ └── @ InNode (location: (137,10)-(137,25)) │ │ └── @ InNode (location: (137,10)-(137,25))
│ │ ├── pattern: │ │ ├── pattern:
│ │ │ @ XStringNode (location: (137,13)-(137,20)) │ │ │ @ XStringNode (location: (137,13)-(137,20))
│ │ │ ├── flags: ∅
│ │ │ ├── opening_loc: (137,13)-(137,16) = "%x[" │ │ │ ├── opening_loc: (137,13)-(137,16) = "%x["
│ │ │ ├── content_loc: (137,16)-(137,19) = "foo" │ │ │ ├── content_loc: (137,16)-(137,19) = "foo"
│ │ │ ├── closing_loc: (137,19)-(137,20) = "]" │ │ │ ├── closing_loc: (137,19)-(137,20) = "]"
@ -3828,6 +3838,7 @@
│ │ │ │ @ StatementsNode (location: (163,13)-(163,18)) │ │ │ │ @ StatementsNode (location: (163,13)-(163,18))
│ │ │ │ └── body: (length: 1) │ │ │ │ └── body: (length: 1)
│ │ │ │ └── @ XStringNode (location: (163,13)-(163,18)) │ │ │ │ └── @ XStringNode (location: (163,13)-(163,18))
│ │ │ │ ├── flags: ∅
│ │ │ │ ├── opening_loc: (163,13)-(163,14) = "`" │ │ │ │ ├── opening_loc: (163,13)-(163,14) = "`"
│ │ │ │ ├── content_loc: (163,14)-(163,17) = "foo" │ │ │ │ ├── content_loc: (163,14)-(163,17) = "foo"
│ │ │ │ ├── closing_loc: (163,17)-(163,18) = "`" │ │ │ │ ├── closing_loc: (163,17)-(163,18) = "`"
@ -3866,6 +3877,7 @@
│ │ │ │ @ StatementsNode (location: (164,13)-(164,20)) │ │ │ │ @ StatementsNode (location: (164,13)-(164,20))
│ │ │ │ └── body: (length: 1) │ │ │ │ └── body: (length: 1)
│ │ │ │ └── @ XStringNode (location: (164,13)-(164,20)) │ │ │ │ └── @ XStringNode (location: (164,13)-(164,20))
│ │ │ │ ├── flags: ∅
│ │ │ │ ├── opening_loc: (164,13)-(164,16) = "%x[" │ │ │ │ ├── opening_loc: (164,13)-(164,16) = "%x["
│ │ │ │ ├── content_loc: (164,16)-(164,19) = "foo" │ │ │ │ ├── content_loc: (164,16)-(164,19) = "foo"
│ │ │ │ ├── closing_loc: (164,19)-(164,20) = "]" │ │ │ │ ├── closing_loc: (164,19)-(164,20) = "]"

Просмотреть файл

@ -806,6 +806,7 @@
│ │ └── @ InNode (location: (98,0)-(98,12)) │ │ └── @ InNode (location: (98,0)-(98,12))
│ │ ├── pattern: │ │ ├── pattern:
│ │ │ @ XStringNode (location: (98,3)-(98,12)) │ │ │ @ XStringNode (location: (98,3)-(98,12))
│ │ │ ├── flags: ∅
│ │ │ ├── opening_loc: (98,3)-(98,4) = "`" │ │ │ ├── opening_loc: (98,3)-(98,4) = "`"
│ │ │ ├── content_loc: (98,4)-(98,11) = "echo hi" │ │ │ ├── content_loc: (98,4)-(98,11) = "echo hi"
│ │ │ ├── closing_loc: (98,11)-(98,12) = "`" │ │ │ ├── closing_loc: (98,11)-(98,12) = "`"

Просмотреть файл

@ -9,7 +9,7 @@
├── name_loc: (1,0)-(1,1) = "s" ├── name_loc: (1,0)-(1,1) = "s"
├── value: ├── value:
│ @ StringNode (location: (1,4)-(1,9)) │ @ StringNode (location: (1,4)-(1,9))
│ ├── flags: │ ├── flags: forced_utf8_encoding
│ ├── opening_loc: (1,4)-(1,9) = "<<eos" │ ├── opening_loc: (1,4)-(1,9) = "<<eos"
│ ├── content_loc: (2,0)-(3,0) = "a\\xE9b\n" │ ├── content_loc: (2,0)-(3,0) = "a\\xE9b\n"
│ ├── closing_loc: (3,0)-(4,0) = "eos\n" │ ├── closing_loc: (3,0)-(4,0) = "eos\n"

Просмотреть файл

@ -9,7 +9,7 @@
├── name_loc: (1,0)-(1,1) = "s" ├── name_loc: (1,0)-(1,1) = "s"
├── value: ├── value:
│ @ StringNode (location: (1,4)-(1,10)) │ @ StringNode (location: (1,4)-(1,10))
│ ├── flags: │ ├── flags: forced_utf8_encoding
│ ├── opening_loc: (1,4)-(1,10) = "<<-EOS" │ ├── opening_loc: (1,4)-(1,10) = "<<-EOS"
│ ├── content_loc: (2,0)-(4,0) = "a\\247b\ncöd\n" │ ├── content_loc: (2,0)-(4,0) = "a\\247b\ncöd\n"
│ ├── closing_loc: (4,0)-(5,0) = "EOS\n" │ ├── closing_loc: (4,0)-(5,0) = "EOS\n"

Просмотреть файл

@ -4,7 +4,7 @@
@ StatementsNode (location: (1,0)-(1,9)) @ StatementsNode (location: (1,0)-(1,9))
└── body: (length: 1) └── body: (length: 1)
└── @ StringNode (location: (1,0)-(1,9)) └── @ StringNode (location: (1,0)-(1,9))
├── flags: ├── flags: forced_utf8_encoding
├── opening_loc: (1,0)-(1,1) = "?" ├── opening_loc: (1,0)-(1,1) = "?"
├── content_loc: (1,1)-(1,9) = "\\u{00a0}" ├── content_loc: (1,1)-(1,9) = "\\u{00a0}"
├── closing_loc: ∅ ├── closing_loc: ∅

Просмотреть файл

@ -4,7 +4,7 @@
@ StatementsNode (location: (1,0)-(1,7)) @ StatementsNode (location: (1,0)-(1,7))
└── body: (length: 1) └── body: (length: 1)
└── @ StringNode (location: (1,0)-(1,7)) └── @ StringNode (location: (1,0)-(1,7))
├── flags: ├── flags: forced_utf8_encoding
├── opening_loc: (1,0)-(1,1) = "?" ├── opening_loc: (1,0)-(1,1) = "?"
├── content_loc: (1,1)-(1,7) = "\\u00a0" ├── content_loc: (1,1)-(1,7) = "\\u00a0"
├── closing_loc: ∅ ├── closing_loc: ∅

Просмотреть файл

@ -29,7 +29,7 @@
│ │ │ └── flags: variable_call │ │ │ └── flags: variable_call
│ │ └── closing_loc: (1,6)-(1,7) = "}" │ │ └── closing_loc: (1,6)-(1,7) = "}"
│ └── @ StringNode (location: (1,7)-(1,15)) │ └── @ StringNode (location: (1,7)-(1,15))
│ ├── flags: │ ├── flags: forced_utf8_encoding
│ ├── opening_loc: ∅ │ ├── opening_loc: ∅
│ ├── content_loc: (1,7)-(1,15) = "\\302\\275" │ ├── content_loc: (1,7)-(1,15) = "\\302\\275"
│ ├── closing_loc: ∅ │ ├── closing_loc: ∅

Просмотреть файл

@ -7,13 +7,13 @@
├── opening_loc: ∅ ├── opening_loc: ∅
├── parts: (length: 2) ├── parts: (length: 2)
│ ├── @ StringNode (location: (1,0)-(1,62)) │ ├── @ StringNode (location: (1,0)-(1,62))
│ │ ├── flags: │ │ ├── flags: forced_utf8_encoding
│ │ ├── opening_loc: (1,0)-(1,1) = "\"" │ │ ├── opening_loc: (1,0)-(1,1) = "\""
│ │ ├── content_loc: (1,1)-(1,61) = "\\xE3\\xD3\\x8B\\xE3\\x83\\xBC\\x83\\xE3\\x83\\xE3\\x82\\xB3\\xA3\\x82\\x99" │ │ ├── content_loc: (1,1)-(1,61) = "\\xE3\\xD3\\x8B\\xE3\\x83\\xBC\\x83\\xE3\\x83\\xE3\\x82\\xB3\\xA3\\x82\\x99"
│ │ ├── closing_loc: (1,61)-(1,62) = "\"" │ │ ├── closing_loc: (1,61)-(1,62) = "\""
│ │ └── unescaped: "\xE3Ӌー\x83\xE3\x83コ\xA3\x82\x99" │ │ └── unescaped: "\xE3Ӌー\x83\xE3\x83コ\xA3\x82\x99"
│ └── @ StringNode (location: (2,8)-(2,66)) │ └── @ StringNode (location: (2,8)-(2,66))
│ ├── flags: │ ├── flags: forced_utf8_encoding
│ ├── opening_loc: (2,8)-(2,9) = "\"" │ ├── opening_loc: (2,8)-(2,9) = "\""
│ ├── content_loc: (2,9)-(2,65) = "\\xE3\\x83\\xB3\\xE3\\x83\\x8F\\xE3\\x82\\x9A\\xC3\\xBD;foo@bar.com" │ ├── content_loc: (2,9)-(2,65) = "\\xE3\\x83\\xB3\\xE3\\x83\\x8F\\xE3\\x82\\x9A\\xC3\\xBD;foo@bar.com"
│ ├── closing_loc: (2,65)-(2,66) = "\"" │ ├── closing_loc: (2,65)-(2,66) = "\""

Просмотреть файл

@ -472,6 +472,7 @@
│ ├── closing_loc: (37,9)-(37,10) = "\"" │ ├── closing_loc: (37,9)-(37,10) = "\""
│ └── unescaped: "foo\nbar" │ └── unescaped: "foo\nbar"
├── @ XStringNode (location: (38,0)-(38,5)) ├── @ XStringNode (location: (38,0)-(38,5))
│ ├── flags: ∅
│ ├── opening_loc: (38,0)-(38,1) = "`" │ ├── opening_loc: (38,0)-(38,1) = "`"
│ ├── content_loc: (38,1)-(38,4) = "foo" │ ├── content_loc: (38,1)-(38,4) = "foo"
│ ├── closing_loc: (38,4)-(38,5) = "`" │ ├── closing_loc: (38,4)-(38,5) = "`"
@ -495,16 +496,19 @@
│ │ └── closing_loc: (39,10)-(39,11) = "}" │ │ └── closing_loc: (39,10)-(39,11) = "}"
│ └── closing_loc: (39,11)-(39,12) = "`" │ └── closing_loc: (39,11)-(39,12) = "`"
├── @ XStringNode (location: (40,0)-(40,3)) ├── @ XStringNode (location: (40,0)-(40,3))
│ ├── flags: ∅
│ ├── opening_loc: (40,0)-(40,1) = "`" │ ├── opening_loc: (40,0)-(40,1) = "`"
│ ├── content_loc: (40,1)-(40,2) = ")" │ ├── content_loc: (40,1)-(40,2) = ")"
│ ├── closing_loc: (40,2)-(40,3) = "`" │ ├── closing_loc: (40,2)-(40,3) = "`"
│ └── unescaped: ")" │ └── unescaped: ")"
├── @ XStringNode (location: (41,0)-(41,4)) ├── @ XStringNode (location: (41,0)-(41,4))
│ ├── flags: ∅
│ ├── opening_loc: (41,0)-(41,1) = "`" │ ├── opening_loc: (41,0)-(41,1) = "`"
│ ├── content_loc: (41,1)-(41,3) = "\\`" │ ├── content_loc: (41,1)-(41,3) = "\\`"
│ ├── closing_loc: (41,3)-(41,4) = "`" │ ├── closing_loc: (41,3)-(41,4) = "`"
│ └── unescaped: "`" │ └── unescaped: "`"
├── @ XStringNode (location: (42,0)-(42,3)) ├── @ XStringNode (location: (42,0)-(42,3))
│ ├── flags: ∅
│ ├── opening_loc: (42,0)-(42,1) = "`" │ ├── opening_loc: (42,0)-(42,1) = "`"
│ ├── content_loc: (42,1)-(42,2) = "\"" │ ├── content_loc: (42,1)-(42,2) = "\""
│ ├── closing_loc: (42,2)-(42,3) = "`" │ ├── closing_loc: (42,2)-(42,3) = "`"

Просмотреть файл

@ -4,7 +4,7 @@
@ StatementsNode (location: (2,9)-(2,75)) @ StatementsNode (location: (2,9)-(2,75))
└── body: (length: 1) └── body: (length: 1)
└── @ StringNode (location: (2,9)-(2,75)) └── @ StringNode (location: (2,9)-(2,75))
├── flags: ├── flags: forced_utf8_encoding
├── opening_loc: (2,9)-(2,10) = "\"" ├── opening_loc: (2,9)-(2,10) = "\""
├── content_loc: (2,10)-(2,74) = "\\xD0\\xBF\\xD1\\x80\\xD0\\xBE\\xD0\\xB2\\xD0\\xB5\\xD1\\x80\\xD0\\xBA\\xD0\\xB0" ├── content_loc: (2,10)-(2,74) = "\\xD0\\xBF\\xD1\\x80\\xD0\\xBE\\xD0\\xB2\\xD0\\xB5\\xD1\\x80\\xD0\\xBA\\xD0\\xB0"
├── closing_loc: (2,74)-(2,75) = "\"" ├── closing_loc: (2,74)-(2,75) = "\""

Просмотреть файл

@ -16,6 +16,7 @@
│ ├── closing_loc: (9,0)-(10,0) = "HERE\n" │ ├── closing_loc: (9,0)-(10,0) = "HERE\n"
│ └── unescaped: "foo\nbar\n" │ └── unescaped: "foo\nbar\n"
└── @ XStringNode (location: (11,0)-(11,8)) └── @ XStringNode (location: (11,0)-(11,8))
├── flags: ∅
├── opening_loc: (11,0)-(11,8) = "<<`HERE`" ├── opening_loc: (11,0)-(11,8) = "<<`HERE`"
├── content_loc: (12,0)-(14,0) = "foo\nbar\n" ├── content_loc: (12,0)-(14,0) = "foo\nbar\n"
├── closing_loc: (14,0)-(15,0) = "HERE\n" ├── closing_loc: (14,0)-(15,0) = "HERE\n"

Просмотреть файл

@ -146,11 +146,13 @@
│ ├── closing_loc: (35,10)-(35,11) = "]" │ ├── closing_loc: (35,10)-(35,11) = "]"
│ └── flags: ∅ │ └── flags: ∅
├── @ XStringNode (location: (37,1)-(37,8)) ├── @ XStringNode (location: (37,1)-(37,8))
│ ├── flags: ∅
│ ├── opening_loc: (37,1)-(37,4) = "%x{" │ ├── opening_loc: (37,1)-(37,4) = "%x{"
│ ├── content_loc: (37,4)-(37,7) = "\#@1" │ ├── content_loc: (37,4)-(37,7) = "\#@1"
│ ├── closing_loc: (37,7)-(37,8) = "}" │ ├── closing_loc: (37,7)-(37,8) = "}"
│ └── unescaped: "\#@1" │ └── unescaped: "\#@1"
├── @ XStringNode (location: (39,1)-(39,9)) ├── @ XStringNode (location: (39,1)-(39,9))
│ ├── flags: ∅
│ ├── opening_loc: (39,1)-(39,4) = "%x{" │ ├── opening_loc: (39,1)-(39,4) = "%x{"
│ ├── content_loc: (39,4)-(39,8) = "\#@@1" │ ├── content_loc: (39,4)-(39,8) = "\#@@1"
│ ├── closing_loc: (39,8)-(39,9) = "}" │ ├── closing_loc: (39,8)-(39,9) = "}"
@ -212,11 +214,13 @@
│ ├── closing_loc: (59,7)-(59,8) = "'" │ ├── closing_loc: (59,7)-(59,8) = "'"
│ └── unescaped: "\#@@1" │ └── unescaped: "\#@@1"
├── @ XStringNode (location: (61,1)-(61,6)) ├── @ XStringNode (location: (61,1)-(61,6))
│ ├── flags: ∅
│ ├── opening_loc: (61,1)-(61,2) = "`" │ ├── opening_loc: (61,1)-(61,2) = "`"
│ ├── content_loc: (61,2)-(61,5) = "\#@1" │ ├── content_loc: (61,2)-(61,5) = "\#@1"
│ ├── closing_loc: (61,5)-(61,6) = "`" │ ├── closing_loc: (61,5)-(61,6) = "`"
│ └── unescaped: "\#@1" │ └── unescaped: "\#@1"
├── @ XStringNode (location: (63,1)-(63,7)) ├── @ XStringNode (location: (63,1)-(63,7))
│ ├── flags: ∅
│ ├── opening_loc: (63,1)-(63,2) = "`" │ ├── opening_loc: (63,1)-(63,2) = "`"
│ ├── content_loc: (63,2)-(63,6) = "\#@@1" │ ├── content_loc: (63,2)-(63,6) = "\#@@1"
│ ├── closing_loc: (63,6)-(63,7) = "`" │ ├── closing_loc: (63,6)-(63,7) = "`"
@ -246,11 +250,13 @@
│ ├── closing_loc: (79,0)-(80,0) = "HERE\n" │ ├── closing_loc: (79,0)-(80,0) = "HERE\n"
│ └── unescaped: "\#@@1\n" │ └── unescaped: "\#@@1\n"
├── @ XStringNode (location: (81,0)-(81,9)) ├── @ XStringNode (location: (81,0)-(81,9))
│ ├── flags: ∅
│ ├── opening_loc: (81,0)-(81,9) = "<<-`HERE`" │ ├── opening_loc: (81,0)-(81,9) = "<<-`HERE`"
│ ├── content_loc: (82,0)-(83,0) = "\#@1\n" │ ├── content_loc: (82,0)-(83,0) = "\#@1\n"
│ ├── closing_loc: (83,0)-(84,0) = "HERE\n" │ ├── closing_loc: (83,0)-(84,0) = "HERE\n"
│ └── unescaped: "\#@1\n" │ └── unescaped: "\#@1\n"
└── @ XStringNode (location: (85,0)-(85,9)) └── @ XStringNode (location: (85,0)-(85,9))
├── flags: ∅
├── opening_loc: (85,0)-(85,9) = "<<-`HERE`" ├── opening_loc: (85,0)-(85,9) = "<<-`HERE`"
├── content_loc: (86,0)-(87,0) = "\#@@1\n" ├── content_loc: (86,0)-(87,0) = "\#@@1\n"
├── closing_loc: (87,0)-(88,0) = "HERE\n" ├── closing_loc: (87,0)-(88,0) = "HERE\n"

Просмотреть файл

@ -75,6 +75,7 @@
│ ├── closing_loc: (26,1)-(26,2) = "}" │ ├── closing_loc: (26,1)-(26,2) = "}"
│ └── flags: ∅ │ └── flags: ∅
├── @ XStringNode (location: (28,0)-(29,2)) ├── @ XStringNode (location: (28,0)-(29,2))
│ ├── flags: ∅
│ ├── opening_loc: (28,0)-(28,3) = "%x{" │ ├── opening_loc: (28,0)-(28,3) = "%x{"
│ ├── content_loc: (28,3)-(29,1) = "a\\\nb" │ ├── content_loc: (28,3)-(29,1) = "a\\\nb"
│ ├── closing_loc: (29,1)-(29,2) = "}" │ ├── closing_loc: (29,1)-(29,2) = "}"
@ -120,11 +121,13 @@
│ ├── closing_loc: (54,0)-(55,0) = "HERE\n" │ ├── closing_loc: (54,0)-(55,0) = "HERE\n"
│ └── unescaped: "a\\\nb\n" │ └── unescaped: "a\\\nb\n"
├── @ XStringNode (location: (56,0)-(56,9)) ├── @ XStringNode (location: (56,0)-(56,9))
│ ├── flags: ∅
│ ├── opening_loc: (56,0)-(56,9) = "<<-`HERE`" │ ├── opening_loc: (56,0)-(56,9) = "<<-`HERE`"
│ ├── content_loc: (57,0)-(59,0) = "a\\\nb\n" │ ├── content_loc: (57,0)-(59,0) = "a\\\nb\n"
│ ├── closing_loc: (59,0)-(60,0) = "HERE\n" │ ├── closing_loc: (59,0)-(60,0) = "HERE\n"
│ └── unescaped: "ab\n" │ └── unescaped: "ab\n"
└── @ XStringNode (location: (61,0)-(62,2)) └── @ XStringNode (location: (61,0)-(62,2))
├── flags: ∅
├── opening_loc: (61,0)-(61,1) = "`" ├── opening_loc: (61,0)-(61,1) = "`"
├── content_loc: (61,1)-(62,1) = "a\\\nb" ├── content_loc: (61,1)-(62,1) = "a\\\nb"
├── closing_loc: (62,1)-(62,2) = "`" ├── closing_loc: (62,1)-(62,2) = "`"

Просмотреть файл

@ -4,6 +4,7 @@
@ StatementsNode (location: (1,0)-(1,8)) @ StatementsNode (location: (1,0)-(1,8))
└── body: (length: 1) └── body: (length: 1)
└── @ XStringNode (location: (1,0)-(1,8)) └── @ XStringNode (location: (1,0)-(1,8))
├── flags: ∅
├── opening_loc: (1,0)-(1,1) = "`" ├── opening_loc: (1,0)-(1,1) = "`"
├── content_loc: (1,1)-(1,7) = "foobar" ├── content_loc: (1,1)-(1,7) = "foobar"
├── closing_loc: (1,7)-(1,8) = "`" ├── closing_loc: (1,7)-(1,8) = "`"

Просмотреть файл

@ -4,6 +4,7 @@
@ StatementsNode (location: (1,0)-(7,5)) @ StatementsNode (location: (1,0)-(7,5))
└── body: (length: 4) └── body: (length: 4)
├── @ XStringNode (location: (1,0)-(1,7)) ├── @ XStringNode (location: (1,0)-(1,7))
│ ├── flags: ∅
│ ├── opening_loc: (1,0)-(1,3) = "%x[" │ ├── opening_loc: (1,0)-(1,3) = "%x["
│ ├── content_loc: (1,3)-(1,6) = "foo" │ ├── content_loc: (1,3)-(1,6) = "foo"
│ ├── closing_loc: (1,6)-(1,7) = "]" │ ├── closing_loc: (1,6)-(1,7) = "]"
@ -41,11 +42,13 @@
│ │ └── unescaped: " baz" │ │ └── unescaped: " baz"
│ └── closing_loc: (3,15)-(3,16) = "`" │ └── closing_loc: (3,15)-(3,16) = "`"
├── @ XStringNode (location: (5,0)-(5,6)) ├── @ XStringNode (location: (5,0)-(5,6))
│ ├── flags: ∅
│ ├── opening_loc: (5,0)-(5,1) = "`" │ ├── opening_loc: (5,0)-(5,1) = "`"
│ ├── content_loc: (5,1)-(5,5) = "f\\oo" │ ├── content_loc: (5,1)-(5,5) = "f\\oo"
│ ├── closing_loc: (5,5)-(5,6) = "`" │ ├── closing_loc: (5,5)-(5,6) = "`"
│ └── unescaped: "foo" │ └── unescaped: "foo"
└── @ XStringNode (location: (7,0)-(7,5)) └── @ XStringNode (location: (7,0)-(7,5))
├── flags: ∅
├── opening_loc: (7,0)-(7,1) = "`" ├── opening_loc: (7,0)-(7,1) = "`"
├── content_loc: (7,1)-(7,4) = "foo" ├── content_loc: (7,1)-(7,4) = "foo"
├── closing_loc: (7,4)-(7,5) = "`" ├── closing_loc: (7,4)-(7,5) = "`"