[ruby/prism] Parse all magic comments

https://github.com/ruby/prism/commit/2b3d59f424
This commit is contained in:
Kevin Newton 2023-10-13 11:56:08 -04:00 коммит произвёл Jemma Issroff
Родитель fd87372a74
Коммит 39dd3343d8
2 изменённых файлов: 179 добавлений и 79 удалений

Просмотреть файл

@ -5218,66 +5218,17 @@ next_newline(const uint8_t *cursor, ptrdiff_t length) {
return memchr(cursor, '\n', (size_t) length);
}
// Find the start of the encoding comment. This is effectively an inlined
// version of strnstr with some modifications.
static inline const uint8_t *
parser_lex_encoding_comment_start(pm_parser_t *parser, const uint8_t *cursor, ptrdiff_t remaining) {
assert(remaining >= 0);
size_t length = (size_t) remaining;
size_t key_length = strlen("coding:");
if (key_length > length) return NULL;
const uint8_t *cursor_limit = cursor + length - key_length + 1;
while ((cursor = pm_memchr(cursor, 'c', (size_t) (cursor_limit - cursor), parser->encoding_changed, &parser->encoding)) != NULL) {
if (memcmp(cursor, "coding", key_length - 1) == 0) {
size_t whitespace_after_coding = pm_strspn_inline_whitespace(cursor + key_length - 1, parser->end - (cursor + key_length - 1));
size_t cur_pos = key_length + whitespace_after_coding;
if (cursor[cur_pos - 1] == ':' || cursor[cur_pos - 1] == '=') {
return cursor + cur_pos;
}
}
cursor++;
}
return NULL;
}
// Here we're going to check if this is a "magic" comment, and perform whatever
// actions are necessary for it here.
static void
parser_lex_encoding_comment(pm_parser_t *parser) {
const uint8_t *start = parser->current.start + 1;
const uint8_t *end = parser->current.end;
// These are the patterns we're going to match to find the encoding comment.
// This is definitely not complete or even really correct.
const uint8_t *encoding_start = parser_lex_encoding_comment_start(parser, start, end - start);
// If we didn't find anything that matched our patterns, then return. Note
// that this does a _very_ poor job of actually finding the encoding, and
// there is a lot of work to do here to better reflect actual magic comment
// parsing from CRuby, but this at least gets us part of the way there.
if (encoding_start == NULL) return;
// Skip any non-newline whitespace after the "coding:" or "coding=".
encoding_start += pm_strspn_inline_whitespace(encoding_start, end - encoding_start);
// Now determine the end of the encoding string. This is either the end of
// the line, the first whitespace character, or a punctuation mark.
const uint8_t *encoding_end = pm_strpbrk(parser, encoding_start, (const uint8_t *) " \t\f\r\v\n;,", end - encoding_start);
encoding_end = encoding_end == NULL ? end : encoding_end;
// Finally, we can determine the width of the encoding string.
size_t width = (size_t) (encoding_end - encoding_start);
parser_lex_magic_comment_encoding(pm_parser_t *parser, const uint8_t *start, const uint8_t *end) {
size_t width = (size_t) (end - start);
// First, we're going to call out to a user-defined callback if one was
// provided. If they return an encoding struct that we can use, then we'll
// use that here.
if (parser->encoding_decode_callback != NULL) {
pm_encoding_t *encoding = parser->encoding_decode_callback(parser, encoding_start, width);
pm_encoding_t *encoding = parser->encoding_decode_callback(parser, start, width);
if (encoding != NULL) {
parser->encoding = *encoding;
@ -5289,7 +5240,7 @@ parser_lex_encoding_comment(pm_parser_t *parser) {
// Extensions like utf-8 can contain extra encoding details like,
// utf-8-dos, utf-8-linux, utf-8-mac. We treat these all as utf-8 should
// treat any encoding starting utf-8 as utf-8.
if ((encoding_start + 5 <= parser->end) && (pm_strncasecmp(encoding_start, (const uint8_t *) "utf-8", 5) == 0)) {
if ((start + 5 <= end) && (pm_strncasecmp(start, (const uint8_t *) "utf-8", 5) == 0)) {
// We don't need to do anything here because the default encoding is
// already UTF-8. We'll just return.
return;
@ -5298,7 +5249,7 @@ parser_lex_encoding_comment(pm_parser_t *parser) {
// Next, we're going to loop through each of the encodings that we handle
// explicitly. If we found one that we understand, we'll use that value.
#define ENCODING(value, prebuilt) \
if (width == sizeof(value) - 1 && encoding_start + width <= parser->end && pm_strncasecmp(encoding_start, (const uint8_t *) value, width) == 0) { \
if (width == sizeof(value) - 1 && start + width <= end && pm_strncasecmp(start, (const uint8_t *) value, width) == 0) { \
parser->encoding = prebuilt; \
parser->encoding_changed |= true; \
if (parser->encoding_changed_callback != NULL) parser->encoding_changed_callback(parser); \
@ -5347,39 +5298,156 @@ parser_lex_encoding_comment(pm_parser_t *parser) {
// didn't understand the encoding that the user was trying to use. In this
// case we'll keep using the default encoding but add an error to the
// parser to indicate an unsuccessful parse.
pm_parser_err(parser, encoding_start, encoding_end, PM_ERR_INVALID_ENCODING_MAGIC_COMMENT);
pm_parser_err(parser, start, end, PM_ERR_INVALID_ENCODING_MAGIC_COMMENT);
}
// Check if this is a magic comment that includes the frozen_string_literal
// pragma. If it does, set that field on the parser.
static void
parser_lex_frozen_string_literal_comment(pm_parser_t *parser) {
const uint8_t *cursor = parser->current.start + 1;
parser_lex_magic_comment_frozen_string_literal(pm_parser_t *parser, const uint8_t *start, const uint8_t *end) {
if (start + 4 <= end && pm_strncasecmp(start, (const uint8_t *) "true", 4) == 0) {
parser->frozen_string_literal = true;
}
}
static inline bool
pm_char_is_magic_comment_key_delimiter(const uint8_t b) {
return b == '\'' || b == '"' || b == ':' || b == ';';
}
// Find an emacs magic comment marker (-*-) within the given bounds. If one is
// found, it returns a pointer to the start of the marker. Otherwise it returns
// NULL.
static inline const uint8_t *
parser_lex_magic_comment_emacs_marker(pm_parser_t *parser, const uint8_t *cursor, const uint8_t *end) {
while ((cursor + 3 <= end) && (cursor = pm_memchr(cursor, '-', (size_t) (end - cursor), parser->encoding_changed, &parser->encoding)) != NULL) {
if (cursor + 3 <= end && cursor[1] == '*' && cursor[2] == '-') {
return cursor;
}
cursor++;
}
return NULL;
}
// Parse the current token on the parser to see if it's a magic comment and
// potentially perform some action based on that. A regular expression that this
// function is effectively matching is:
//
// %r"([^\\s\'\":;]+)\\s*:\\s*(\"(?:\\\\.|[^\"])*\"|[^\"\\s;]+)[\\s;]*"
//
static inline void
parser_lex_magic_comment(pm_parser_t *parser, bool semantic_token_seen) {
const uint8_t *start = parser->current.start + 1;
const uint8_t *end = parser->current.end;
size_t key_length = strlen("frozen_string_literal");
if (key_length > (size_t) (end - cursor)) return;
const uint8_t *cursor;
bool indicator = false;
const uint8_t *cursor_limit = cursor + (end - cursor) - key_length + 1;
if ((cursor = parser_lex_magic_comment_emacs_marker(parser, start, end)) != NULL) {
start = cursor + 3;
while ((cursor = pm_memchr(cursor, 'f', (size_t) (cursor_limit - cursor), parser->encoding_changed, &parser->encoding)) != NULL) {
if (memcmp(cursor, "frozen_string_literal", key_length) == 0) {
cursor += key_length;
cursor += pm_strspn_inline_whitespace(cursor, end - cursor);
if ((cursor = parser_lex_magic_comment_emacs_marker(parser, start, end)) != NULL) {
end = cursor;
indicator = true;
} else {
// If we have a start marker but not an end marker, then we cannot
// have a magic comment.
return;
}
}
if (*cursor == ':' || *cursor == '=') {
cursor++;
cursor += pm_strspn_inline_whitespace(cursor, end - cursor);
cursor = start;
while (cursor < end) {
while (cursor < end && (pm_char_is_magic_comment_key_delimiter(*cursor) || pm_char_is_whitespace(*cursor))) cursor++;
if (cursor + 4 <= end && pm_strncasecmp(cursor, (const uint8_t *) "true", 4) == 0) {
parser->frozen_string_literal = true;
}
const uint8_t *key_start = cursor;
while (cursor < end && (!pm_char_is_magic_comment_key_delimiter(*cursor) && !pm_char_is_whitespace(*cursor))) cursor++;
return;
const uint8_t *key_end = cursor;
while (cursor < end && pm_char_is_whitespace(*cursor)) cursor++;
if (cursor == end) return;
if (*cursor == ':') {
cursor++;
} else {
if (!indicator) return;
continue;
}
while (cursor < end && pm_char_is_whitespace(*cursor)) cursor++;
if (cursor == end) return;
const uint8_t *value_start;
const uint8_t *value_end;
if (*cursor == '"') {
value_start = ++cursor;
for (; cursor < end && *cursor != '"'; cursor++) {
if (*cursor == '\\' && (cursor + 1 < end)) cursor++;
}
value_end = cursor;
} else {
value_start = cursor;
while (cursor < end && *cursor != '"' && *cursor != ';' && !pm_char_is_whitespace(*cursor)) cursor++;
value_end = cursor;
}
if (indicator) {
while (cursor < end && (*cursor == ';' || pm_char_is_whitespace(*cursor))) cursor++;
} else {
while (cursor < end && pm_char_is_whitespace(*cursor)) cursor++;
if (cursor != end) return;
}
// Here, we need to do some processing on the key to swap out dashes for
// underscores. We only need to do this if there _is_ a dash in the key.
pm_string_t key;
const uint8_t *dash = pm_memchr(key_start, '-', (size_t) (key_end - key_start), parser->encoding_changed, &parser->encoding);
if (dash == NULL) {
pm_string_shared_init(&key, key_start, key_end);
} else {
size_t width = (size_t) (key_end - key_start);
uint8_t *buffer = malloc(width);
if (buffer == NULL) return;
memcpy(buffer, key_start, width);
buffer[dash - key_start] = '_';
while ((dash = pm_memchr(dash + 1, '-', (size_t) (key_end - dash - 1), parser->encoding_changed, &parser->encoding)) != NULL) {
buffer[dash - key_start] = '_';
}
pm_string_owned_init(&key, buffer, width);
}
// Finally, we can start checking the key against the list of known
// magic comment keys, and potentially change state based on that.
const char *key_source = (const char *) pm_string_source(&key);
const size_t key_length = pm_string_length(&key);
// We only want to attempt to compare against encoding comments if it's
// the first line in the file (or the second in the case of a shebang).
if (parser->current.start == parser->encoding_comment_start) {
if (
(key_length == 8 && strncasecmp(key_source, "encoding", 8) == 0) ||
(key_length == 6 && strncasecmp(key_source, "coding", 6) == 0)
) {
parser_lex_magic_comment_encoding(parser, value_start, value_end);
}
}
cursor++;
// We only want to handle frozen string literal comments if it's before
// any semantic tokens have been seen.
if (!semantic_token_seen) {
if (key_length == 21 && strncasecmp(key_source, "frozen_string_literal", 21) == 0) {
parser_lex_magic_comment_frozen_string_literal(parser, value_start, value_end);
}
}
// When we're done, we want to free the string in case we had to
// allocate memory for it.
pm_string_free(&key);
}
}
@ -6981,13 +7049,9 @@ parser_lex(pm_parser_t *parser) {
parser->current.type = PM_TOKEN_COMMENT;
parser_lex_callback(parser);
if (parser->current.start == parser->encoding_comment_start) {
parser_lex_encoding_comment(parser);
}
if (!semantic_token_seen) {
parser_lex_frozen_string_literal_comment(parser);
}
// Here, parse the comment to see if it's a magic comment
// and potentially change state on the parser.
parser_lex_magic_comment(parser, semantic_token_seen);
lexed_comment = true;
}

Просмотреть файл

@ -0,0 +1,36 @@
# frozen_string_literal: true
require_relative "test_helper"
module Prism
class MagicCommentTest < TestCase
examples = [
"# encoding: ascii",
"# coding: ascii",
"# eNcOdInG: ascii",
"# CoDiNg: ascii",
"# \s\t\v encoding \s\t\v : \s\t\v ascii \s\t\v",
"# -*- encoding: ascii -*-",
"# -*- coding: ascii -*-",
"# -*- eNcOdInG: ascii -*-",
"# -*- CoDiNg: ascii -*-",
"# -*- \s\t\v encoding \s\t\v : \s\t\v ascii \s\t\v -*-",
"# -*- foo: bar; encoding: ascii -*-",
"# coding \t \r \v : \t \v \r ascii-8bit\n"
]
examples.each do |example|
define_method(:"test_magic_comment_#{example}") do
assert_magic_comment(example)
end
end
private
def assert_magic_comment(example)
expected = Ripper.new(example).tap(&:parse).encoding
actual = Prism.parse(example).source.source.encoding
assert_equal expected, actual
end
end
end