зеркало из https://github.com/github/ruby.git
Mark the first string element of a regexp as binary if US-ASCII
This commit is contained in:
Родитель
b5cefa79dd
Коммит
5409661fe6
|
@ -19173,6 +19173,14 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
|
|||
pm_token_t opening = not_provided(parser);
|
||||
pm_token_t closing = not_provided(parser);
|
||||
pm_node_t *part = (pm_node_t *) pm_string_node_create_unescaped(parser, &opening, &parser->previous, &closing, &unescaped);
|
||||
|
||||
if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY) {
|
||||
// This is extremely strange, but the first string part of a
|
||||
// regular expression will always be tagged as binary if we
|
||||
// are in a US-ASCII file, no matter its contents.
|
||||
pm_node_flag_set(part, PM_STRING_FLAGS_FORCED_BINARY_ENCODING);
|
||||
}
|
||||
|
||||
pm_interpolated_regular_expression_node_append(interpolated, part);
|
||||
} else {
|
||||
// If the first part of the body of the regular expression is not a
|
||||
|
|
|
@ -363,18 +363,34 @@ parse_regexp_error(rb_iseq_t *iseq, int32_t line_number, const char *fmt, ...)
|
|||
}
|
||||
|
||||
static VALUE
|
||||
parse_regexp_string_part(rb_iseq_t *iseq, const pm_scope_node_t *scope_node, const pm_node_t *node, const pm_string_t *unescaped, rb_encoding *regexp_encoding)
|
||||
parse_regexp_string_part(rb_iseq_t *iseq, const pm_scope_node_t *scope_node, const pm_node_t *node, const pm_string_t *unescaped, rb_encoding *implicit_regexp_encoding, rb_encoding *explicit_regexp_encoding)
|
||||
{
|
||||
// If we were passed an explicit regexp encoding, then we need to double
|
||||
// check that it's okay here for this fragment of the string.
|
||||
VALUE string = rb_enc_str_new((const char *) pm_string_source(unescaped), pm_string_length(unescaped), regexp_encoding);
|
||||
rb_encoding *encoding;
|
||||
|
||||
if (explicit_regexp_encoding != NULL) {
|
||||
encoding = explicit_regexp_encoding;
|
||||
}
|
||||
else if (node->flags & PM_STRING_FLAGS_FORCED_BINARY_ENCODING) {
|
||||
encoding = rb_ascii8bit_encoding();
|
||||
}
|
||||
else if (node->flags & PM_STRING_FLAGS_FORCED_UTF8_ENCODING) {
|
||||
encoding = rb_utf8_encoding();
|
||||
}
|
||||
else {
|
||||
encoding = implicit_regexp_encoding;
|
||||
}
|
||||
|
||||
VALUE string = rb_enc_str_new((const char *) pm_string_source(unescaped), pm_string_length(unescaped), encoding);
|
||||
VALUE error = rb_reg_check_preprocess(string);
|
||||
|
||||
if (error != Qnil) parse_regexp_error(iseq, pm_node_line_number(scope_node->parser, node), "%" PRIsVALUE, rb_obj_as_string(error));
|
||||
return string;
|
||||
}
|
||||
|
||||
static VALUE
|
||||
pm_static_literal_concat(rb_iseq_t *iseq, const pm_node_list_t *nodes, const pm_scope_node_t *scope_node, rb_encoding *regexp_encoding, bool top)
|
||||
pm_static_literal_concat(rb_iseq_t *iseq, const pm_node_list_t *nodes, const pm_scope_node_t *scope_node, rb_encoding *implicit_regexp_encoding, rb_encoding *explicit_regexp_encoding, bool top)
|
||||
{
|
||||
VALUE current = Qnil;
|
||||
|
||||
|
@ -384,9 +400,9 @@ pm_static_literal_concat(rb_iseq_t *iseq, const pm_node_list_t *nodes, const pm_
|
|||
|
||||
switch (PM_NODE_TYPE(part)) {
|
||||
case PM_STRING_NODE:
|
||||
if (regexp_encoding != NULL) {
|
||||
if (implicit_regexp_encoding != NULL) {
|
||||
if (top) {
|
||||
string = parse_regexp_string_part(iseq, scope_node, part, &((const pm_string_node_t *) part)->unescaped, regexp_encoding);
|
||||
string = parse_regexp_string_part(iseq, scope_node, part, &((const pm_string_node_t *) part)->unescaped, implicit_regexp_encoding, explicit_regexp_encoding);
|
||||
}
|
||||
else {
|
||||
string = parse_string_encoded(part, &((const pm_string_node_t *) part)->unescaped, scope_node->encoding);
|
||||
|
@ -399,11 +415,11 @@ pm_static_literal_concat(rb_iseq_t *iseq, const pm_node_list_t *nodes, const pm_
|
|||
}
|
||||
break;
|
||||
case PM_INTERPOLATED_STRING_NODE:
|
||||
string = pm_static_literal_concat(iseq, &((const pm_interpolated_string_node_t *) part)->parts, scope_node, regexp_encoding, false);
|
||||
string = pm_static_literal_concat(iseq, &((const pm_interpolated_string_node_t *) part)->parts, scope_node, implicit_regexp_encoding, explicit_regexp_encoding, false);
|
||||
break;
|
||||
case PM_EMBEDDED_STATEMENTS_NODE: {
|
||||
const pm_embedded_statements_node_t *cast = (const pm_embedded_statements_node_t *) part;
|
||||
string = pm_static_literal_concat(iseq, &cast->statements->body, scope_node, regexp_encoding, false);
|
||||
string = pm_static_literal_concat(iseq, &cast->statements->body, scope_node, implicit_regexp_encoding, explicit_regexp_encoding, false);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
|
@ -499,7 +515,7 @@ parse_regexp_encoding(const pm_scope_node_t *scope_node, const pm_node_t *node)
|
|||
return rb_enc_get_from_index(ENCINDEX_Windows_31J);
|
||||
}
|
||||
else {
|
||||
return scope_node->encoding;
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -527,6 +543,8 @@ static inline VALUE
|
|||
parse_regexp_literal(rb_iseq_t *iseq, const pm_scope_node_t *scope_node, const pm_node_t *node, const pm_string_t *unescaped)
|
||||
{
|
||||
rb_encoding *regexp_encoding = parse_regexp_encoding(scope_node, node);
|
||||
if (regexp_encoding == NULL) regexp_encoding = scope_node->encoding;
|
||||
|
||||
VALUE string = rb_enc_str_new((const char *) pm_string_source(unescaped), pm_string_length(unescaped), regexp_encoding);
|
||||
return parse_regexp(iseq, scope_node, node, string);
|
||||
}
|
||||
|
@ -534,15 +552,17 @@ parse_regexp_literal(rb_iseq_t *iseq, const pm_scope_node_t *scope_node, const p
|
|||
static inline VALUE
|
||||
parse_regexp_concat(rb_iseq_t *iseq, const pm_scope_node_t *scope_node, const pm_node_t *node, const pm_node_list_t *parts)
|
||||
{
|
||||
rb_encoding *regexp_encoding = parse_regexp_encoding(scope_node, node);
|
||||
VALUE string = pm_static_literal_concat(iseq, parts, scope_node, regexp_encoding, false);
|
||||
rb_encoding *explicit_regexp_encoding = parse_regexp_encoding(scope_node, node);
|
||||
rb_encoding *implicit_regexp_encoding = explicit_regexp_encoding != NULL ? explicit_regexp_encoding : scope_node->encoding;
|
||||
|
||||
VALUE string = pm_static_literal_concat(iseq, parts, scope_node, implicit_regexp_encoding, explicit_regexp_encoding, false);
|
||||
return parse_regexp(iseq, scope_node, node, string);
|
||||
}
|
||||
|
||||
static void pm_compile_node(rb_iseq_t *iseq, const pm_node_t *node, LINK_ANCHOR *const ret, bool popped, pm_scope_node_t *scope_node);
|
||||
|
||||
static int
|
||||
pm_interpolated_node_compile(rb_iseq_t *iseq, const pm_node_list_t *parts, const pm_line_column_t *node_location, LINK_ANCHOR *const ret, bool popped, pm_scope_node_t *scope_node, rb_encoding *regexp_encoding)
|
||||
pm_interpolated_node_compile(rb_iseq_t *iseq, const pm_node_list_t *parts, const pm_line_column_t *node_location, LINK_ANCHOR *const ret, bool popped, pm_scope_node_t *scope_node, rb_encoding *implicit_regexp_encoding, rb_encoding *explicit_regexp_encoding)
|
||||
{
|
||||
int stack_size = 0;
|
||||
size_t parts_size = parts->size;
|
||||
|
@ -558,11 +578,11 @@ pm_interpolated_node_compile(rb_iseq_t *iseq, const pm_node_list_t *parts, const
|
|||
const pm_string_node_t *string_node = (const pm_string_node_t *) part;
|
||||
VALUE string_value;
|
||||
|
||||
if (regexp_encoding == NULL) {
|
||||
if (implicit_regexp_encoding == NULL) {
|
||||
string_value = parse_string_encoded(part, &string_node->unescaped, scope_node->encoding);
|
||||
}
|
||||
else {
|
||||
string_value = parse_regexp_string_part(iseq, scope_node, (const pm_node_t *) string_node, &string_node->unescaped, regexp_encoding);
|
||||
string_value = parse_regexp_string_part(iseq, scope_node, (const pm_node_t *) string_node, &string_node->unescaped, implicit_regexp_encoding, explicit_regexp_encoding);
|
||||
}
|
||||
|
||||
if (RTEST(current_string)) {
|
||||
|
@ -584,11 +604,11 @@ pm_interpolated_node_compile(rb_iseq_t *iseq, const pm_node_list_t *parts, const
|
|||
const pm_string_node_t *string_node = (const pm_string_node_t *) ((const pm_embedded_statements_node_t *) part)->statements->body.nodes[0];
|
||||
VALUE string_value;
|
||||
|
||||
if (regexp_encoding == NULL) {
|
||||
if (implicit_regexp_encoding == NULL) {
|
||||
string_value = parse_string_encoded(part, &string_node->unescaped, scope_node->encoding);
|
||||
}
|
||||
else {
|
||||
string_value = parse_regexp_string_part(iseq, scope_node, (const pm_node_t *) string_node, &string_node->unescaped, regexp_encoding);
|
||||
string_value = parse_regexp_string_part(iseq, scope_node, (const pm_node_t *) string_node, &string_node->unescaped, implicit_regexp_encoding, explicit_regexp_encoding);
|
||||
}
|
||||
|
||||
if (RTEST(current_string)) {
|
||||
|
@ -600,7 +620,24 @@ pm_interpolated_node_compile(rb_iseq_t *iseq, const pm_node_list_t *parts, const
|
|||
}
|
||||
else {
|
||||
if (!RTEST(current_string)) {
|
||||
current_string = rb_enc_str_new(NULL, 0, regexp_encoding != NULL ? regexp_encoding : scope_node->encoding);
|
||||
rb_encoding *encoding;
|
||||
|
||||
if (implicit_regexp_encoding != NULL) {
|
||||
if (explicit_regexp_encoding != NULL) {
|
||||
encoding = explicit_regexp_encoding;
|
||||
}
|
||||
else if (scope_node->parser->encoding == PM_ENCODING_US_ASCII_ENTRY) {
|
||||
encoding = rb_ascii8bit_encoding();
|
||||
}
|
||||
else {
|
||||
encoding = implicit_regexp_encoding;
|
||||
}
|
||||
}
|
||||
else {
|
||||
encoding = scope_node->encoding;
|
||||
}
|
||||
|
||||
current_string = rb_enc_str_new(NULL, 0, encoding);
|
||||
}
|
||||
|
||||
PUSH_INSN1(ret, *node_location, putobject, rb_fstring(current_string));
|
||||
|
@ -639,9 +676,10 @@ pm_interpolated_node_compile(rb_iseq_t *iseq, const pm_node_list_t *parts, const
|
|||
static void
|
||||
pm_compile_regexp_dynamic(rb_iseq_t *iseq, const pm_node_t *node, const pm_node_list_t *parts, const pm_line_column_t *node_location, LINK_ANCHOR *const ret, bool popped, pm_scope_node_t *scope_node)
|
||||
{
|
||||
rb_encoding *regexp_encoding = parse_regexp_encoding(scope_node, node);
|
||||
int length = pm_interpolated_node_compile(iseq, parts, node_location, ret, popped, scope_node, regexp_encoding);
|
||||
rb_encoding *explicit_regexp_encoding = parse_regexp_encoding(scope_node, node);
|
||||
rb_encoding *implicit_regexp_encoding = explicit_regexp_encoding != NULL ? explicit_regexp_encoding : scope_node->encoding;
|
||||
|
||||
int length = pm_interpolated_node_compile(iseq, parts, node_location, ret, popped, scope_node, implicit_regexp_encoding, explicit_regexp_encoding);
|
||||
PUSH_INSN2(ret, *node_location, toregexp, INT2FIX(parse_regexp_flags(node) & 0xFF), INT2FIX(length));
|
||||
}
|
||||
|
||||
|
@ -738,13 +776,13 @@ pm_static_literal_value(rb_iseq_t *iseq, const pm_node_t *node, const pm_scope_n
|
|||
return parse_regexp_concat(iseq, scope_node, (const pm_node_t *) cast, &cast->parts);
|
||||
}
|
||||
case PM_INTERPOLATED_STRING_NODE: {
|
||||
VALUE string = pm_static_literal_concat(iseq, &((const pm_interpolated_string_node_t *) node)->parts, scope_node, NULL, false);
|
||||
VALUE string = pm_static_literal_concat(iseq, &((const pm_interpolated_string_node_t *) node)->parts, scope_node, NULL, NULL, false);
|
||||
int line_number = pm_node_line_number(scope_node->parser, node);
|
||||
return pm_static_literal_string(iseq, string, line_number);
|
||||
}
|
||||
case PM_INTERPOLATED_SYMBOL_NODE: {
|
||||
const pm_interpolated_symbol_node_t *cast = (const pm_interpolated_symbol_node_t *) node;
|
||||
VALUE string = pm_static_literal_concat(iseq, &cast->parts, scope_node, NULL, true);
|
||||
VALUE string = pm_static_literal_concat(iseq, &cast->parts, scope_node, NULL, NULL, true);
|
||||
|
||||
return ID2SYM(rb_intern_str(string));
|
||||
}
|
||||
|
@ -6524,7 +6562,7 @@ pm_compile_node(rb_iseq_t *iseq, const pm_node_t *node, LINK_ANCHOR *const ret,
|
|||
}
|
||||
else {
|
||||
const pm_interpolated_string_node_t *cast = (const pm_interpolated_string_node_t *) node;
|
||||
int length = pm_interpolated_node_compile(iseq, &cast->parts, &location, ret, popped, scope_node, NULL);
|
||||
int length = pm_interpolated_node_compile(iseq, &cast->parts, &location, ret, popped, scope_node, NULL, NULL);
|
||||
if (length > 1) PUSH_INSN1(ret, location, concatstrings, INT2FIX(length));
|
||||
if (popped) PUSH_INSN(ret, location, pop);
|
||||
}
|
||||
|
@ -6543,7 +6581,7 @@ pm_compile_node(rb_iseq_t *iseq, const pm_node_t *node, LINK_ANCHOR *const ret,
|
|||
}
|
||||
}
|
||||
else {
|
||||
int length = pm_interpolated_node_compile(iseq, &cast->parts, &location, ret, popped, scope_node, NULL);
|
||||
int length = pm_interpolated_node_compile(iseq, &cast->parts, &location, ret, popped, scope_node, NULL, NULL);
|
||||
if (length > 1) {
|
||||
PUSH_INSN1(ret, location, concatstrings, INT2FIX(length));
|
||||
}
|
||||
|
@ -6565,7 +6603,7 @@ pm_compile_node(rb_iseq_t *iseq, const pm_node_t *node, LINK_ANCHOR *const ret,
|
|||
|
||||
PUSH_INSN(ret, location, putself);
|
||||
|
||||
int length = pm_interpolated_node_compile(iseq, &cast->parts, &location, ret, false, scope_node, NULL);
|
||||
int length = pm_interpolated_node_compile(iseq, &cast->parts, &location, ret, false, scope_node, NULL, NULL);
|
||||
if (length > 1) PUSH_INSN1(ret, location, concatstrings, INT2FIX(length));
|
||||
|
||||
PUSH_SEND_WITH_FLAG(ret, location, idBackquote, INT2NUM(1), INT2FIX(VM_CALL_FCALL | VM_CALL_ARGS_SIMPLE));
|
||||
|
|
|
@ -1,3 +1,2 @@
|
|||
exclude(:test_regexp_ascii, "https://github.com/ruby/prism/issues/2664")
|
||||
exclude(:test_regexp_usascii, "unknown")
|
||||
exclude(:test_string_mixed_unicode, "unknown")
|
||||
|
|
Загрузка…
Ссылка в новой задаче