[ruby/prism] Track both the unescaped bytes and source string for a regular expression so we can accurately set its encoding flags.

https://github.com/ruby/prism/commit/dc6dd3a926
2024-02-08 16:27:59 -05:00 · 2024-02-08 16:27:59 -05:00 · 82fb6a90d5
--- a/prism/encoding.h
+++ b/prism/encoding.h
@ -248,7 +248,7 @@ extern const pm_encoding_t pm_encodings[PM_ENCODING_MAXIMUM];
 /**
 * This is the ASCII-8BIT encoding. We need a reference to it so that pm_strpbrk
 * can compare against it because invalid multibyte characters are not a thing
- * in this encoding.
+ * in this encoding. It is also needed for handling Regexp encoding flags.
 */
 #define PM_ENCODING_ASCII_8BIT_ENTRY (&pm_encodings[PM_ENCODING_ASCII_8BIT])

--- a/prism/parser.h
+++ b/prism/parser.h
@ -663,6 +663,17 @@ struct pm_parser {
     */
    pm_string_t current_string;

+    /**
+     * This string is used to pass information from the lexer to the parser. When
+     * processing regular expressions we must track the string source for the expression
+     * as well as its unescaped representation. In that case, `current_string` will hold
+     * the unescaped value while this field will hold the translated source value. There
+     * are some escape sequences in regular expressions that will cause the associated
+     * source string to have a different value than the content of the expression so we
+     * must track this state separately.
+     */
+    pm_string_t current_regular_expression_source;
+
    /**
     * The line number at the start of the parse. This will be used to offset
     * the line numbers of all of the locations.
--- a/prism/prism.c
+++ b/prism/prism.c
@ -5949,6 +5949,34 @@ parse_symbol_encoding(const pm_parser_t *parser, const pm_string_t *contents) {
    return 0;
 }

+/**
+ * Ruby "downgrades" the encoding of Regexps to US-ASCII if the associated encoding is ASCII-compatible and
+ * the unescaped representation of a Regexp source (`contents`) consists only of US-ASCII code points. This is
+ * true even when the Regexp is explicitly given an ASCII-8BIT encoding via the (/n) modifier. Otherwise, the
+ * encoding may be explicitly set with an escape sequence. Returns the flag to set on the node, or 0 if none.
+ */
+static inline pm_node_flags_t
+parse_regular_expression_encoding(const pm_parser_t *parser, const pm_string_t *contents) {
+    // Ruby stipulates that all source files must use an ASCII-compatible encoding. Thus, all regular expressions
+    // appearing in source are eligible for "downgrading" to US-ASCII.
+    if (pm_ascii_only_p(contents)) {
+        return PM_REGULAR_EXPRESSION_FLAGS_FORCED_US_ASCII_ENCODING;
+    }
+
+    // A Regexp may optionally have its encoding explicitly set via a character escape sequence in the source string
+    // or by specifying a modifier. An explicit UTF-8 encoding yields the forced UTF-8 flag; any other explicit
+    // encoding yields the forced binary flag only when the source encoding (parser->encoding) is US-ASCII.
+    // NB: an explicitly set encoding is ignored by Ruby if the Regexp consists of only US ASCII code points.
+    if (parser->explicit_encoding != NULL) {
+        if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) {
+            return PM_REGULAR_EXPRESSION_FLAGS_FORCED_UTF8_ENCODING;
+        } else if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY) {
+            return PM_REGULAR_EXPRESSION_FLAGS_FORCED_BINARY_ENCODING;
+        }
+    }
+    return 0;
+}
+
 /**
 * Allocate and initialize a new SymbolNode node with the given unescaped
 * string.
@ -8130,34 +8158,34 @@ escape_write_escape_encoded(pm_parser_t *parser, pm_buffer_t *buffer) {
 * source so that the regular expression engine will perform its own unescaping.
 */
 static inline void
-escape_write_byte(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags, uint8_t byte) {
+escape_write_byte(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expression_buffer, uint8_t flags, uint8_t byte) {
    if (flags & PM_ESCAPE_FLAG_REGEXP) {
-        pm_buffer_append_bytes(buffer, (const uint8_t *) "\\x", 2);
+        pm_buffer_append_bytes(regular_expression_buffer, (const uint8_t *) "\\x", 2);

        uint8_t byte1 = (uint8_t) ((byte >> 4) & 0xF);
        uint8_t byte2 = (uint8_t) (byte & 0xF);

        if (byte1 >= 0xA) {
-            pm_buffer_append_byte(buffer, (uint8_t) ((byte1 - 0xA) + 'A'));
+            pm_buffer_append_byte(regular_expression_buffer, (uint8_t) ((byte1 - 0xA) + 'A'));
        } else {
-            pm_buffer_append_byte(buffer, (uint8_t) (byte1 + '0'));
+            pm_buffer_append_byte(regular_expression_buffer, (uint8_t) (byte1 + '0'));
        }

        if (byte2 >= 0xA) {
-            pm_buffer_append_byte(buffer, (uint8_t) (byte2 - 0xA + 'A'));
+            pm_buffer_append_byte(regular_expression_buffer, (uint8_t) (byte2 - 0xA + 'A'));
        } else {
-            pm_buffer_append_byte(buffer, (uint8_t) (byte2 + '0'));
+            pm_buffer_append_byte(regular_expression_buffer, (uint8_t) (byte2 + '0'));
        }
-    } else {
-        escape_write_byte_encoded(parser, buffer, byte);
    }
+
+    escape_write_byte_encoded(parser, buffer, byte);
 }

 /**
 * Read the value of an escape into the buffer.
 */
 static void
-escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) {
+escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expression_buffer, uint8_t flags) {
    switch (peek(parser)) {
        case '\\': {
            parser->current.end++;
@ -8248,10 +8276,10 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) {
                }

                if (flags & PM_ESCAPE_FLAG_REGEXP) {
-                    pm_buffer_append_bytes(buffer, start, (size_t) (parser->current.end - start));
-                } else {
-                    escape_write_byte_encoded(parser, buffer, value);
+                    pm_buffer_append_bytes(regular_expression_buffer, start, (size_t) (parser->current.end - start));
                }
+
+                escape_write_byte_encoded(parser, buffer, value);
            } else {
                pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_HEXADECIMAL);
            }
@ -8272,10 +8300,9 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) {
                uint32_t value = escape_unicode(parser->current.end, 4);

                if (flags & PM_ESCAPE_FLAG_REGEXP) {
-                    pm_buffer_append_bytes(buffer, start, (size_t) (parser->current.end + 4 - start));
-                } else {
-                    escape_write_unicode(parser, buffer, flags, start, parser->current.end + 4, value);
+                    pm_buffer_append_bytes(regular_expression_buffer, start, (size_t) (parser->current.end + 4 - start));
                }
+                escape_write_unicode(parser, buffer, flags, start, parser->current.end + 4, value);

                parser->current.end += 4;
            } else if (peek(parser) == '{') {
@ -8306,10 +8333,8 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) {
                        extra_codepoints_start = unicode_start;
                    }

-                    if (!(flags & PM_ESCAPE_FLAG_REGEXP)) {
-                        uint32_t value = escape_unicode(unicode_start, hexadecimal_length);
-                        escape_write_unicode(parser, buffer, flags, unicode_start, parser->current.end, value);
-                    }
+                    uint32_t value = escape_unicode(unicode_start, hexadecimal_length);
+                    escape_write_unicode(parser, buffer, flags, unicode_start, parser->current.end, value);

                    parser->current.end += pm_strspn_whitespace(parser->current.end, parser->end - parser->current.end);
                }
@ -8327,7 +8352,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) {
                }

                if (flags & PM_ESCAPE_FLAG_REGEXP) {
-                    pm_buffer_append_bytes(buffer, unicode_codepoints_start, (size_t) (parser->current.end - unicode_codepoints_start));
+                    pm_buffer_append_bytes(regular_expression_buffer, unicode_codepoints_start, (size_t) (parser->current.end - unicode_codepoints_start));
                }
            } else {
                pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_UNICODE);
@ -8346,7 +8371,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) {
            switch (peeked) {
                case '?': {
                    parser->current.end++;
-                    escape_write_byte(parser, buffer, flags, escape_byte(0x7f, flags));
+                    escape_write_byte(parser, buffer, regular_expression_buffer, flags, escape_byte(0x7f, flags));
                    return;
                }
                case '\\':
@ -8355,7 +8380,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) {
                        return;
                    }
                    parser->current.end++;
-                    escape_read(parser, buffer, flags | PM_ESCAPE_FLAG_CONTROL);
+                    escape_read(parser, buffer, regular_expression_buffer, flags | PM_ESCAPE_FLAG_CONTROL);
                    return;
                default: {
                    if (!char_is_ascii_printable(peeked)) {
@ -8364,7 +8389,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) {
                    }

                    parser->current.end++;
-                    escape_write_byte(parser, buffer, flags, escape_byte(peeked, flags | PM_ESCAPE_FLAG_CONTROL));
+                    escape_write_byte(parser, buffer, regular_expression_buffer, flags, escape_byte(peeked, flags | PM_ESCAPE_FLAG_CONTROL));
                    return;
                }
            }
@ -8386,7 +8411,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) {
            switch (peeked) {
                case '?': {
                    parser->current.end++;
-                    escape_write_byte(parser, buffer, flags, escape_byte(0x7f, flags));
+                    escape_write_byte(parser, buffer, regular_expression_buffer, flags, escape_byte(0x7f, flags));
                    return;
                }
                case '\\':
@ -8395,7 +8420,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) {
                        return;
                    }
                    parser->current.end++;
-                    escape_read(parser, buffer, flags | PM_ESCAPE_FLAG_CONTROL);
+                    escape_read(parser, buffer, regular_expression_buffer, flags | PM_ESCAPE_FLAG_CONTROL);
                    return;
                default: {
                    if (!char_is_ascii_printable(peeked)) {
@ -8404,7 +8429,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) {
                    }

                    parser->current.end++;
-                    escape_write_byte(parser, buffer, flags, escape_byte(peeked, flags | PM_ESCAPE_FLAG_CONTROL));
+                    escape_write_byte(parser, buffer, regular_expression_buffer, flags, escape_byte(peeked, flags | PM_ESCAPE_FLAG_CONTROL));
                    return;
                }
            }
@ -8429,7 +8454,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) {
                    return;
                }
                parser->current.end++;
-                escape_read(parser, buffer, flags | PM_ESCAPE_FLAG_META);
+                escape_read(parser, buffer, regular_expression_buffer, flags | PM_ESCAPE_FLAG_META);
                return;
            }

@ -8439,7 +8464,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) {
            }

            parser->current.end++;
-            escape_write_byte(parser, buffer, flags, escape_byte(peeked, flags | PM_ESCAPE_FLAG_META));
+            escape_write_byte(parser, buffer, regular_expression_buffer, flags, escape_byte(peeked, flags | PM_ESCAPE_FLAG_META));
            return;
        }
        case '\r': {
@ -8510,7 +8535,7 @@ lex_question_mark(pm_parser_t *parser) {
        pm_buffer_t buffer;
        pm_buffer_init_capacity(&buffer, 3);

-        escape_read(parser, &buffer, PM_ESCAPE_FLAG_SINGLE);
+        escape_read(parser, &buffer, NULL, PM_ESCAPE_FLAG_SINGLE);
        pm_string_owned_init(&parser->current_string, (uint8_t *) buffer.value, buffer.length);

        return PM_TOKEN_CHARACTER_LITERAL;
@ -8724,7 +8749,7 @@ parser_end_of_line_p(const pm_parser_t *parser) {
 *     "foo\n"
 *
 * then the bytes in the string are "f", "o", "o", "\", "n", but we want to
- * provide out consumers with the string content "f", "o", "o", "\n". In these
+ * provide our consumers with the string content "f", "o", "o", "\n". In these
 * cases, when we find the first escape sequence, we initialize a pm_buffer_t
 * to keep track of the string content. Then in the parser, it will
 * automatically attach the string content to the node that it belongs to.
@ -8736,6 +8761,20 @@ typedef struct {
     */
    pm_buffer_t buffer;

+    /**
+     * In order to properly set a regular expression's encoding and to validate
+     * the byte sequence for the underlying encoding we must process any escape
+     * sequences. The unescaped byte sequence will be stored in `buffer` just like
+     * for other string-like types. However, we also need to store the regular
+     * expression's source string. That string may differ from what we see
+     * during lexing because some escape sequences rewrite the source.
+     *
+     * This value is initialized together with `buffer`, the first time an
+     * escape sequence is encountered. Its contents are only meaningful for
+     * regular expressions, where it holds the source string's byte sequence.
+     */
+    pm_buffer_t regular_expression_buffer;
+
    /**
     * The cursor into the source string that points to how far we have
     * currently copied into the buffer.
@ -8751,19 +8790,29 @@ pm_token_buffer_push_byte(pm_token_buffer_t *token_buffer, uint8_t byte) {
    pm_buffer_append_byte(&token_buffer->buffer, byte);
 }

+static inline void
+pm_token_buffer_push_byte_regular_expression(pm_token_buffer_t *token_buffer, uint8_t byte) {
+    pm_buffer_append_byte(&token_buffer->regular_expression_buffer, byte);
+}
+
+
 /**
 * Append the given bytes into the token buffer.
 */
 static inline void
-pm_token_buffer_push_bytes(pm_token_buffer_t *token_buffer, const uint8_t *bytes, size_t length) {
+pm_token_buffer_push_bytes(pm_token_buffer_t *token_buffer, const uint8_t *bytes, size_t length, uint8_t flags) {
    pm_buffer_append_bytes(&token_buffer->buffer, bytes, length);
+
+    if (flags & PM_ESCAPE_FLAG_REGEXP) {
+        pm_buffer_append_bytes(&token_buffer->regular_expression_buffer, bytes, length);
+    }
 }

 /**
 * Push an escaped character into the token buffer.
 */
 static inline void
-pm_token_buffer_push_escaped(pm_token_buffer_t *token_buffer, pm_parser_t *parser) {
+pm_token_buffer_push_escaped(pm_token_buffer_t *token_buffer, pm_parser_t *parser, uint8_t flags) {
    // First, determine the width of the character to be escaped.
    size_t width;
    if (parser->encoding_changed) {
@ -8777,7 +8826,7 @@ pm_token_buffer_push_escaped(pm_token_buffer_t *token_buffer, pm_parser_t *parse
    width = (width == 0 ? 1 : width);

    // Now, push the bytes into the buffer.
-    pm_token_buffer_push_bytes(token_buffer, parser->current.end, width);
+    pm_token_buffer_push_bytes(token_buffer, parser->current.end, width, flags);
    parser->current.end += width;
 }

@ -8790,6 +8839,7 @@ pm_token_buffer_push_escaped(pm_token_buffer_t *token_buffer, pm_parser_t *parse
 static inline void
 pm_token_buffer_copy(pm_parser_t *parser, pm_token_buffer_t *token_buffer) {
    pm_string_owned_init(&parser->current_string, (uint8_t *) token_buffer->buffer.value, token_buffer->buffer.length);
+    pm_string_owned_init(&parser->current_regular_expression_source, (uint8_t *) token_buffer->regular_expression_buffer.value, token_buffer->regular_expression_buffer.length);
 }

 /**
@ -8805,8 +8855,10 @@ static void
 pm_token_buffer_flush(pm_parser_t *parser, pm_token_buffer_t *token_buffer) {
    if (token_buffer->cursor == NULL) {
        pm_string_shared_init(&parser->current_string, parser->current.start, parser->current.end);
+        pm_string_shared_init(&parser->current_regular_expression_source, parser->current.start, parser->current.end);
    } else {
        pm_buffer_append_bytes(&token_buffer->buffer, token_buffer->cursor, (size_t) (parser->current.end - token_buffer->cursor));
+        pm_buffer_append_bytes(&token_buffer->regular_expression_buffer, token_buffer->cursor, (size_t) (parser->current.end - token_buffer->cursor));
        pm_token_buffer_copy(parser, token_buffer);
    }
 }
@ -8824,6 +8876,7 @@ pm_token_buffer_escape(pm_parser_t *parser, pm_token_buffer_t *token_buffer) {
    const uint8_t *start;
    if (token_buffer->cursor == NULL) {
        pm_buffer_init_capacity(&token_buffer->buffer, 16);
+        pm_buffer_init_capacity(&token_buffer->regular_expression_buffer, 16);
        start = parser->current.start;
    } else {
        start = token_buffer->cursor;
@ -8831,6 +8884,7 @@ pm_token_buffer_escape(pm_parser_t *parser, pm_token_buffer_t *token_buffer) {

    const uint8_t *end = parser->current.end - 1;
    pm_buffer_append_bytes(&token_buffer->buffer, start, (size_t) (end - start));
+    pm_buffer_append_bytes(&token_buffer->regular_expression_buffer, start, (size_t) (end - start));

    token_buffer->cursor = end;
 }
@ -10143,7 +10197,7 @@ parser_lex(pm_parser_t *parser) {

            // If we haven't found an escape yet, then this buffer will be
            // unallocated since we can refer directly to the source string.
-            pm_token_buffer_t token_buffer = { { 0 }, 0 };
+            pm_token_buffer_t token_buffer = { { 0 }, { 0 }, 0 };

            while (breakpoint != NULL) {
                // If we hit a null byte, skip directly past it.
@ -10242,10 +10296,10 @@ parser_lex(pm_parser_t *parser) {
                                pm_token_buffer_push_byte(&token_buffer, peeked);
                                parser->current.end++;
                            } else if (lex_mode->as.list.interpolation) {
-                                escape_read(parser, &token_buffer.buffer, PM_ESCAPE_FLAG_NONE);
+                                escape_read(parser, &token_buffer.buffer, NULL, PM_ESCAPE_FLAG_NONE);
                            } else {
                                pm_token_buffer_push_byte(&token_buffer, '\\');
-                                pm_token_buffer_push_escaped(&token_buffer, parser);
+                                pm_token_buffer_push_escaped(&token_buffer, parser, PM_ESCAPE_FLAG_NONE);
                            }

                            break;
@ -10320,7 +10374,7 @@ parser_lex(pm_parser_t *parser) {
            // characters.
            const uint8_t *breakpoints = lex_mode->as.regexp.breakpoints;
            const uint8_t *breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false);
-            pm_token_buffer_t token_buffer = { { 0 }, 0 };
+            pm_token_buffer_t token_buffer = { { 0 },  { 0 }, 0 };

            while (breakpoint != NULL) {
                // If we hit a null byte, skip directly past it.
@ -10403,9 +10457,10 @@ parser_lex(pm_parser_t *parser) {
                            parser->current.end++;
                            if (peek(parser) != '\n') {
                                if (lex_mode->as.regexp.terminator != '\r') {
-                                    pm_token_buffer_push_byte(&token_buffer, '\\');
+                                    pm_token_buffer_push_byte_regular_expression(&token_buffer, '\\');
                                }
                                pm_token_buffer_push_byte(&token_buffer, '\r');
+                                pm_token_buffer_push_byte_regular_expression(&token_buffer, '\r');
                                break;
                            }
                        /* fallthrough */
@ -10429,7 +10484,7 @@ parser_lex(pm_parser_t *parser) {
                        case 'M':
                        case 'u':
                        case 'x':
-                            escape_read(parser, &token_buffer.buffer, PM_ESCAPE_FLAG_REGEXP);
+                            escape_read(parser, &token_buffer.buffer, &token_buffer.regular_expression_buffer, PM_ESCAPE_FLAG_REGEXP);
                            break;
                        default:
                            if (lex_mode->as.regexp.terminator == peeked) {
@ -10440,19 +10495,20 @@ parser_lex(pm_parser_t *parser) {
                                    case '$': case ')': case '*': case '+':
                                    case '.': case '>': case '?': case ']':
                                    case '^': case '|': case '}':
-                                        pm_token_buffer_push_byte(&token_buffer, '\\');
+                                        pm_token_buffer_push_byte_regular_expression(&token_buffer, '\\');
                                        break;
                                    default:
                                        break;
                                }

                                pm_token_buffer_push_byte(&token_buffer, peeked);
+                                pm_token_buffer_push_byte_regular_expression(&token_buffer, peeked);
                                parser->current.end++;
                                break;
                            }

-                            if (peeked < 0x80) pm_token_buffer_push_byte(&token_buffer, '\\');
-                            pm_token_buffer_push_escaped(&token_buffer, parser);
+                            if (peeked < 0x80) pm_token_buffer_push_byte_regular_expression(&token_buffer, '\\');
+                            pm_token_buffer_push_escaped(&token_buffer, parser, PM_ESCAPE_FLAG_REGEXP);
                            break;
                    }

@ -10525,7 +10581,7 @@ parser_lex(pm_parser_t *parser) {

            // If we haven't found an escape yet, then this buffer will be
            // unallocated since we can refer directly to the source string.
-            pm_token_buffer_t token_buffer = { { 0 }, 0 };
+            pm_token_buffer_t token_buffer = { { 0 }, { 0 }, 0 };

            while (breakpoint != NULL) {
                // If we hit the incrementor, then we'll increment then nesting and
@ -10660,10 +10716,10 @@ parser_lex(pm_parser_t *parser) {
                                    pm_token_buffer_push_byte(&token_buffer, peeked);
                                    parser->current.end++;
                                } else if (lex_mode->as.string.interpolation) {
-                                    escape_read(parser, &token_buffer.buffer, PM_ESCAPE_FLAG_NONE);
+                                    escape_read(parser, &token_buffer.buffer, NULL, PM_ESCAPE_FLAG_NONE);
                                } else {
                                    pm_token_buffer_push_byte(&token_buffer, '\\');
-                                    pm_token_buffer_push_escaped(&token_buffer, parser);
+                                    pm_token_buffer_push_escaped(&token_buffer, parser, PM_ESCAPE_FLAG_NONE);
                                }

                                break;
@ -10813,7 +10869,7 @@ parser_lex(pm_parser_t *parser) {
            }

            const uint8_t *breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
-            pm_token_buffer_t token_buffer = { { 0 }, 0 };
+            pm_token_buffer_t token_buffer = { { 0 }, { 0 }, 0 };
            bool was_line_continuation = false;

            while (breakpoint != NULL) {
@ -10935,7 +10991,7 @@ parser_lex(pm_parser_t *parser) {
                                    continue;
                                default:
                                    pm_token_buffer_push_byte(&token_buffer, '\\');
-                                    pm_token_buffer_push_escaped(&token_buffer, parser);
+                                    pm_token_buffer_push_escaped(&token_buffer, parser, PM_ESCAPE_FLAG_NONE);
                                    break;
                            }
                        } else {
@ -10972,7 +11028,7 @@ parser_lex(pm_parser_t *parser) {
                                    breakpoint = parser->current.end;
                                    continue;
                                default:
-                                    escape_read(parser, &token_buffer.buffer, PM_ESCAPE_FLAG_NONE);
+                                    escape_read(parser, &token_buffer.buffer, NULL, PM_ESCAPE_FLAG_NONE);
                                    break;
                            }
                        }
@ -16948,7 +17004,11 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
                };

                parser_lex(parser);
-                return (pm_node_t *) pm_regular_expression_node_create(parser, &opening, &content, &parser->previous);
+
+                pm_node_t *regular_expression_node = (pm_node_t *) (pm_node_t *) pm_regular_expression_node_create(parser, &opening, &content, &parser->previous);
+                pm_node_flag_set(regular_expression_node, PM_REGULAR_EXPRESSION_FLAGS_FORCED_US_ASCII_ENCODING);
+
+                return regular_expression_node;
            }

            pm_interpolated_regular_expression_node_t *node;
@ -16959,6 +17019,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
                // following token is the end (in which case we can return a plain
                // regular expression) or if it's not then it has interpolation.
                pm_string_t unescaped = parser->current_string;
+                pm_string_t source = parser->current_regular_expression_source;
                pm_token_t content = parser->current;
                parser_lex(parser);

@ -16966,7 +17027,9 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
                // without interpolation, which can be represented more succinctly and
                // more easily compiled.
                if (accept1(parser, PM_TOKEN_REGEXP_END)) {
-                    return (pm_node_t *) pm_regular_expression_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped);
+                    pm_node_t *regular_expression_node = (pm_node_t *) pm_regular_expression_node_create_unescaped(parser, &opening, &content, &parser->previous, &source);
+                    pm_node_flag_set(regular_expression_node, parse_regular_expression_encoding(parser, &unescaped));
+                    return regular_expression_node;
                }

                // If we get here, then we have interpolation so we'll need to create
@ -18527,6 +18590,7 @@ pm_parser_init(pm_parser_t *parser, const uint8_t *source, size_t size, const pm
        .newline_list = { 0 },
        .integer_base = 0,
        .current_string = PM_STRING_EMPTY,
+        .current_regular_expression_source = PM_STRING_EMPTY,
        .start_line = 1,
        .explicit_encoding = NULL,
        .command_line = 0,
--- a/test/prism/encoding_test.rb
+++ b/test/prism/encoding_test.rb
@ -149,6 +149,7 @@ module Prism
    escapes = ["\\x00", "\\x7F", "\\x80", "\\xFF", "\\u{00}", "\\u{7F}", "\\u{80}", "\\M-\\C-?"]
    escapes = escapes.concat(escapes.product(escapes).map(&:join))
    symbols = [:a, :ą, :+]
+    regexps = [/a/, /ą/, //]

    encodings.each_key do |encoding|
      define_method(:"test_encoding_flags_#{encoding.name}") do
@ -168,6 +169,18 @@ module Prism
      end
    end

+    encodings.each_key do |encoding|
+      define_method(:"test_regular_expression_encoding_flags_#{encoding.name}") do
+        assert_regular_expression_encoding_flags(encoding, regexps.map(&:inspect))
+      end
+    end
+
+    encodings.each_key do |encoding|
+      define_method(:"test_regular_expression_escape_encoding_flags_#{encoding.name}") do
+        assert_regular_expression_encoding_flags(encoding, escapes.map { |e| "/#{e}/" })
+      end
+    end
+
    def test_coding
      result = Prism.parse("# coding: utf-8\n'string'")
      actual = result.value.statements.body.first.unescaped.encoding
@ -454,5 +467,50 @@ module Prism
        assert_equal expected, actual
      end
    end
+
+    def assert_regular_expression_encoding_flags(encoding, regexps)
+      regexps.each do |regexp|
+        source = "# encoding: #{encoding.name}\n#{regexp}"
+
+        expected =
+          begin
+            eval(source).encoding
+          rescue SyntaxError => error
+            if error.message.include?("UTF-8 character in non UTF-8 regexp") || error.message.include?("escaped non ASCII character in UTF-8 regexp")
+              error.message[/: (.+?)\n/, 1]
+            elsif error.message.include?("invalid multibyte char")
+              # TODO (nirvdrum 26-Jan-2024): Bail out early of the rest of the test due to https://github.com/ruby/prism/issues/2104.
+              next
+            else
+              raise
+            end
+          end
+
+        actual =
+          Prism.parse(source).then do |result|
+            if result.success?
+              regexp = result.value.statements.body.first
+
+              if regexp.forced_utf8_encoding?
+                Encoding::UTF_8
+              elsif regexp.forced_binary_encoding?
+                Encoding::ASCII_8BIT
+              elsif regexp.forced_us_ascii_encoding?
+                Encoding::US_ASCII
+              else
+                encoding
+              end
+            else
+              error = result.errors.last
+
+              unless error.message.include?("UTF-8 mixed within")
+                raise error.message
+              end
+            end
+          end
+
+        assert_equal expected, actual
+      end
+    end
  end
 end
--- a/test/prism/snapshots/heredoc_with_escaped_newline_at_start.txt
+++ b/test/prism/snapshots/heredoc_with_escaped_newline_at_start.txt
@ -21,7 +21,7 @@
        │   │   ├── flags: ∅
        │   │   └── arguments: (length: 2)
        │   │       ├── @ RegularExpressionNode (location: (1,15)-(1,21))
-        │   │       │   ├── flags: ∅
+        │   │       │   ├── flags: forced_us_ascii_encoding
        │   │       │   ├── opening_loc: (1,15)-(1,16) = "/"
        │   │       │   ├── content_loc: (1,16)-(1,20) = "^\\s{"
        │   │       │   ├── closing_loc: (1,20)-(1,21) = "/"
@ -52,7 +52,7 @@
            │   ├── flags: ∅
            │   └── arguments: (length: 2)
            │       ├── @ RegularExpressionNode (location: (5,15)-(5,21))
-            │       │   ├── flags: ∅
+            │       │   ├── flags: forced_us_ascii_encoding
            │       │   ├── opening_loc: (5,15)-(5,16) = "/"
            │       │   ├── content_loc: (5,16)-(5,20) = "^\\s{"
            │       │   ├── closing_loc: (5,20)-(5,21) = "/"
--- a/test/prism/snapshots/newline_terminated.txt
+++ b/test/prism/snapshots/newline_terminated.txt
@ -100,7 +100,7 @@
        │   ├── closing_loc: (37,3)-(38,0) = "\n"
        │   └── unescaped: "foo"
        └── @ RegularExpressionNode (location: (39,0)-(41,0))
-            ├── flags: ∅
+            ├── flags: forced_us_ascii_encoding
            ├── opening_loc: (39,0)-(40,0) = "%r\n"
            ├── content_loc: (40,0)-(40,3) = "foo"
            ├── closing_loc: (40,3)-(41,0) = "\n"
--- a/test/prism/snapshots/patterns.txt
+++ b/test/prism/snapshots/patterns.txt
@ -165,7 +165,7 @@
        │   │   └── block: ∅
        │   ├── pattern:
        │   │   @ RegularExpressionNode (location: (9,7)-(9,12))
-        │   │   ├── flags: ∅
+        │   │   ├── flags: forced_us_ascii_encoding
        │   │   ├── opening_loc: (9,7)-(9,8) = "/"
        │   │   ├── content_loc: (9,8)-(9,11) = "foo"
        │   │   ├── closing_loc: (9,11)-(9,12) = "/"
@ -719,14 +719,14 @@
        │   │   ├── flags: ∅
        │   │   ├── left:
        │   │   │   @ RegularExpressionNode (location: (35,7)-(35,12))
-        │   │   │   ├── flags: ∅
+        │   │   │   ├── flags: forced_us_ascii_encoding
        │   │   │   ├── opening_loc: (35,7)-(35,8) = "/"
        │   │   │   ├── content_loc: (35,8)-(35,11) = "foo"
        │   │   │   ├── closing_loc: (35,11)-(35,12) = "/"
        │   │   │   └── unescaped: "foo"
        │   │   ├── right:
        │   │   │   @ RegularExpressionNode (location: (35,16)-(35,21))
-        │   │   │   ├── flags: ∅
+        │   │   │   ├── flags: forced_us_ascii_encoding
        │   │   │   ├── opening_loc: (35,16)-(35,17) = "/"
        │   │   │   ├── content_loc: (35,17)-(35,20) = "foo"
        │   │   │   ├── closing_loc: (35,20)-(35,21) = "/"
@ -2543,7 +2543,7 @@
        │   │   └── block: ∅
        │   ├── pattern:
        │   │   @ RegularExpressionNode (location: (112,7)-(112,12))
-        │   │   ├── flags: ∅
+        │   │   ├── flags: forced_us_ascii_encoding
        │   │   ├── opening_loc: (112,7)-(112,8) = "/"
        │   │   ├── content_loc: (112,8)-(112,11) = "foo"
        │   │   ├── closing_loc: (112,11)-(112,12) = "/"
@ -3126,7 +3126,7 @@
        │   │   └── @ InNode (location: (143,10)-(143,23))
        │   │       ├── pattern:
        │   │       │   @ RegularExpressionNode (location: (143,13)-(143,18))
-        │   │       │   ├── flags: ∅
+        │   │       │   ├── flags: forced_us_ascii_encoding
        │   │       │   ├── opening_loc: (143,13)-(143,14) = "/"
        │   │       │   ├── content_loc: (143,14)-(143,17) = "foo"
        │   │       │   ├── closing_loc: (143,17)-(143,18) = "/"
@ -3914,7 +3914,7 @@
        │   │       │   │   @ StatementsNode (location: (170,13)-(170,18))
        │   │       │   │   └── body: (length: 1)
        │   │       │   │       └── @ RegularExpressionNode (location: (170,13)-(170,18))
-        │   │       │   │           ├── flags: ∅
+        │   │       │   │           ├── flags: forced_us_ascii_encoding
        │   │       │   │           ├── opening_loc: (170,13)-(170,14) = "/"
        │   │       │   │           ├── content_loc: (170,14)-(170,17) = "foo"
        │   │       │   │           ├── closing_loc: (170,17)-(170,18) = "/"
--- a/test/prism/snapshots/regex.txt
+++ b/test/prism/snapshots/regex.txt
@ -15,7 +15,7 @@
        │   │   ├── flags: ∅
        │   │   └── arguments: (length: 1)
        │   │       └── @ RegularExpressionNode (location: (1,4)-(1,9))
-        │   │           ├── flags: ∅
+        │   │           ├── flags: forced_us_ascii_encoding
        │   │           ├── opening_loc: (1,4)-(1,5) = "/"
        │   │           ├── content_loc: (1,5)-(1,8) = "bar"
        │   │           ├── closing_loc: (1,8)-(1,9) = "/"
@ -23,13 +23,13 @@
        │   ├── closing_loc: ∅
        │   └── block: ∅
        ├── @ RegularExpressionNode (location: (3,0)-(3,8))
-        │   ├── flags: ignore_case
+        │   ├── flags: ignore_case, forced_us_ascii_encoding
        │   ├── opening_loc: (3,0)-(3,3) = "%r{"
        │   ├── content_loc: (3,3)-(3,6) = "abc"
        │   ├── closing_loc: (3,6)-(3,8) = "}i"
        │   └── unescaped: "abc"
        ├── @ RegularExpressionNode (location: (5,0)-(5,5))
-        │   ├── flags: ∅
+        │   ├── flags: forced_us_ascii_encoding
        │   ├── opening_loc: (5,0)-(5,1) = "/"
        │   ├── content_loc: (5,1)-(5,4) = "a\\b"
        │   ├── closing_loc: (5,4)-(5,5) = "/"
@ -92,7 +92,7 @@
        │   │   │   │   ├── flags: ∅
        │   │   │   │   ├── receiver:
        │   │   │   │   │   @ RegularExpressionNode (location: (11,1)-(11,14))
-        │   │   │   │   │   ├── flags: ∅
+        │   │   │   │   │   ├── flags: forced_us_ascii_encoding
        │   │   │   │   │   ├── opening_loc: (11,1)-(11,2) = "/"
        │   │   │   │   │   ├── content_loc: (11,2)-(11,13) = "(?<foo>bar)"
        │   │   │   │   │   ├── closing_loc: (11,13)-(11,14) = "/"
@ -127,31 +127,31 @@
        │   ├── opening_loc: (11,0)-(11,1) = "["
        │   └── closing_loc: (11,26)-(11,27) = "]"
        ├── @ RegularExpressionNode (location: (13,0)-(13,6))
-        │   ├── flags: ignore_case
+        │   ├── flags: ignore_case, forced_us_ascii_encoding
        │   ├── opening_loc: (13,0)-(13,1) = "/"
        │   ├── content_loc: (13,1)-(13,4) = "abc"
        │   ├── closing_loc: (13,4)-(13,6) = "/i"
        │   └── unescaped: "abc"
        ├── @ RegularExpressionNode (location: (15,0)-(15,26))
-        │   ├── flags: ignore_case
+        │   ├── flags: ignore_case, forced_us_ascii_encoding
        │   ├── opening_loc: (15,0)-(15,3) = "%r/"
        │   ├── content_loc: (15,3)-(15,24) = "[a-z$._?][\\w$.?\#@~]*:"
        │   ├── closing_loc: (15,24)-(15,26) = "/i"
        │   └── unescaped: "[a-z$._?][\\w$.?\#@~]*:"
        ├── @ RegularExpressionNode (location: (17,0)-(17,37))
-        │   ├── flags: ignore_case
+        │   ├── flags: ignore_case, forced_us_ascii_encoding
        │   ├── opening_loc: (17,0)-(17,3) = "%r/"
        │   ├── content_loc: (17,3)-(17,35) = "([a-z$._?][\\w$.?\#@~]*)(\\s+)(equ)"
        │   ├── closing_loc: (17,35)-(17,37) = "/i"
        │   └── unescaped: "([a-z$._?][\\w$.?\#@~]*)(\\s+)(equ)"
        ├── @ RegularExpressionNode (location: (19,0)-(19,25))
-        │   ├── flags: ignore_case
+        │   ├── flags: ignore_case, forced_us_ascii_encoding
        │   ├── opening_loc: (19,0)-(19,3) = "%r/"
        │   ├── content_loc: (19,3)-(19,23) = "[a-z$._?][\\w$.?\#@~]*"
        │   ├── closing_loc: (19,23)-(19,25) = "/i"
        │   └── unescaped: "[a-z$._?][\\w$.?\#@~]*"
        ├── @ RegularExpressionNode (location: (21,0)-(24,1))
-        │   ├── flags: ∅
+        │   ├── flags: forced_us_ascii_encoding
        │   ├── opening_loc: (21,0)-(21,3) = "%r("
        │   ├── content_loc: (21,3)-(24,0) = "\n(?:[\\w\#$%_']|\\(\\)|\\(,\\)|\\[\\]|[0-9])*\n  (?:[\\w\#$%_']+)\n"
        │   ├── closing_loc: (24,0)-(24,1) = ")"
@ -160,7 +160,7 @@
        │   ├── flags: ∅
        │   ├── receiver:
        │   │   @ RegularExpressionNode (location: (26,0)-(26,8))
-        │   │   ├── flags: ∅
+        │   │   ├── flags: forced_us_ascii_encoding
        │   │   ├── opening_loc: (26,0)-(26,1) = "/"
        │   │   ├── content_loc: (26,1)-(26,7) = "(?#\\))"
        │   │   ├── closing_loc: (26,7)-(26,8) = "/"
@ -182,7 +182,7 @@
        │   ├── closing_loc: ∅
        │   └── block: ∅
        ├── @ RegularExpressionNode (location: (28,0)-(28,9))
-        │   ├── flags: ∅
+        │   ├── flags: forced_us_ascii_encoding
        │   ├── opening_loc: (28,0)-(28,3) = "%r#"
        │   ├── content_loc: (28,3)-(28,8) = "pound"
        │   ├── closing_loc: (28,8)-(28,9) = "#"
@ -220,7 +220,7 @@
        │   │   ├── flags: ∅
        │   │   ├── receiver:
        │   │   │   @ RegularExpressionNode (location: (32,0)-(33,4))
-        │   │   │   ├── flags: ∅
+        │   │   │   ├── flags: forced_us_ascii_encoding
        │   │   │   ├── opening_loc: (32,0)-(32,1) = "/"
        │   │   │   ├── content_loc: (32,1)-(33,3) = "(?<a\\\nb>)"
        │   │   │   ├── closing_loc: (33,3)-(33,4) = "/"
@ -254,7 +254,7 @@
        │   │   ├── flags: ∅
        │   │   ├── receiver:
        │   │   │   @ RegularExpressionNode (location: (35,0)-(35,18))
-        │   │   │   ├── flags: ∅
+        │   │   │   ├── flags: forced_us_ascii_encoding
        │   │   │   ├── opening_loc: (35,0)-(35,1) = "/"
        │   │   │   ├── content_loc: (35,1)-(35,17) = "(?<abc>)(?<abc>)"
        │   │   │   ├── closing_loc: (35,17)-(35,18) = "/"
@ -286,7 +286,7 @@
        │   ├── flags: ∅
        │   ├── receiver:
        │   │   @ RegularExpressionNode (location: (37,0)-(37,10))
-        │   │   ├── flags: ∅
+        │   │   ├── flags: forced_us_ascii_encoding
        │   │   ├── opening_loc: (37,0)-(37,1) = "/"
        │   │   ├── content_loc: (37,1)-(37,9) = "(?<a b>)"
        │   │   ├── closing_loc: (37,9)-(37,10) = "/"
@ -338,7 +338,7 @@
                │           │   ├── flags: ∅
                │           │   ├── receiver:
                │           │   │   @ RegularExpressionNode (location: (40,6)-(40,14))
-                │           │   │   ├── flags: ∅
+                │           │   │   ├── flags: forced_us_ascii_encoding
                │           │   │   ├── opening_loc: (40,6)-(40,7) = "/"
                │           │   │   ├── content_loc: (40,7)-(40,13) = "(?<a>)"
                │           │   │   ├── closing_loc: (40,13)-(40,14) = "/"
--- a/test/prism/snapshots/seattlerb/TestRubyParserShared.txt
+++ b/test/prism/snapshots/seattlerb/TestRubyParserShared.txt
@ -70,7 +70,7 @@
        │   ├── opening_loc: (26,0)-(26,3) = "%i["
        │   └── closing_loc: (29,0)-(29,1) = "]"
        ├── @ RegularExpressionNode (location: (31,0)-(34,1))
-        │   ├── flags: ∅
+        │   ├── flags: forced_us_ascii_encoding
        │   ├── opening_loc: (31,0)-(31,3) = "%r["
        │   ├── content_loc: (31,3)-(34,0) = "\n\n\n"
        │   ├── closing_loc: (34,0)-(34,1) = "]"
--- a/test/prism/snapshots/seattlerb/bug190.txt
+++ b/test/prism/snapshots/seattlerb/bug190.txt
@ -4,7 +4,7 @@
    @ StatementsNode (location: (1,0)-(1,6))
    └── body: (length: 1)
        └── @ RegularExpressionNode (location: (1,0)-(1,6))
-            ├── flags: ∅
+            ├── flags: forced_us_ascii_encoding
            ├── opening_loc: (1,0)-(1,3) = "%r'"
            ├── content_loc: (1,3)-(1,5) = "\\'"
            ├── closing_loc: (1,5)-(1,6) = "'"
--- a/test/prism/snapshots/seattlerb/bug_case_when_regexp.txt
+++ b/test/prism/snapshots/seattlerb/bug_case_when_regexp.txt
@ -16,7 +16,7 @@
            │       ├── keyword_loc: (1,9)-(1,13) = "when"
            │       ├── conditions: (length: 1)
            │       │   └── @ RegularExpressionNode (location: (1,14)-(1,17))
-            │       │       ├── flags: ∅
+            │       │       ├── flags: forced_us_ascii_encoding
            │       │       ├── opening_loc: (1,14)-(1,15) = "/"
            │       │       ├── content_loc: (1,15)-(1,16) = "x"
            │       │       ├── closing_loc: (1,16)-(1,17) = "/"
--- a/test/prism/snapshots/seattlerb/bug_cond_pct.txt
+++ b/test/prism/snapshots/seattlerb/bug_cond_pct.txt
@ -10,7 +10,7 @@
            │       ├── keyword_loc: (1,6)-(1,10) = "when"
            │       ├── conditions: (length: 1)
            │       │   └── @ RegularExpressionNode (location: (1,11)-(1,23))
-            │       │       ├── flags: ∅
+            │       │       ├── flags: forced_us_ascii_encoding
            │       │       ├── opening_loc: (1,11)-(1,14) = "%r%"
            │       │       ├── content_loc: (1,14)-(1,22) = "blahblah"
            │       │       ├── closing_loc: (1,22)-(1,23) = "%"
--- a/test/prism/snapshots/seattlerb/case_in.txt
+++ b/test/prism/snapshots/seattlerb/case_in.txt
@ -338,7 +338,7 @@
        │   │   └── @ InNode (location: (46,0)-(46,11))
        │   │       ├── pattern:
        │   │       │   @ RegularExpressionNode (location: (46,3)-(46,11))
-        │   │       │   ├── flags: ∅
+        │   │       │   ├── flags: forced_us_ascii_encoding
        │   │       │   ├── opening_loc: (46,3)-(46,4) = "/"
        │   │       │   ├── content_loc: (46,4)-(46,10) = "regexp"
        │   │       │   ├── closing_loc: (46,10)-(46,11) = "/"
--- a/test/prism/snapshots/seattlerb/regexp.txt
+++ b/test/prism/snapshots/seattlerb/regexp.txt
@ -4,31 +4,31 @@
    @ StatementsNode (location: (1,0)-(9,13))
    └── body: (length: 5)
        ├── @ RegularExpressionNode (location: (1,0)-(1,5))
-        │   ├── flags: ∅
+        │   ├── flags: forced_us_ascii_encoding
        │   ├── opening_loc: (1,0)-(1,1) = "/"
        │   ├── content_loc: (1,1)-(1,4) = "wtf"
        │   ├── closing_loc: (1,4)-(1,5) = "/"
        │   └── unescaped: "wtf"
        ├── @ RegularExpressionNode (location: (3,0)-(3,6))
-        │   ├── flags: multi_line
+        │   ├── flags: multi_line, forced_us_ascii_encoding
        │   ├── opening_loc: (3,0)-(3,1) = "/"
        │   ├── content_loc: (3,1)-(3,4) = "wtf"
        │   ├── closing_loc: (3,4)-(3,6) = "/m"
        │   └── unescaped: "wtf"
        ├── @ RegularExpressionNode (location: (5,0)-(5,6))
-        │   ├── flags: ascii_8bit
+        │   ├── flags: ascii_8bit, forced_us_ascii_encoding
        │   ├── opening_loc: (5,0)-(5,1) = "/"
        │   ├── content_loc: (5,1)-(5,4) = "wtf"
        │   ├── closing_loc: (5,4)-(5,6) = "/n"
        │   └── unescaped: "wtf"
        ├── @ RegularExpressionNode (location: (7,0)-(7,7))
-        │   ├── flags: multi_line, ascii_8bit
+        │   ├── flags: multi_line, ascii_8bit, forced_us_ascii_encoding
        │   ├── opening_loc: (7,0)-(7,1) = "/"
        │   ├── content_loc: (7,1)-(7,4) = "wtf"
        │   ├── closing_loc: (7,4)-(7,7) = "/nm"
        │   └── unescaped: "wtf"
        └── @ RegularExpressionNode (location: (9,0)-(9,13))
-            ├── flags: multi_line, ascii_8bit
+            ├── flags: multi_line, ascii_8bit, forced_us_ascii_encoding
            ├── opening_loc: (9,0)-(9,1) = "/"
            ├── content_loc: (9,1)-(9,4) = "wtf"
            ├── closing_loc: (9,4)-(9,13) = "/nmnmnmnm"
--- a/test/prism/snapshots/seattlerb/regexp_esc_C_slash.txt
+++ b/test/prism/snapshots/seattlerb/regexp_esc_C_slash.txt
@ -4,7 +4,7 @@
    @ StatementsNode (location: (1,0)-(1,7))
    └── body: (length: 1)
        └── @ RegularExpressionNode (location: (1,0)-(1,7))
-            ├── flags: ∅
+            ├── flags: forced_us_ascii_encoding
            ├── opening_loc: (1,0)-(1,1) = "/"
            ├── content_loc: (1,1)-(1,6) = "\\cC\\d"
            ├── closing_loc: (1,6)-(1,7) = "/"
--- a/test/prism/snapshots/seattlerb/regexp_esc_u.txt
+++ b/test/prism/snapshots/seattlerb/regexp_esc_u.txt
@ -4,7 +4,7 @@
    @ StatementsNode (location: (1,0)-(1,17))
    └── body: (length: 1)
        └── @ RegularExpressionNode (location: (1,0)-(1,17))
-            ├── flags: ∅
+            ├── flags: forced_us_ascii_encoding
            ├── opening_loc: (1,0)-(1,1) = "/"
            ├── content_loc: (1,1)-(1,16) = "[\\u0021-\\u0027]"
            ├── closing_loc: (1,16)-(1,17) = "/"
--- a/test/prism/snapshots/seattlerb/regexp_unicode_curlies.txt
+++ b/test/prism/snapshots/seattlerb/regexp_unicode_curlies.txt
@ -4,13 +4,13 @@
    @ StatementsNode (location: (1,0)-(3,8))
    └── body: (length: 2)
        ├── @ RegularExpressionNode (location: (1,0)-(1,15))
-        │   ├── flags: ∅
+        │   ├── flags: forced_utf8_encoding
        │   ├── opening_loc: (1,0)-(1,1) = "/"
        │   ├── content_loc: (1,1)-(1,14) = "\\u{c0de babe}"
        │   ├── closing_loc: (1,14)-(1,15) = "/"
        │   └── unescaped: "\\u{c0de babe}"
        └── @ RegularExpressionNode (location: (3,0)-(3,8))
-            ├── flags: ∅
+            ├── flags: forced_utf8_encoding
            ├── opening_loc: (3,0)-(3,1) = "/"
            ├── content_loc: (3,1)-(3,7) = "\\u{df}"
            ├── closing_loc: (3,7)-(3,8) = "/"
--- a/test/prism/snapshots/spanning_heredoc_newlines.txt
+++ b/test/prism/snapshots/spanning_heredoc_newlines.txt
@ -46,7 +46,7 @@
        │   │   ├── flags: ∅
        │   │   └── arguments: (length: 1)
        │   │       └── @ RegularExpressionNode (location: (5,4)-(8,0))
-        │   │           ├── flags: ∅
+        │   │           ├── flags: forced_us_ascii_encoding
        │   │           ├── opening_loc: (5,4)-(6,0) = "%r\n"
        │   │           ├── content_loc: (6,0)-(6,0) = ""
        │   │           ├── closing_loc: (7,0)-(8,0) = "\n"
--- a/test/prism/snapshots/unescaping.txt
+++ b/test/prism/snapshots/unescaping.txt
@ -15,7 +15,7 @@
        │   ├── opening_loc: (1,0)-(1,1) = "["
        │   └── closing_loc: (1,9)-(1,10) = "]"
        ├── @ RegularExpressionNode (location: (3,0)-(3,8))
-        │   ├── flags: ∅
+        │   ├── flags: forced_us_ascii_encoding
        │   ├── opening_loc: (3,0)-(3,1) = "/"
        │   ├── content_loc: (3,1)-(3,7) = "\\c\#{1}"
        │   ├── closing_loc: (3,7)-(3,8) = "/"
--- a/test/prism/snapshots/unparser/corpus/literal/if.txt
+++ b/test/prism/snapshots/unparser/corpus/literal/if.txt
@ -7,7 +7,7 @@
        │   ├── if_keyword_loc: (1,0)-(1,2) = "if"
        │   ├── predicate:
        │   │   @ MatchLastLineNode (location: (1,3)-(1,8))
-        │   │   ├── flags: ∅
+        │   │   ├── flags: forced_us_ascii_encoding
        │   │   ├── opening_loc: (1,3)-(1,4) = "/"
        │   │   ├── content_loc: (1,4)-(1,7) = "foo"
        │   │   ├── closing_loc: (1,7)-(1,8) = "/"
--- a/test/prism/snapshots/unparser/corpus/literal/literal.txt
+++ b/test/prism/snapshots/unparser/corpus/literal/literal.txt
@ -566,13 +566,13 @@
        │   ├── closing_loc: (48,2)-(48,3) = "\""
        │   └── unescaped: ""
        ├── @ RegularExpressionNode (location: (49,0)-(49,5))
-        │   ├── flags: ∅
+        │   ├── flags: forced_us_ascii_encoding
        │   ├── opening_loc: (49,0)-(49,1) = "/"
        │   ├── content_loc: (49,1)-(49,4) = "foo"
        │   ├── closing_loc: (49,4)-(49,5) = "/"
        │   └── unescaped: "foo"
        ├── @ RegularExpressionNode (location: (50,0)-(50,28))
-        │   ├── flags: ∅
+        │   ├── flags: forced_us_ascii_encoding
        │   ├── opening_loc: (50,0)-(50,1) = "/"
        │   ├── content_loc: (50,1)-(50,27) = "[^-+',.\\/:@[:alnum:]\\[\\]]+"
        │   ├── closing_loc: (50,27)-(50,28) = "/"
@ -633,25 +633,25 @@
        │   │       └── closing_loc: (53,11)-(53,12) = "}"
        │   └── closing_loc: (53,12)-(53,13) = "/"
        ├── @ RegularExpressionNode (location: (54,0)-(54,4))
-        │   ├── flags: ∅
+        │   ├── flags: forced_us_ascii_encoding
        │   ├── opening_loc: (54,0)-(54,1) = "/"
        │   ├── content_loc: (54,1)-(54,3) = "\\n"
        │   ├── closing_loc: (54,3)-(54,4) = "/"
        │   └── unescaped: "\\n"
        ├── @ RegularExpressionNode (location: (55,0)-(55,4))
-        │   ├── flags: ∅
+        │   ├── flags: forced_us_ascii_encoding
        │   ├── opening_loc: (55,0)-(55,1) = "/"
        │   ├── content_loc: (55,1)-(55,3) = "\\n"
        │   ├── closing_loc: (55,3)-(55,4) = "/"
        │   └── unescaped: "\\n"
        ├── @ RegularExpressionNode (location: (56,0)-(56,5))
-        │   ├── flags: extended
+        │   ├── flags: extended, forced_us_ascii_encoding
        │   ├── opening_loc: (56,0)-(56,1) = "/"
        │   ├── content_loc: (56,1)-(56,3) = "\\n"
        │   ├── closing_loc: (56,3)-(56,5) = "/x"
        │   └── unescaped: "\\n"
        ├── @ RegularExpressionNode (location: (57,0)-(57,7))
-        │   ├── flags: extended
+        │   ├── flags: extended, forced_us_ascii_encoding
        │   ├── opening_loc: (57,0)-(57,1) = "/"
        │   ├── content_loc: (57,1)-(57,5) = "\\/\\/"
        │   ├── closing_loc: (57,5)-(57,7) = "/x"
--- a/test/prism/snapshots/unparser/corpus/literal/send.txt
+++ b/test/prism/snapshots/unparser/corpus/literal/send.txt
@ -425,7 +425,7 @@
        │   │   │           ├── flags: ∅
        │   │   │           ├── receiver:
        │   │   │           │   @ RegularExpressionNode (location: (37,1)-(37,6))
-        │   │   │           │   ├── flags: ∅
+        │   │   │           │   ├── flags: forced_us_ascii_encoding
        │   │   │           │   ├── opening_loc: (37,1)-(37,2) = "/"
        │   │   │           │   ├── content_loc: (37,2)-(37,5) = "bar"
        │   │   │           │   ├── closing_loc: (37,5)-(37,6) = "/"
@ -511,7 +511,7 @@
        │   │   │           │   ├── flags: ∅
        │   │   │           │   └── arguments: (length: 1)
        │   │   │           │       └── @ RegularExpressionNode (location: (39,8)-(39,13))
-        │   │   │           │           ├── flags: ∅
+        │   │   │           │           ├── flags: forced_us_ascii_encoding
        │   │   │           │           ├── opening_loc: (39,8)-(39,9) = "/"
        │   │   │           │           ├── content_loc: (39,9)-(39,12) = "bar"
        │   │   │           │           ├── closing_loc: (39,12)-(39,13) = "/"
@ -531,7 +531,7 @@
        │   ├── flags: ∅
        │   ├── receiver:
        │   │   @ RegularExpressionNode (location: (40,0)-(40,5))
-        │   │   ├── flags: ∅
+        │   │   ├── flags: forced_us_ascii_encoding
        │   │   ├── opening_loc: (40,0)-(40,1) = "/"
        │   │   ├── content_loc: (40,1)-(40,4) = "bar"
        │   │   ├── closing_loc: (40,4)-(40,5) = "/"
@ -556,7 +556,7 @@
        │   ├── flags: ∅
        │   ├── receiver:
        │   │   @ RegularExpressionNode (location: (41,0)-(41,5))
-        │   │   ├── flags: ∅
+        │   │   ├── flags: forced_us_ascii_encoding
        │   │   ├── opening_loc: (41,0)-(41,1) = "/"
        │   │   ├── content_loc: (41,1)-(41,4) = "bar"
        │   │   ├── closing_loc: (41,4)-(41,5) = "/"
@ -758,7 +758,7 @@
        │   │   ├── flags: ∅
        │   │   └── arguments: (length: 1)
        │   │       └── @ RegularExpressionNode (location: (49,7)-(49,12))
-        │   │           ├── flags: ∅
+        │   │           ├── flags: forced_us_ascii_encoding
        │   │           ├── opening_loc: (49,7)-(49,8) = "/"
        │   │           ├── content_loc: (49,8)-(49,11) = "bar"
        │   │           ├── closing_loc: (49,11)-(49,12) = "/"
@ -1007,7 +1007,7 @@
        │   │           │   ├── flags: ∅
        │   │           │   └── arguments: (length: 1)
        │   │           │       └── @ RegularExpressionNode (location: (57,11)-(57,16))
-        │   │           │           ├── flags: ∅
+        │   │           │           ├── flags: forced_us_ascii_encoding
        │   │           │           ├── opening_loc: (57,11)-(57,12) = "/"
        │   │           │           ├── content_loc: (57,12)-(57,15) = "bar"
        │   │           │           ├── closing_loc: (57,15)-(57,16) = "/"
--- a/test/prism/snapshots/unparser/corpus/semantic/literal.txt
+++ b/test/prism/snapshots/unparser/corpus/semantic/literal.txt
@ -31,13 +31,13 @@
        │   ├── closing_loc: ∅
        │   └── unescaped: "c"
        ├── @ RegularExpressionNode (location: (9,0)-(9,5))
-        │   ├── flags: ∅
+        │   ├── flags: forced_us_ascii_encoding
        │   ├── opening_loc: (9,0)-(9,3) = "%r("
        │   ├── content_loc: (9,3)-(9,4) = "/"
        │   ├── closing_loc: (9,4)-(9,5) = ")"
        │   └── unescaped: "/"
        ├── @ RegularExpressionNode (location: (10,0)-(10,6))
-        │   ├── flags: ∅
+        │   ├── flags: forced_us_ascii_encoding
        │   ├── opening_loc: (10,0)-(10,3) = "%r("
        │   ├── content_loc: (10,3)-(10,5) = "\\)"
        │   ├── closing_loc: (10,5)-(10,6) = ")"
--- a/test/prism/snapshots/whitequark/bug_regex_verification.txt
+++ b/test/prism/snapshots/whitequark/bug_regex_verification.txt
@ -4,7 +4,7 @@
    @ StatementsNode (location: (1,0)-(1,5))
    └── body: (length: 1)
        └── @ RegularExpressionNode (location: (1,0)-(1,5))
-            ├── flags: extended
+            ├── flags: extended, forced_us_ascii_encoding
            ├── opening_loc: (1,0)-(1,1) = "/"
            ├── content_loc: (1,1)-(1,3) = "#)"
            ├── closing_loc: (1,3)-(1,5) = "/x"
--- a/test/prism/snapshots/whitequark/cond_match_current_line.txt
+++ b/test/prism/snapshots/whitequark/cond_match_current_line.txt
@ -7,7 +7,7 @@
        │   ├── flags: ∅
        │   ├── receiver:
        │   │   @ MatchLastLineNode (location: (1,1)-(1,6))
-        │   │   ├── flags: ∅
+        │   │   ├── flags: forced_us_ascii_encoding
        │   │   ├── opening_loc: (1,1)-(1,2) = "/"
        │   │   ├── content_loc: (1,2)-(1,5) = "wat"
        │   │   ├── closing_loc: (1,5)-(1,6) = "/"
@ -23,7 +23,7 @@
            ├── if_keyword_loc: (3,0)-(3,2) = "if"
            ├── predicate:
            │   @ MatchLastLineNode (location: (3,3)-(3,8))
-            │   ├── flags: ∅
+            │   ├── flags: forced_us_ascii_encoding
            │   ├── opening_loc: (3,3)-(3,4) = "/"
            │   ├── content_loc: (3,4)-(3,7) = "wat"
            │   ├── closing_loc: (3,7)-(3,8) = "/"
--- a/test/prism/snapshots/whitequark/interp_digit_var.txt
+++ b/test/prism/snapshots/whitequark/interp_digit_var.txt
@ -106,13 +106,13 @@
        │   ├── closing_loc: (23,8)-(23,9) = "}"
        │   └── unescaped: "\#@@1"
        ├── @ RegularExpressionNode (location: (25,1)-(25,8))
-        │   ├── flags: ∅
+        │   ├── flags: forced_us_ascii_encoding
        │   ├── opening_loc: (25,1)-(25,4) = "%r{"
        │   ├── content_loc: (25,4)-(25,7) = "\#@1"
        │   ├── closing_loc: (25,7)-(25,8) = "}"
        │   └── unescaped: "\#@1"
        ├── @ RegularExpressionNode (location: (27,1)-(27,9))
-        │   ├── flags: ∅
+        │   ├── flags: forced_us_ascii_encoding
        │   ├── opening_loc: (27,1)-(27,4) = "%r{"
        │   ├── content_loc: (27,4)-(27,8) = "\#@@1"
        │   ├── closing_loc: (27,8)-(27,9) = "}"
@ -188,13 +188,13 @@
        │   ├── closing_loc: (47,6)-(47,7) = "'"
        │   └── unescaped: "\#@@1"
        ├── @ RegularExpressionNode (location: (49,1)-(49,6))
-        │   ├── flags: ∅
+        │   ├── flags: forced_us_ascii_encoding
        │   ├── opening_loc: (49,1)-(49,2) = "/"
        │   ├── content_loc: (49,2)-(49,5) = "\#@1"
        │   ├── closing_loc: (49,5)-(49,6) = "/"
        │   └── unescaped: "\#@1"
        ├── @ RegularExpressionNode (location: (51,1)-(51,7))
-        │   ├── flags: ∅
+        │   ├── flags: forced_us_ascii_encoding
        │   ├── opening_loc: (51,1)-(51,2) = "/"
        │   ├── content_loc: (51,2)-(51,6) = "\#@@1"
        │   ├── closing_loc: (51,6)-(51,7) = "/"
--- a/test/prism/snapshots/whitequark/lvar_injecting_match.txt
+++ b/test/prism/snapshots/whitequark/lvar_injecting_match.txt
@ -9,7 +9,7 @@
        │   │   ├── flags: ∅
        │   │   ├── receiver:
        │   │   │   @ RegularExpressionNode (location: (1,0)-(1,15))
-        │   │   │   ├── flags: ∅
+        │   │   │   ├── flags: forced_us_ascii_encoding
        │   │   │   ├── opening_loc: (1,0)-(1,1) = "/"
        │   │   │   ├── content_loc: (1,1)-(1,14) = "(?<match>bar)"
        │   │   │   ├── closing_loc: (1,14)-(1,15) = "/"
--- a/test/prism/snapshots/whitequark/parser_bug_830.txt
+++ b/test/prism/snapshots/whitequark/parser_bug_830.txt
@ -4,7 +4,7 @@
    @ StatementsNode (location: (1,0)-(1,4))
    └── body: (length: 1)
        └── @ RegularExpressionNode (location: (1,0)-(1,4))
-            ├── flags: ∅
+            ├── flags: forced_us_ascii_encoding
            ├── opening_loc: (1,0)-(1,1) = "/"
            ├── content_loc: (1,1)-(1,3) = "\\("
            ├── closing_loc: (1,3)-(1,4) = "/"
--- a/test/prism/snapshots/whitequark/parser_slash_slash_n_escaping_in_literals.txt
+++ b/test/prism/snapshots/whitequark/parser_slash_slash_n_escaping_in_literals.txt
@ -55,7 +55,7 @@
        │   ├── closing_loc: (17,1)-(17,2) = "}"
        │   └── unescaped: "a\\\nb"
        ├── @ RegularExpressionNode (location: (19,0)-(20,2))
-        │   ├── flags: ∅
+        │   ├── flags: forced_us_ascii_encoding
        │   ├── opening_loc: (19,0)-(19,3) = "%r{"
        │   ├── content_loc: (19,3)-(20,1) = "a\\\nb"
        │   ├── closing_loc: (20,1)-(20,2) = "}"
@ -96,7 +96,7 @@
        │   ├── closing_loc: (35,1)-(35,2) = "'"
        │   └── unescaped: "a\\\nb"
        ├── @ RegularExpressionNode (location: (37,0)-(38,2))
-        │   ├── flags: ∅
+        │   ├── flags: forced_us_ascii_encoding
        │   ├── opening_loc: (37,0)-(37,1) = "/"
        │   ├── content_loc: (37,1)-(38,1) = "a\\\nb"
        │   ├── closing_loc: (38,1)-(38,2) = "/"
--- a/test/prism/snapshots/whitequark/regex_plain.txt
+++ b/test/prism/snapshots/whitequark/regex_plain.txt
@ -4,7 +4,7 @@
    @ StatementsNode (location: (1,0)-(1,10))
    └── body: (length: 1)
        └── @ RegularExpressionNode (location: (1,0)-(1,10))
-            ├── flags: ignore_case, multi_line
+            ├── flags: ignore_case, multi_line, forced_us_ascii_encoding
            ├── opening_loc: (1,0)-(1,1) = "/"
            ├── content_loc: (1,1)-(1,7) = "source"
            ├── closing_loc: (1,7)-(1,10) = "/im"
--- a/test/prism/snapshots/whitequark/ruby_bug_11873.txt
+++ b/test/prism/snapshots/whitequark/ruby_bug_11873.txt
@ -112,7 +112,7 @@
        │   │       │   ├── closing_loc: (3,7)-(3,8) = ")"
        │   │       │   └── block: ∅
        │   │       └── @ RegularExpressionNode (location: (3,10)-(3,13))
-        │   │           ├── flags: ∅
+        │   │           ├── flags: forced_us_ascii_encoding
        │   │           ├── opening_loc: (3,10)-(3,11) = "/"
        │   │           ├── content_loc: (3,11)-(3,12) = "x"
        │   │           ├── closing_loc: (3,12)-(3,13) = "/"
@ -173,7 +173,7 @@
        │   │       │   ├── closing_loc: (5,7)-(5,8) = ")"
        │   │       │   └── block: ∅
        │   │       └── @ RegularExpressionNode (location: (5,10)-(5,14))
-        │   │           ├── flags: multi_line
+        │   │           ├── flags: multi_line, forced_us_ascii_encoding
        │   │           ├── opening_loc: (5,10)-(5,11) = "/"
        │   │           ├── content_loc: (5,11)-(5,12) = "x"
        │   │           ├── closing_loc: (5,12)-(5,14) = "/m"
@ -295,7 +295,7 @@
        │   │       │   ├── closing_loc: (9,8)-(9,9) = ")"
        │   │       │   └── block: ∅
        │   │       └── @ RegularExpressionNode (location: (9,11)-(9,14))
-        │   │           ├── flags: ∅
+        │   │           ├── flags: forced_us_ascii_encoding
        │   │           ├── opening_loc: (9,11)-(9,12) = "/"
        │   │           ├── content_loc: (9,12)-(9,13) = "x"
        │   │           ├── closing_loc: (9,13)-(9,14) = "/"
@ -356,7 +356,7 @@
        │   │       │   ├── closing_loc: (11,8)-(11,9) = ")"
        │   │       │   └── block: ∅
        │   │       └── @ RegularExpressionNode (location: (11,11)-(11,15))
-        │   │           ├── flags: multi_line
+        │   │           ├── flags: multi_line, forced_us_ascii_encoding
        │   │           ├── opening_loc: (11,11)-(11,12) = "/"
        │   │           ├── content_loc: (11,12)-(11,13) = "x"
        │   │           ├── closing_loc: (11,13)-(11,15) = "/m"
@ -488,7 +488,7 @@
        │   │       │       ├── opening_loc: (15,3)-(15,4) = "{"
        │   │       │       └── closing_loc: (15,7)-(15,8) = "}"
        │   │       └── @ RegularExpressionNode (location: (15,10)-(15,13))
-        │   │           ├── flags: ∅
+        │   │           ├── flags: forced_us_ascii_encoding
        │   │           ├── opening_loc: (15,10)-(15,11) = "/"
        │   │           ├── content_loc: (15,11)-(15,12) = "x"
        │   │           ├── closing_loc: (15,12)-(15,13) = "/"
@ -554,7 +554,7 @@
        │   │       │       ├── opening_loc: (17,3)-(17,4) = "{"
        │   │       │       └── closing_loc: (17,7)-(17,8) = "}"
        │   │       └── @ RegularExpressionNode (location: (17,10)-(17,14))
-        │   │           ├── flags: multi_line
+        │   │           ├── flags: multi_line, forced_us_ascii_encoding
        │   │           ├── opening_loc: (17,10)-(17,11) = "/"
        │   │           ├── content_loc: (17,11)-(17,12) = "x"
        │   │           ├── closing_loc: (17,12)-(17,14) = "/m"
@ -686,7 +686,7 @@
        │   │       │       ├── opening_loc: (21,3)-(21,4) = "{"
        │   │       │       └── closing_loc: (21,8)-(21,9) = "}"
        │   │       └── @ RegularExpressionNode (location: (21,11)-(21,14))
-        │   │           ├── flags: ∅
+        │   │           ├── flags: forced_us_ascii_encoding
        │   │           ├── opening_loc: (21,11)-(21,12) = "/"
        │   │           ├── content_loc: (21,12)-(21,13) = "x"
        │   │           ├── closing_loc: (21,13)-(21,14) = "/"
@ -752,7 +752,7 @@
            │       │       ├── opening_loc: (23,3)-(23,4) = "{"
            │       │       └── closing_loc: (23,8)-(23,9) = "}"
            │       └── @ RegularExpressionNode (location: (23,11)-(23,15))
-            │           ├── flags: multi_line
+            │           ├── flags: multi_line, forced_us_ascii_encoding
            │           ├── opening_loc: (23,11)-(23,12) = "/"
            │           ├── content_loc: (23,12)-(23,13) = "x"
            │           ├── closing_loc: (23,13)-(23,15) = "/m"