From 51ffef281996727c60571771cd07c1459ba58cd2 Mon Sep 17 00:00:00 2001
From: Peter Zhu <peter@peterzhu.ca>
Date: Fri, 8 Nov 2024 14:33:48 -0500
Subject: [PATCH] Fix memory leak in prism when syntax error in iseq
 compilation

If there's a syntax error during iseq compilation then prism would leak
memory because it would not free the pm_parse_result_t.

This commit wraps the compilation step of pm_iseq_new_with_opt in rb_protect
so that a raised error is caught. On error, pm_iseq_new_with_opt returns NULL
and sets error_state to a non-zero value, which the caller can re-raise with
rb_jump_tag once it has freed its memory.

For example:

    10.times do
      10_000.times do
        eval("/[/=~s")
      rescue SyntaxError
      end

      puts `ps -o rss= -p #{$$}`
    end

Before:

    39280
    68736
    99232
    128864
    158896
    188208
    217344
    246304
    275376
    304592

After:

    12192
    13200
    14256
    14848
    16000
    16000
    16000
    16064
    17232
    17952
---
 iseq.c                 | 68 +++++++++++++++++++++++++++++++++---------
 load.c                 |  9 +++++-
 mini_builtin.c         |  8 ++++-
 prism_compile.c        | 17 +++++++++--
 prism_compile.h        | 10 +++----
 ruby.c                 |  9 +++++-
 test/ruby/test_eval.rb | 24 +++++++++++++++
 vm_eval.c              | 10 ++++++-
 8 files changed, 130 insertions(+), 25 deletions(-)
diff --git a/iseq.c b/iseq.c
index 8c8e54b989..9faf01e613 100644
--- a/iseq.c
+++ b/iseq.c
@@ -897,12 +897,12 @@ rb_iseq_new_top(const VALUE ast_value, VALUE name, VALUE path, VALUE realpath, c
  * The main entry-point into the prism compiler when a file is required.
  */
 rb_iseq_t *
-pm_iseq_new_top(pm_scope_node_t *node, VALUE name, VALUE path, VALUE realpath, const rb_iseq_t *parent)
+pm_iseq_new_top(pm_scope_node_t *node, VALUE name, VALUE path, VALUE realpath, const rb_iseq_t *parent, int *error_state)
 {
     iseq_new_setup_coverage(path, (int) (node->parser->newline_list.size - 1));
 
     return pm_iseq_new_with_opt(node, name, path, realpath, 0, parent, 0,
-                                ISEQ_TYPE_TOP, &COMPILE_OPTION_DEFAULT);
+                                ISEQ_TYPE_TOP, &COMPILE_OPTION_DEFAULT, error_state);
 }
 
 rb_iseq_t *
@@ -921,13 +921,13 @@ rb_iseq_new_main(const VALUE ast_value, VALUE path, VALUE realpath, const rb_ise
  * main file in the program.
  */
 rb_iseq_t *
-pm_iseq_new_main(pm_scope_node_t *node, VALUE path, VALUE realpath, const rb_iseq_t *parent, int opt)
+pm_iseq_new_main(pm_scope_node_t *node, VALUE path, VALUE realpath, const rb_iseq_t *parent, int opt, int *error_state)
 {
     iseq_new_setup_coverage(path, (int) (node->parser->newline_list.size - 1));
 
     return pm_iseq_new_with_opt(node, rb_fstring_lit("<main>"),
                                 path, realpath, 0,
-                                parent, 0, ISEQ_TYPE_MAIN, opt ? &COMPILE_OPTION_DEFAULT : &COMPILE_OPTION_FALSE);
+                                parent, 0, ISEQ_TYPE_MAIN, opt ? &COMPILE_OPTION_DEFAULT : &COMPILE_OPTION_FALSE, error_state);
 }
 
 rb_iseq_t *
@@ -947,7 +947,7 @@ rb_iseq_new_eval(const VALUE ast_value, VALUE name, VALUE path, VALUE realpath,
 
 rb_iseq_t *
 pm_iseq_new_eval(pm_scope_node_t *node, VALUE name, VALUE path, VALUE realpath,
-                     int first_lineno, const rb_iseq_t *parent, int isolated_depth)
+                     int first_lineno, const rb_iseq_t *parent, int isolated_depth, int *error_state)
 {
     if (rb_get_coverage_mode() & COVERAGE_TARGET_EVAL) {
         VALUE coverages = rb_get_coverages();
@@ -957,7 +957,7 @@ pm_iseq_new_eval(pm_scope_node_t *node, VALUE name, VALUE path, VALUE realpath,
     }
 
     return pm_iseq_new_with_opt(node, name, path, realpath, first_lineno,
-                                parent, isolated_depth, ISEQ_TYPE_EVAL, &COMPILE_OPTION_DEFAULT);
+                                parent, isolated_depth, ISEQ_TYPE_EVAL, &COMPILE_OPTION_DEFAULT, error_state);
 }
 
 static inline rb_iseq_t *
@@ -1013,6 +1013,25 @@ rb_iseq_new_with_opt(VALUE ast_value, VALUE name, VALUE path, VALUE realpath,
     return iseq_translate(iseq);
 }
 
+struct pm_iseq_new_with_opt_data { // arguments for pm_iseq_new_with_opt_try, passed via rb_protect
+    rb_iseq_t *iseq; // iseq that the node is compiled into
+    pm_scope_node_t *node; // scope node handed to pm_iseq_compile_node
+};
+
+// rb_protect callback: `d` is a struct pm_iseq_new_with_opt_data * cast to VALUE.
+static VALUE
+pm_iseq_new_with_opt_try(VALUE d)
+{
+    struct pm_iseq_new_with_opt_data *data = (struct pm_iseq_new_with_opt_data *)d;
+
+    // This can compile child iseqs, which can raise syntax errors
+    pm_iseq_compile_node(data->iseq, data->node);
+
+    // This raises an exception if there is a syntax error
+    finish_iseq_build(data->iseq);
+    return Qundef; // result is unused; rb_protect's caller only checks the state
+}
+
 /**
  * This is a step in the prism compiler that is called once all of the various
  * options have been established. It is called from one of the pm_iseq_new_*
@@ -1028,7 +1047,7 @@ rb_iseq_new_with_opt(VALUE ast_value, VALUE name, VALUE path, VALUE realpath,
 rb_iseq_t *
 pm_iseq_new_with_opt(pm_scope_node_t *node, VALUE name, VALUE path, VALUE realpath,
                      int first_lineno, const rb_iseq_t *parent, int isolated_depth,
-                     enum rb_iseq_type type, const rb_compile_option_t *option)
+                     enum rb_iseq_type type, const rb_compile_option_t *option, int *error_state)
 {
     rb_iseq_t *iseq = iseq_alloc();
     ISEQ_BODY(iseq)->prism = true;
@@ -1054,8 +1073,13 @@ pm_iseq_new_with_opt(pm_scope_node_t *node, VALUE name, VALUE path, VALUE realpa
     prepare_iseq_build(iseq, name, path, realpath, first_lineno, &code_location, -1,
                        parent, isolated_depth, type, node->script_lines == NULL ? Qnil : *node->script_lines, option);
 
-    pm_iseq_compile_node(iseq, node);
-    finish_iseq_build(iseq);
+    struct pm_iseq_new_with_opt_data data = {
+        .iseq = iseq,
+        .node = node
+    };
+    rb_protect(pm_iseq_new_with_opt_try, (VALUE)&data, error_state);
+
+    if (*error_state) return NULL;
 
     return iseq_translate(iseq);
 }
@@ -1313,8 +1337,15 @@ pm_iseq_compile_with_option(VALUE src, VALUE file, VALUE realpath, VALUE line, V
     }
 
     if (error == Qnil) {
-        iseq = pm_iseq_new_with_opt(&result.node, name, file, realpath, ln, NULL, 0, ISEQ_TYPE_TOP, &option);
+        int error_state;
+        iseq = pm_iseq_new_with_opt(&result.node, name, file, realpath, ln, NULL, 0, ISEQ_TYPE_TOP, &option, &error_state);
+
         pm_parse_result_free(&result);
+
+        if (error_state) {
+            RUBY_ASSERT(iseq == NULL);
+            rb_jump_tag(error_state);
+        }
     }
     else {
         pm_parse_result_free(&result);
@@ -1771,11 +1802,20 @@ iseqw_s_compile_file_prism(int argc, VALUE *argv, VALUE self)
     if (error == Qnil) {
         make_compile_option(&option, opt);
 
-        ret = iseqw_new(pm_iseq_new_with_opt(&result.node, rb_fstring_lit("<main>"),
-                                            file,
-                                            rb_realpath_internal(Qnil, file, 1),
-                                            1, NULL, 0, ISEQ_TYPE_TOP, &option));
+        int error_state;
+        rb_iseq_t *iseq = pm_iseq_new_with_opt(&result.node, rb_fstring_lit("<main>"),
+                                               file,
+                                               rb_realpath_internal(Qnil, file, 1),
+                                               1, NULL, 0, ISEQ_TYPE_TOP, &option, &error_state);
+
         pm_parse_result_free(&result);
+
+        if (error_state) {
+            RUBY_ASSERT(iseq == NULL);
+            rb_jump_tag(error_state);
+        }
+
+        ret = iseqw_new(iseq);
         rb_vm_pop_frame(ec);
         RB_GC_GUARD(v);
         return ret;
diff --git a/load.c b/load.c
index e4dd1e47ba..19d0849bc7 100644
--- a/load.c
+++ b/load.c
@@ -752,8 +752,15 @@ load_iseq_eval(rb_execution_context_t *ec, VALUE fname)
             VALUE error = pm_load_parse_file(&result, fname, NULL);
 
             if (error == Qnil) {
-                iseq = pm_iseq_new_top(&result.node, rb_fstring_lit("<top (required)>"), fname, realpath_internal_cached(realpath_map, fname), NULL);
+                int error_state;
+                iseq = pm_iseq_new_top(&result.node, rb_fstring_lit("<top (required)>"), fname, realpath_internal_cached(realpath_map, fname), NULL, &error_state);
+
                 pm_parse_result_free(&result);
+
+                if (error_state) {
+                    RUBY_ASSERT(iseq == NULL);
+                    rb_jump_tag(error_state);
+                }
             }
             else {
                 rb_vm_pop_frame(ec);
diff --git a/mini_builtin.c b/mini_builtin.c
index 2fbc00234d..adcb4f965f 100644
--- a/mini_builtin.c
+++ b/mini_builtin.c
@@ -63,10 +63,16 @@ builtin_iseq_load(const char *feature_name, const struct rb_builtin_function *ta
         pm_prelude_load(&result, name_str, code, start_line);
 
         vm->builtin_function_table = table;
-        iseq = pm_iseq_new_with_opt(&result.node, name_str, name_str, Qnil, 0, NULL, 0, ISEQ_TYPE_TOP, &optimization);
+        int error_state;
+        iseq = pm_iseq_new_with_opt(&result.node, name_str, name_str, Qnil, 0, NULL, 0, ISEQ_TYPE_TOP, &optimization, &error_state);
 
         vm->builtin_function_table = NULL;
         pm_parse_result_free(&result);
+
+        if (error_state) {
+            RUBY_ASSERT(iseq == NULL);
+            rb_jump_tag(error_state);
+        }
     }
     else {
         VALUE ast_value = prelude_ast_value(name_str, code, start_line);
diff --git a/prism_compile.c b/prism_compile.c
index 53938ebb0c..f4a35fa429 100644
--- a/prism_compile.c
+++ b/prism_compile.c
@@ -1268,11 +1268,17 @@ pm_new_child_iseq(rb_iseq_t *iseq, pm_scope_node_t *node, VALUE name, const rb_i
 {
     debugs("[new_child_iseq]> ---------------------------------------\n");
     int isolated_depth = ISEQ_COMPILE_DATA(iseq)->isolated_depth;
+    int error_state;
     rb_iseq_t *ret_iseq = pm_iseq_new_with_opt(node, name,
             rb_iseq_path(iseq), rb_iseq_realpath(iseq),
             line_no, parent,
             isolated_depth ? isolated_depth + 1 : 0,
-            type, ISEQ_COMPILE_DATA(iseq)->option);
+            type, ISEQ_COMPILE_DATA(iseq)->option, &error_state);
+
+    if (error_state) {
+        RUBY_ASSERT(ret_iseq == NULL);
+        rb_jump_tag(error_state);
+    }
     debugs("[new_child_iseq]< ---------------------------------------\n");
     return ret_iseq;
 }
@@ -3479,6 +3485,7 @@ pm_compile_builtin_mandatory_only_method(rb_iseq_t *iseq, pm_scope_node_t *scope
     pm_scope_node_t next_scope_node;
     pm_scope_node_init(&def.base, &next_scope_node, scope_node);
 
+    int error_state;
     ISEQ_BODY(iseq)->mandatory_only_iseq = pm_iseq_new_with_opt(
         &next_scope_node,
         rb_iseq_base_label(iseq),
@@ -3488,9 +3495,15 @@ pm_compile_builtin_mandatory_only_method(rb_iseq_t *iseq, pm_scope_node_t *scope
         NULL,
         0,
         ISEQ_TYPE_METHOD,
-        ISEQ_COMPILE_DATA(iseq)->option
+        ISEQ_COMPILE_DATA(iseq)->option,
+        &error_state
     );
 
+    if (error_state) {
+        RUBY_ASSERT(ISEQ_BODY(iseq)->mandatory_only_iseq == NULL);
+        rb_jump_tag(error_state);
+    }
+
     pm_scope_node_destroy(&next_scope_node);
     return COMPILE_OK;
 }
diff --git a/prism_compile.h b/prism_compile.h
index 4015091fc1..f18fdbf892 100644
--- a/prism_compile.h
+++ b/prism_compile.h
@@ -90,10 +90,10 @@ VALUE pm_parse_string(pm_parse_result_t *result, VALUE source, VALUE filepath, V
 VALUE pm_parse_stdin(pm_parse_result_t *result);
 void pm_parse_result_free(pm_parse_result_t *result);
 
-rb_iseq_t *pm_iseq_new(pm_scope_node_t *node, VALUE name, VALUE path, VALUE realpath, const rb_iseq_t *parent, enum rb_iseq_type);
-rb_iseq_t *pm_iseq_new_top(pm_scope_node_t *node, VALUE name, VALUE path, VALUE realpath, const rb_iseq_t *parent);
-rb_iseq_t *pm_iseq_new_main(pm_scope_node_t *node, VALUE path, VALUE realpath, const rb_iseq_t *parent, int opt);
-rb_iseq_t *pm_iseq_new_eval(pm_scope_node_t *node, VALUE name, VALUE path, VALUE realpath, int first_lineno, const rb_iseq_t *parent, int isolated_depth);
-rb_iseq_t *pm_iseq_new_with_opt(pm_scope_node_t *node, VALUE name, VALUE path, VALUE realpath, int first_lineno, const rb_iseq_t *parent, int isolated_depth, enum rb_iseq_type, const rb_compile_option_t*);
+rb_iseq_t *pm_iseq_new(pm_scope_node_t *node, VALUE name, VALUE path, VALUE realpath, const rb_iseq_t *parent, enum rb_iseq_type, int *error_state);
+rb_iseq_t *pm_iseq_new_top(pm_scope_node_t *node, VALUE name, VALUE path, VALUE realpath, const rb_iseq_t *parent, int *error_state);
+rb_iseq_t *pm_iseq_new_main(pm_scope_node_t *node, VALUE path, VALUE realpath, const rb_iseq_t *parent, int opt, int *error_state);
+rb_iseq_t *pm_iseq_new_eval(pm_scope_node_t *node, VALUE name, VALUE path, VALUE realpath, int first_lineno, const rb_iseq_t *parent, int isolated_depth, int *error_state);
+rb_iseq_t *pm_iseq_new_with_opt(pm_scope_node_t *node, VALUE name, VALUE path, VALUE realpath, int first_lineno, const rb_iseq_t *parent, int isolated_depth, enum rb_iseq_type, const rb_compile_option_t *option, int *error_state);
 
 VALUE pm_iseq_compile_node(rb_iseq_t *iseq, pm_scope_node_t *node);
diff --git a/ruby.c b/ruby.c
index 6f32f11b57..eca0382466 100644
--- a/ruby.c
+++ b/ruby.c
@@ -2609,8 +2609,15 @@ process_options(int argc, char **argv, ruby_cmdline_options_t *opt)
 
         if (!result.ast) {
             pm_parse_result_t *pm = &result.prism;
-            iseq = pm_iseq_new_main(&pm->node, opt->script_name, path, parent, optimize);
+            int error_state;
+            iseq = pm_iseq_new_main(&pm->node, opt->script_name, path, parent, optimize, &error_state);
+
             pm_parse_result_free(pm);
+
+            if (error_state) {
+                RUBY_ASSERT(iseq == NULL);
+                rb_jump_tag(error_state);
+            }
         }
         else {
             rb_ast_t *ast = result.ast;
diff --git a/test/ruby/test_eval.rb b/test/ruby/test_eval.rb
index cf1c2bb2f6..2129272b00 100644
--- a/test/ruby/test_eval.rb
+++ b/test/ruby/test_eval.rb
@@ -612,4 +612,28 @@ class TestEval < Test::Unit::TestCase
     x = orphan_lambda
     assert_equal(:ok, x.call)
   end
+
+  def test_syntax_error_no_memory_leak
+    assert_no_memory_leak([], "#{<<~'begin;'}", "#{<<~'end;'}", rss: true)
+    begin;
+      100_000.times do
+        eval("/[/=~s")
+      rescue SyntaxError
+      else
+        raise "Expected SyntaxError to be raised"
+      end
+    end;
+
+    assert_no_memory_leak([], "#{<<~'begin;'}", "#{<<~'end;'}", rss: true)
+    begin;
+      a = 1
+
+      100_000.times do
+        eval("if a in [0, 0] | [0, a]; end")
+      rescue SyntaxError
+      else
+        raise "Expected SyntaxError to be raised"
+      end
+    end;
+  end
 end
diff --git a/vm_eval.c b/vm_eval.c
index b326f2a09c..edfcc6e435 100644
--- a/vm_eval.c
+++ b/vm_eval.c
@@ -1766,7 +1766,8 @@ pm_eval_make_iseq(VALUE src, VALUE fname, int line,
         iseq = ISEQ_BODY(iseq)->parent_iseq;
     }
 
-    iseq = pm_iseq_new_eval(&result.node, name, fname, Qnil, line, parent, 0);
+    int error_state;
+    iseq = pm_iseq_new_eval(&result.node, name, fname, Qnil, line, parent, 0, &error_state);
 
     pm_scope_node_t *prev = result.node.previous;
     while (prev) {
@@ -1778,6 +1779,13 @@ pm_eval_make_iseq(VALUE src, VALUE fname, int line,
     }
 
     pm_parse_result_free(&result);
+
+    // If there was an error, raise it after memory has been cleaned up
+    if (error_state) {
+        RUBY_ASSERT(iseq == NULL);
+        rb_jump_tag(error_state);
+    }
+
     rb_exec_event_hook_script_compiled(GET_EC(), iseq, src);
 
     return iseq;