From 2217e08340cbeba427fc58c7f955fc2382ab0372 Mon Sep 17 00:00:00 2001
From: Jeremy Evans <code@jeremyevans.net>
Date: Sat, 27 Jan 2024 10:16:52 -0800
Subject: [PATCH] Optimize compilation of large literal arrays

To avoid stack overflow, Ruby splits compilation of large arrays
into smaller arrays, and concatenates the small arrays together.
It previously used newarray/concatarray for this, which is
inefficient.  This switches the compilation to use pushtoarray,
which is much faster. This makes almost all literal arrays only
allocate a single array.

For cases where there is a large amount of static values in the
array, Ruby will statically compile subarrays, and previously
added them using concatarray.  This switches to concattoarray,
avoiding an array allocation for the append.

Keyword splats are also supported in arrays, and ignored if the
keyword splat is empty.  Previously, this used newarraykwsplat and
concatarray.  This still uses newarraykwsplat, but switches to
concattoarray to save an allocation.  So large arrays with keyword
splats can allocate 2 arrays instead of 1.

Previously, for the following array sizes (assuming local variable
access for each element), Ruby allocated the following number of
arrays:

  1000 elements: 7 arrays
 10000 elements: 79 arrays
100000 elements: 781 arrays

With these changes, only a single array is allocated (or 2 for a
large array with a keyword splat.

Results using the included benchmark:

```
                       array_1000
            miniruby:     34770.0 i/s
   ./miniruby-before:     10511.7 i/s - 3.31x  slower

                      array_10000
            miniruby:      4938.8 i/s
   ./miniruby-before:       483.8 i/s - 10.21x  slower

                     array_100000
            miniruby:       727.2 i/s
   ./miniruby-before:         4.1 i/s - 176.98x  slower
```

Co-authored-by: Nobuyoshi Nakada <nobu@ruby-lang.org>
---
 benchmark/array_large_literal.yml | 19 +++++++++++++++++++
 compile.c                         | 27 ++++++++++++++-------------
 2 files changed, 33 insertions(+), 13 deletions(-)
 create mode 100644 benchmark/array_large_literal.yml

diff --git a/benchmark/array_large_literal.yml b/benchmark/array_large_literal.yml
new file mode 100644
index 0000000000..423d68391f
--- /dev/null
+++ b/benchmark/array_large_literal.yml
@@ -0,0 +1,19 @@
+prelude: |
+  def def_array(size)
+    Object.class_eval(<<-END)
+      def array_#{size}
+        x = 1
+        [#{(['x'] * size).join(',')}]
+      end
+    END
+  end
+  def_array(100)
+  def_array(1000)
+  def_array(10000)
+  def_array(100000)
+benchmark:
+  array_100: array_100
+  array_1000: array_1000
+  array_10000: array_10000
+  array_100000: array_100000
+
diff --git a/compile.c b/compile.c
index eea92b3638..71b657d9f2 100644
--- a/compile.c
+++ b/compile.c
@@ -4822,8 +4822,8 @@ compile_array(rb_iseq_t *iseq, LINK_ANCHOR *const ret, const NODE *node, int pop
      *
      *   [x1,x2,...,x10000] =>
      *     push x1  ; push x2  ; ...; push x256; newarray 256;
-     *     push x257; push x258; ...; push x512; newarray 256; concatarray;
-     *     push x513; push x514; ...; push x768; newarray 256; concatarray;
+     *     push x257; push x258; ...; push x512; pushtoarray 256;
+     *     push x513; push x514; ...; push x768; pushtoarray 256;
      *     ...
      *
      * - Long subarray can be optimized by pre-allocating a hidden array.
@@ -4833,8 +4833,8 @@ compile_array(rb_iseq_t *iseq, LINK_ANCHOR *const ret, const NODE *node, int pop
      *
      *   [x, 1,2,3,...,100, z] =>
      *     push x; newarray 1;
-     *     putobject [1,2,3,...,100] (<- hidden array); concatarray;
-     *     push z; newarray 1; concatarray
+     *     putobject [1,2,3,...,100] (<- hidden array); concattoarray;
+     *     push z; pushtoarray 1;
      *
      * - If the last element is a keyword, newarraykwsplat should be emitted
      *   to check and remove empty keyword arguments hash from array.
@@ -4849,11 +4849,11 @@ compile_array(rb_iseq_t *iseq, LINK_ANCHOR *const ret, const NODE *node, int pop
     int stack_len = 0;
     int first_chunk = 1;
 
-    /* Convert pushed elements to an array, and concatarray if needed */
-#define FLUSH_CHUNK(newarrayinsn)                               \
+    /* Either create a new array, or push to the existing array */
+#define FLUSH_CHUNK \
     if (stack_len) {                                            \
-        ADD_INSN1(ret, line_node, newarrayinsn, INT2FIX(stack_len)); \
-        if (!first_chunk) ADD_INSN(ret, line_node, concatarray);     \
+        if (first_chunk) ADD_INSN1(ret, line_node, newarray, INT2FIX(stack_len)); \
+        else ADD_INSN1(ret, line_node, pushtoarray, INT2FIX(stack_len));     \
         first_chunk = stack_len = 0;                            \
     }
 
@@ -4877,14 +4877,14 @@ compile_array(rb_iseq_t *iseq, LINK_ANCHOR *const ret, const NODE *node, int pop
                 OBJ_FREEZE(ary);
 
                 /* Emit optimized code */
-                FLUSH_CHUNK(newarray);
+                FLUSH_CHUNK;
                 if (first_chunk) {
                     ADD_INSN1(ret, line_node, duparray, ary);
                     first_chunk = 0;
                 }
                 else {
                     ADD_INSN1(ret, line_node, putobject, ary);
-                    ADD_INSN(ret, line_node, concatarray);
+                    ADD_INSN(ret, line_node, concattoarray);
                 }
                 RB_OBJ_WRITTEN(iseq, Qundef, ary);
             }
@@ -4901,16 +4901,17 @@ compile_array(rb_iseq_t *iseq, LINK_ANCHOR *const ret, const NODE *node, int pop
 
             if (!RNODE_LIST(node)->nd_next && keyword_node_p(RNODE_LIST(node)->nd_head)) {
                 /* Reached the end, and the last element is a keyword */
-                FLUSH_CHUNK(newarraykwsplat);
+                ADD_INSN1(ret, line_node, newarraykwsplat, INT2FIX(stack_len));
+                if (!first_chunk) ADD_INSN(ret, line_node, concattoarray);
                 return 1;
             }
 
             /* If there are many pushed elements, flush them to avoid stack overflow */
-            if (stack_len >= max_stack_len) FLUSH_CHUNK(newarray);
+            if (stack_len >= max_stack_len) FLUSH_CHUNK;
         }
     }
 
-    FLUSH_CHUNK(newarray);
+    FLUSH_CHUNK;
 #undef FLUSH_CHUNK
     return 1;
 }