From 4fbb208185c2c5c0b5c9362c15fc90a0e6d02309 Mon Sep 17 00:00:00 2001
From: Vinicius Stock <vinicius.stock@shopify.com>
Date: Thu, 2 May 2024 11:33:32 -0400
Subject: [PATCH] [ruby/prism] Create specialized `ASCIISource` with ASCII-only
 optimizations

https://github.com/ruby/prism/commit/40993166a8
---
 lib/prism/ffi.rb                           |  6 ++--
 lib/prism/lex_compat.rb                    |  2 +-
 lib/prism/parse_result.rb                  | 41 ++++++++++++++++++++++
 prism/extension.c                          |  6 ++--
 prism/templates/ext/prism/api_node.c.erb   |  3 +-
 prism/templates/lib/prism/dsl.rb.erb       |  4 +--
 prism/templates/lib/prism/serialize.rb.erb |  2 +-
 7 files changed, 53 insertions(+), 11 deletions(-)

diff --git a/lib/prism/ffi.rb b/lib/prism/ffi.rb
index 2014ccea31..cec4b9d630 100644
--- a/lib/prism/ffi.rb
+++ b/lib/prism/ffi.rb
@@ -317,7 +317,7 @@ module Prism
         buffer.read
       end
 
-      Serialize.load_tokens(Source.new(code), serialized)
+      Serialize.load_tokens(Source.for(code), serialized)
     end
 
     def parse_common(string, code, options) # :nodoc:
@@ -329,7 +329,7 @@ module Prism
       LibRubyParser::PrismBuffer.with do |buffer|
         LibRubyParser.pm_serialize_parse_comments(buffer.pointer, string.pointer, string.length, dump_options(options))
 
-        source = Source.new(code)
+        source = Source.for(code)
         loader = Serialize::Loader.new(source, buffer.read)
 
         loader.load_header
@@ -343,7 +343,7 @@ module Prism
       LibRubyParser::PrismBuffer.with do |buffer|
         LibRubyParser.pm_serialize_parse_lex(buffer.pointer, string.pointer, string.length, dump_options(options))
 
-        source = Source.new(code)
+        source = Source.for(code)
         loader = Serialize::Loader.new(source, buffer.read)
 
         tokens = loader.load_tokens
diff --git a/lib/prism/lex_compat.rb b/lib/prism/lex_compat.rb
index f199af1883..4f8e443a3b 100644
--- a/lib/prism/lex_compat.rb
+++ b/lib/prism/lex_compat.rb
@@ -861,7 +861,7 @@ module Prism
       # We sort by location to compare against Ripper's output
       tokens.sort_by!(&:location)
 
-      Result.new(tokens, result.comments, result.magic_comments, result.data_loc, result.errors, result.warnings, Source.new(source))
+      Result.new(tokens, result.comments, result.magic_comments, result.data_loc, result.errors, result.warnings, Source.for(source))
     end
   end
 
diff --git a/lib/prism/parse_result.rb b/lib/prism/parse_result.rb
index ff8b1dc8bf..e8d7717228 100644
--- a/lib/prism/parse_result.rb
+++ b/lib/prism/parse_result.rb
@@ -5,6 +5,14 @@ module Prism
   # conjunction with locations to allow them to resolve line numbers and source
   # ranges.
   class Source
+    # Create a new source object with the given source code. This method should
+    # be used instead of `new` and it will return either a `Source` or a
+    # specialized and more performant `ASCIISource` if no multibyte characters
+    # are present in the source code.
+    def self.for(source, start_line = 1, offsets = [])
+      source.ascii_only? ? ASCIISource.new(source, start_line, offsets): new(source, start_line, offsets)
+    end
+
     # The source code that this source object represents.
     attr_reader :source
 
@@ -111,6 +119,39 @@ module Prism
     end
   end
 
+  # Specialized version of Prism::Source for source code that includes ASCII
+  # characters only. This class is used to apply performance optimizations that
+  # cannot be applied to sources that include multibyte characters. Sources that
+  # include multibyte characters are represented by the Prism::Source class.
+  class ASCIISource < Source
+    # Return the character offset for the given byte offset.
+    def character_offset(byte_offset)
+      byte_offset
+    end
+
+    # Return the column number in characters for the given byte offset.
+    def character_column(byte_offset)
+      byte_offset - line_start(byte_offset)
+    end
+
+    # Returns the offset from the start of the file for the given byte offset
+    # counting in code units for the given encoding.
+    #
+    # This method is tested with UTF-8, UTF-16, and UTF-32. If there is the
+    # concept of code units that differs from the number of characters in other
+    # encodings, it is not captured here.
+    def code_units_offset(byte_offset, encoding)
+      byte_offset
+    end
+
+    # Specialized version of `code_units_column` that does not depend on
+    # `code_units_offset`, which is a more expensive operation. This is
+    # essentially the same as `Prism::Source#column`.
+    def code_units_column(byte_offset, encoding)
+      byte_offset - line_start(byte_offset)
+    end
+  end
+
   # This represents a location in the source.
   class Location
     # A Source object that is used to determine more information from the given
diff --git a/prism/extension.c b/prism/extension.c
index 7b3f894478..84872914c4 100644
--- a/prism/extension.c
+++ b/prism/extension.c
@@ -32,6 +32,7 @@ ID rb_option_id_frozen_string_literal;
 ID rb_option_id_line;
 ID rb_option_id_scopes;
 ID rb_option_id_version;
+ID rb_prism_source_id_for;
 
 /******************************************************************************/
 /* IO of Ruby code                                                            */
@@ -599,8 +600,7 @@ parse_lex_input(pm_string_t *input, const pm_options_t *options, bool return_nod
 
     VALUE source_string = rb_str_new((const char *) pm_string_source(input), pm_string_length(input));
     VALUE offsets = rb_ary_new();
-    VALUE source_argv[] = { source_string, LONG2NUM(parser.start_line), offsets };
-    VALUE source = rb_class_new_instance(3, source_argv, rb_cPrismSource);
+    VALUE source = rb_funcall(rb_cPrismSource, rb_prism_source_id_for, 3, source_string, LONG2NUM(parser.start_line), offsets);
 
     parse_lex_data_t parse_lex_data = {
         .source = source,
@@ -1379,6 +1379,8 @@ Init_prism(void) {
     rb_option_id_scopes = rb_intern_const("scopes");
     rb_option_id_version = rb_intern_const("version");
 
+    rb_prism_source_id_for = rb_intern("for");
+
     /**
      * The version of the prism library.
      */
diff --git a/prism/templates/ext/prism/api_node.c.erb b/prism/templates/ext/prism/api_node.c.erb
index 419236ef78..0e3e4d63cc 100644
--- a/prism/templates/ext/prism/api_node.c.erb
+++ b/prism/templates/ext/prism/api_node.c.erb
@@ -76,8 +76,7 @@ pm_source_new(const pm_parser_t *parser, rb_encoding *encoding) {
         rb_ary_push(offsets, ULONG2NUM(parser->newline_list.offsets[index]));
     }
 
-    VALUE source_argv[] = { source_string, LONG2NUM(parser->start_line), offsets };
-    return rb_class_new_instance(3, source_argv, rb_cPrismSource);
+    return rb_funcall(rb_cPrismSource, rb_intern("for"), 3, source_string, LONG2NUM(parser->start_line), offsets);
 }
 
 typedef struct pm_node_stack_node {
diff --git a/prism/templates/lib/prism/dsl.rb.erb b/prism/templates/lib/prism/dsl.rb.erb
index 8dbb540952..eff0d1c4fc 100644
--- a/prism/templates/lib/prism/dsl.rb.erb
+++ b/prism/templates/lib/prism/dsl.rb.erb
@@ -2,7 +2,7 @@ module Prism
   # The DSL module provides a set of methods that can be used to create prism
   # nodes in a more concise manner. For example, instead of writing:
   #
-  #     source = Prism::Source.new("[1]")
+  #     source = Prism::Source.for("[1]")
   #
   #     Prism::ArrayNode.new(
   #       [
@@ -20,7 +20,7 @@ module Prism
   #
   # you could instead write:
   #
-  #     source = Prism::Source.new("[1]")
+  #     source = Prism::Source.for("[1]")
   #
   #     ArrayNode(
   #       IntegerNode(Prism::IntegerBaseFlags::DECIMAL, 1, Location(source, 1, 1)), source),
diff --git a/prism/templates/lib/prism/serialize.rb.erb b/prism/templates/lib/prism/serialize.rb.erb
index c31a319e5f..29ae5356ba 100644
--- a/prism/templates/lib/prism/serialize.rb.erb
+++ b/prism/templates/lib/prism/serialize.rb.erb
@@ -19,7 +19,7 @@ module Prism
     # Deserialize the AST represented by the given string into a parse result.
     def self.load(input, serialized)
       input = input.dup
-      source = Source.new(input)
+      source = Source.for(input)
       loader = Loader.new(source, serialized)
       result = loader.load_result