Implement String#append_as_bytes(String | Integer, ...)

[Feature #20594] A handy method to construct a string out of multiple chunks. Contrary to `String#concat`, it doesn't do any encoding negotiation, and simply appends the content as bytes regardless of whether this results in a broken string or not. It's the caller's responsibility to check for `String#valid_encoding?` in cases where it's needed. When passed integers, only the lower byte is considered, like in `String#setbyte`.
2024-08-01 11:14:19 +02:00 · 2024-08-01 11:14:19 +02:00 · 16f241f0aa
--- a/spec/ruby/core/string/append_as_bytes_spec.rb
+++ b/spec/ruby/core/string/append_as_bytes_spec.rb
@ -0,0 +1,46 @@
+require_relative '../../spec_helper'
+
+describe "String#append_as_bytes" do
+  ruby_version_is "3.4" do
+    it "doesn't allow to mutate frozen strings" do
+      str = "hello".freeze
+      -> { str.append_as_bytes("\xE2\x82") }.should raise_error(FrozenError)
+    end
+
+    it "allows creating broken strings" do
+      str = +"hello"
+      str.append_as_bytes("\xE2\x82") # first two bytes of the three-byte UTF-8 sequence for "€"
+      str.valid_encoding?.should == false
+
+      str.append_as_bytes("\xAC") # final byte completes the sequence
+      str.valid_encoding?.should == true
+
+      str = "abc".encode(Encoding::UTF_32LE)
+      str.append_as_bytes("def") # three bytes appended to a UTF-32LE receiver
+      str.encoding.should == Encoding::UTF_32LE
+      str.valid_encoding?.should == false
+    end
+
+    it "never changes the receiver encoding" do
+      str = "".b
+      str.append_as_bytes("€")
+      str.encoding.should == Encoding::BINARY
+    end
+
+    it "accepts variadic String or Integer arguments" do
+      str = "hello".b
+      str.append_as_bytes("\xE2\x82", 12, 43, "\xAC")
+      str.encoding.should == Encoding::BINARY
+      str.should == "hello\xE2\x82\f+\xAC".b # 12 is "\f" and 43 is "+"
+    end
+
+    it "only accepts strings or integers, and doesn't attempt to cast with #to_str or #to_int" do
+      to_str = mock("to_str")
+      to_str.should_not_receive(:to_str)
+      to_str.should_not_receive(:to_int)
+
+      str = +"hello"
+      -> { str.append_as_bytes(to_str) }.should raise_error(TypeError, "wrong argument type MockObject (expected String or Integer)")
+    end
+  end
+end
--- a/string.c
+++ b/string.c
@ -3308,6 +3308,32 @@ rb_str_resize(VALUE str, long len)
    return str;
 }

+/*
+ * Prepares +str+ for having +len+ more bytes written past its current end.
+ *
+ * The string is first passed through str_modify_keep_cr(); its length is
+ * left untouched. If the current capacity cannot hold the combined length,
+ * the capacity is grown by repeated doubling (counting the terminator), or
+ * set to exactly the combined length once that length reaches LONG_MAX / 2.
+ *
+ * Raises ArgumentError when the combined length would exceed LONG_MAX.
+ */
+static void
+str_ensure_available_capa(VALUE str, long len)
+{
+    str_modify_keep_cr(str);
+
+    const int termlen = TERM_LEN(str);
+    const long current_len = RSTRING_LEN(str);
+
+    if (RB_UNLIKELY(current_len > LONG_MAX - len)) {
+        rb_raise(rb_eArgError, "string sizes too big");
+    }
+
+    const long wanted = current_len + len;
+    long capa = str_capacity(str, termlen);
+
+    /* Enough room already: nothing to resize. */
+    if (wanted <= capa) return;
+
+    if (wanted >= LONG_MAX / 2) {
+        /* Doubling is skipped for lengths this large; ask for exactly what is needed. */
+        capa = wanted;
+    }
+    else {
+        do {
+            capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
+        } while (capa < wanted);
+    }
+
+    RESIZE_CAPA_TERM(str, capa, termlen);
+}
+
 static VALUE
 str_buf_cat4(VALUE str, const char *ptr, long len, bool keep_cr)
 {
@ -3662,6 +3688,144 @@ rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
    return str;
 }

+/*
+ *  call-seq:
+ *    append_as_bytes(*objects) -> string
+ *
+ *  Concatenates each object in +objects+ into +self+ without any encoding
+ *  validation or conversion and returns +self+:
+ *
+ *    s = 'foo'
+ *    s.append_as_bytes(" \xE2\x82")  # => "foo \xE2\x82"
+ *    s.valid_encoding?               # => false
+ *    s.append_as_bytes("\xAC 12")
+ *    s.valid_encoding?               # => true
+ *
+ *  For each given object +object+ that is an Integer,
+ *  the value is treated as a single byte. If the Integer does not fit in
+ *  one byte, only the lowest byte is used, similar to String#setbyte:
+ *
+ *    s = ""
+ *    s.append_as_bytes(0, 257)       # => "\u0000\u0001"
+ *
+ *  Raises TypeError if an object is neither a String nor an Integer;
+ *  no implicit conversion (+to_str+, +to_int+) is attempted.
+ *
+ *  Related: String#<<, String#concat, which do an encoding aware concatenation.
+ */
+
+VALUE
+rb_str_append_as_bytes(int argc, VALUE *argv, VALUE str)
+{
+    long needed_capacity = 0;
+    volatile VALUE t0;
+    enum ruby_value_type *types = ALLOCV_N(enum ruby_value_type, t0, argc);
+
+    /*
+     * Pass 1: validate every argument and add up the bytes to append.
+     * Nothing has been written to `str` yet, so a TypeError raised here
+     * leaves the receiver untouched.
+     */
+    for (int index = 0; index < argc; index++) {
+        VALUE obj = argv[index];
+        enum ruby_value_type type = types[index] = rb_type(obj);
+        switch (type) {
+          case T_FIXNUM:
+          case T_BIGNUM:
+            needed_capacity++;
+            break;
+          case T_STRING:
+            needed_capacity += RSTRING_LEN(obj);
+            break;
+          default:
+            rb_raise(
+                rb_eTypeError,
+                "wrong argument type %"PRIsVALUE" (expected String or Integer)",
+                rb_obj_class(obj)
+            );
+            break;
+        }
+    }
+
+    str_ensure_available_capa(str, needed_capacity);
+    char *sptr = RSTRING_END(str);
+
+    /* Pass 2: copy the raw bytes right after the current end of `str`. */
+    for (int index = 0; index < argc; index++) {
+        VALUE obj = argv[index];
+        enum ruby_value_type type = types[index];
+        switch (type) {
+          case T_FIXNUM:
+          case T_BIGNUM: {
+            /*
+             * Keep only the lowest byte. The masked value is stored back in
+             * argv so the coderange check below can reuse it.
+             */
+            argv[index] = obj = rb_int_and(obj, INT2FIX(0xff));
+            char byte = (char)(NUM2INT(obj) & 0xFF);
+            *sptr = byte;
+            sptr++;
+            break;
+          }
+          case T_STRING: {
+            const char *ptr;
+            long len;
+            RSTRING_GETMEM(obj, ptr, len);
+            memcpy(sptr, ptr, len);
+            sptr += len;
+            break;
+          }
+          default:
+            rb_bug("append_as_bytes arguments should have been validated");
+        }
+    }
+
+    STR_SET_LEN(str, RSTRING_LEN(str) + needed_capacity);
+    TERM_FILL(sptr, TERM_LEN(str)); /* sentinel */
+
+    /*
+     * Decide whether the cached coderange of `str` is still accurate.
+     * Only two cheap cases are recognised; anything else clears it.
+     */
+    bool keep_cr = false;
+    switch (ENC_CODERANGE(str)) {
+      case ENC_CODERANGE_7BIT:
+        /* Still 7bit as long as every appended chunk is 7bit too. */
+        keep_cr = true;
+        for (int index = 0; keep_cr && index < argc; index++) {
+            VALUE obj = argv[index];
+            switch (types[index]) {
+              case T_FIXNUM:
+              case T_BIGNUM:
+                /* obj was already masked to a single byte in pass 2. */
+                keep_cr = ISASCII(NUM2INT(obj));
+                break;
+              case T_STRING:
+                keep_cr = (ENC_CODERANGE(obj) == ENC_CODERANGE_7BIT);
+                break;
+              default:
+                rb_bug("append_as_bytes arguments should have been validated");
+            }
+        }
+        break;
+      case ENC_CODERANGE_VALID:
+        /* A valid binary (ASCII-8BIT) string stays valid whatever is appended. */
+        keep_cr = (ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT);
+        break;
+      default:
+        break;
+    }
+
+    /* `types` may live in a buffer owned by t0; keep it alive until here. */
+    RB_GC_GUARD(t0);
+
+    if (!keep_cr) {
+        // If no fast path was hit, we clear the coderange.
+        // append_as_bytes is predominantly meant to be used in
+        // buffering situations, hence it's likely the coderange
+        // will never be scanned, so it's not worth spending time
+        // precomputing the coderange except for simple and common
+        // situations.
+        ENC_CODERANGE_CLEAR(str);
+    }
+    return str;
+}
+
 /*
 *  call-seq:
 *    string << object -> string
@ -12433,6 +12597,7 @@ Init_String(void)
    rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
    rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
    rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
+    rb_define_method(rb_cString, "append_as_bytes", rb_str_append_as_bytes, -1);
    rb_define_method(rb_cString, "<<", rb_str_concat, 1);
    rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
    rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
--- a/test/ruby/test_string.rb
+++ b/test/ruby/test_string.rb
@ -3630,6 +3630,55 @@ CODE
    assert_bytesplice_raise(ArgumentError, S("hello"), 0..-1, "bye", 0, 3)
  end

+  # Appending to a binary buffer keeps it binary, for ASCII and multibyte chunks alike.
+  def test_append_bytes_into_binary
+    buffer = S("".b)
+    assert_equal Encoding::BINARY, buffer.encoding
+
+    # Each row is [chunk to append, expected buffer content afterwards].
+    [
+      [S("hello"), "hello".b],
+      [S("こんにちは"), S("helloこんにちは".b)],
+    ].each do |chunk, expected|
+      buffer.append_as_bytes(chunk)
+      assert_equal expected, buffer
+      assert_equal Encoding::BINARY, buffer.encoding
+    end
+  end
+
+  # Appending to a UTF-8 buffer never changes its encoding; validity follows the raw bytes.
+  def test_append_bytes_into_utf8
+    buffer = S("")
+    assert_equal Encoding::UTF_8, buffer.encoding
+
+    # Appends the chunk, then checks the content and that the encoding is still UTF-8.
+    append_and_check = lambda do |chunk, expected|
+      buffer.append_as_bytes(chunk)
+      assert_equal expected, buffer
+      assert_equal Encoding::UTF_8, buffer.encoding
+    end
+
+    append_and_check.call(S("hello"), S("hello"))
+    assert_predicate buffer, :ascii_only?
+    assert_predicate buffer, :valid_encoding?
+
+    append_and_check.call(S("こんにちは"), S("helloこんにちは"))
+    refute_predicate buffer, :ascii_only?
+    assert_predicate buffer, :valid_encoding?
+
+    # First two bytes of the three-byte sequence for "€": the buffer is broken for now.
+    append_and_check.call(S("\xE2\x82".b), S("helloこんにちは\xE2\x82"))
+    refute_predicate buffer, :valid_encoding?
+
+    # The final byte completes the sequence and the buffer is valid again.
+    append_and_check.call(S("\xAC".b), S("helloこんにちは€"))
+    assert_predicate buffer, :valid_encoding?
+  end
+
+  # Three bytes appended to a UTF-32LE buffer keep its encoding but leave it invalid.
+  def test_append_bytes_into_utf32
+    utf32le = Encoding::UTF_32LE
+    buffer = S("abc".encode(utf32le))
+    assert_equal utf32le, buffer.encoding
+
+    buffer.append_as_bytes("def")
+    assert_equal utf32le, buffer.encoding
+    refute_predicate buffer, :valid_encoding?
+  end
+
  def test_chilled_string
    chilled_string = eval('"chilled"')