Implement String#append_as_bytes(String | Integer, ...)

[Feature #20594]

A handy method to construct a string out of multiple chunks.

Contrary to `String#concat`, it doesn't do any encoding negotiation,
and simply appends the content as bytes regardless of whether this
results in a broken string or not.

It's the caller's responsibility to check for `String#valid_encoding?`
in cases where it's needed.

When passed integers, only the lower byte is considered, like in
`String#setbyte`.
This commit is contained in:
Jean Boussier 2024-08-01 11:14:19 +02:00
Родитель 966901b39d
Коммит 16f241f0aa
3 изменённых файлов: 260 добавлений и 0 удалений

Просмотреть файл

@ -0,0 +1,46 @@
require_relative '../../spec_helper'
# Specs for String#append_as_bytes: raw byte concatenation that never
# negotiates, converts, or validates encodings.
describe "String#append_as_bytes" do
  ruby_version_is "3.4" do
    it "doesn't allow to mutate frozen strings" do
      str = "hello".freeze
      -> { str.append_as_bytes("\xE2\x82") }.should raise_error(FrozenError)
    end

    it "allows creating broken strings" do
      str = +"hello"
      # "\xE2\x82" is only the first two bytes of the three-byte UTF-8 sequence for "€".
      str.append_as_bytes("\xE2\x82")
      str.valid_encoding?.should == false

      # Appending the final byte completes the character.
      str.append_as_bytes("\xAC")
      str.valid_encoding?.should == true

      # Three single bytes appended to a UTF-32LE string leave a partial code unit.
      str = "abc".encode(Encoding::UTF_32LE)
      str.append_as_bytes("def")
      str.encoding.should == Encoding::UTF_32LE
      str.valid_encoding?.should == false
    end

    it "never changes the receiver encoding" do
      str = "".b
      str.append_as_bytes("")
      str.encoding.should == Encoding::BINARY
    end

    it "accepts variadic String or Integer arguments" do
      str = "hello".b
      # 12 and 43 are appended as the single bytes "\f" and "+".
      str.append_as_bytes("\xE2\x82", 12, 43, "\xAC")
      str.encoding.should == Encoding::BINARY
      str.should == "hello\xE2\x82\f+\xAC".b
    end

    it "only accepts strings or integers, and doesn't attempt to cast with #to_str or #to_int" do
      to_str = mock("to_str")
      to_str.should_not_receive(:to_str)
      to_str.should_not_receive(:to_int)

      str = +"hello"
      -> { str.append_as_bytes(to_str) }.should raise_error(TypeError, "wrong argument type MockObject (expected String or Integer)")
    end
  end
end

165
string.c
Просмотреть файл

@ -3308,6 +3308,32 @@ rb_str_resize(VALUE str, long len)
return str;
}
/*
 * Prepares +str+ for an in-place append of +len+ more bytes.
 *
 * Calls str_modify_keep_cr() on +str+ first, then makes sure its capacity
 * can hold the current length plus +len+.  Raises ArgumentError when the
 * combined length would not fit in a long.  When growth is required the
 * capacity is repeatedly doubled (terminator included) until it is large
 * enough, except for totals of LONG_MAX / 2 or more, where exactly the
 * requested total is used.
 */
static void
str_ensure_available_capa(VALUE str, long len)
{
    str_modify_keep_cr(str);

    const int termlen = TERM_LEN(str);
    const long current_len = RSTRING_LEN(str);

    if (RB_UNLIKELY(current_len > LONG_MAX - len)) {
        rb_raise(rb_eArgError, "string sizes too big");
    }

    const long required = current_len + len;
    long new_capa = str_capacity(str, termlen);

    /* Already roomy enough: nothing to resize. */
    if (new_capa >= required) {
        return;
    }

    if (required >= LONG_MAX / 2) {
        /* Doubling could overflow, so settle for the exact size. */
        new_capa = required;
    }
    while (new_capa < required) {
        new_capa = 2 * new_capa + termlen; /* == 2*(capa+termlen)-termlen */
    }

    RESIZE_CAPA_TERM(str, new_capa, termlen);
}
static VALUE
str_buf_cat4(VALUE str, const char *ptr, long len, bool keep_cr)
{
@ -3662,6 +3688,144 @@ rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
return str;
}
/*
 *  call-seq:
 *    append_as_bytes(*objects) -> string
 *
 *  Concatenates each object in +objects+ into +self+ without any encoding
 *  validation or conversion and returns +self+:
 *
 *    s = 'foo'
 *    s.append_as_bytes(" \xE2\x82")  # => "foo \xE2\x82"
 *    s.valid_encoding?               # => false
 *    s.append_as_bytes("\xAC 12")
 *    s.valid_encoding?               # => true
 *
 *  For each given object +object+ that is an Integer,
 *  the value is considered a byte. If the Integer is bigger
 *  than one byte, only the lower byte is considered, similar to String#setbyte:
 *
 *    s = ""
 *    s.append_as_bytes(0, 257)       # => "\u0000\u0001"
 *
 *  Raises TypeError if any object is neither a String nor an Integer;
 *  no implicit conversion is attempted.
 *
 *  Related: String#<<, String#concat, which do an encoding aware concatenation.
 */

VALUE
rb_str_append_as_bytes(int argc, VALUE *argv, VALUE str)
{
    long needed_capacity = 0;
    volatile VALUE t0;
    enum ruby_value_type *types = ALLOCV_N(enum ruby_value_type, t0, argc);

    /*
     * Pass 1: validate every argument, remember its type and sum up the
     * number of bytes to append, so the receiver is resized at most once
     * and nothing is written before all arguments are known to be valid.
     */
    for (int index = 0; index < argc; index++) {
        VALUE obj = argv[index];
        enum ruby_value_type type = types[index] = rb_type(obj);
        switch (type) {
          case T_FIXNUM:
          case T_BIGNUM:
            needed_capacity++;
            break;
          case T_STRING:
            needed_capacity += RSTRING_LEN(obj);
            break;
          default:
            rb_raise(
                rb_eTypeError,
                "wrong argument type %"PRIsVALUE" (expected String or Integer)",
                rb_obj_class(obj)
            );
            break;
        }
    }

    str_ensure_available_capa(str, needed_capacity);
    char *sptr = RSTRING_END(str);

    /* Pass 2: copy the raw bytes right after the current end of the receiver. */
    for (int index = 0; index < argc; index++) {
        VALUE obj = argv[index];
        enum ruby_value_type type = types[index];
        switch (type) {
          case T_FIXNUM:
          case T_BIGNUM: {
            /*
             * Keep only the low byte.  The masked value is stored back in
             * argv so the coderange scan below sees the byte actually written.
             */
            argv[index] = obj = rb_int_and(obj, INT2FIX(0xff));
            char byte = (char)(NUM2INT(obj) & 0xFF);
            *sptr = byte;
            sptr++;
            break;
          }
          case T_STRING: {
            const char *ptr;
            long len;
            RSTRING_GETMEM(obj, ptr, len);
            memcpy(sptr, ptr, len);
            sptr += len;
            break;
          }
          default:
            rb_bug("append_as_bytes arguments should have been validated");
        }
    }

    STR_SET_LEN(str, RSTRING_LEN(str) + needed_capacity);
    TERM_FILL(sptr, TERM_LEN(str)); /* sentinel */

    switch (ENC_CODERANGE(str)) {
      case ENC_CODERANGE_7BIT:
        /*
         * Fast path: a 7-bit receiver stays 7-bit as long as every
         * appended chunk is known to be 7-bit as well.
         */
        for (int index = 0; index < argc; index++) {
            VALUE obj = argv[index];
            switch (types[index]) {
              case T_FIXNUM:
              case T_BIGNUM:
                if (!ISASCII(NUM2INT(obj))) {
                    goto clear_cr;
                }
                break;
              case T_STRING:
                if (ENC_CODERANGE(obj) != ENC_CODERANGE_7BIT) {
                    goto clear_cr;
                }
                break;
              default:
                rb_bug("append_as_bytes arguments should have been validated");
            }
        }
        goto keep_cr;
      case ENC_CODERANGE_VALID:
        /* Fast path: the receiver's VALID coderange is kept for binary strings. */
        if (ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT) {
            goto keep_cr;
        }
        goto clear_cr;
      default:
        goto clear_cr;
    }

  clear_cr:
    // If no fast path was hit, we clear the coderange.
    // append_as_bytes is predominantly meant to be used in
    // buffering situation, hence it's likely the coderange
    // will never be scanned, so it's not worth spending time
    // precomputing the coderange except for simple and common
    // situations.
    ENC_CODERANGE_CLEAR(str);
  keep_cr:
    /* Placed on the common exit so every path keeps the types buffer alive. */
    RB_GC_GUARD(t0);
    return str;
}
/*
* call-seq:
* string << object -> string
@ -12433,6 +12597,7 @@ Init_String(void)
rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
rb_define_method(rb_cString, "append_as_bytes", rb_str_append_as_bytes, -1);
rb_define_method(rb_cString, "<<", rb_str_concat, 1);
rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);

Просмотреть файл

@ -3630,6 +3630,55 @@ CODE
assert_bytesplice_raise(ArgumentError, S("hello"), 0..-1, "bye", 0, 3)
end
def test_append_bytes_into_binary
  # A binary receiver must stay binary no matter what gets appended.
  buffer = S("".b)
  assert_equal(Encoding::BINARY, buffer.encoding)

  # ASCII-only chunk.
  buffer.append_as_bytes(S("hello"))
  assert_equal("hello".b, buffer)
  assert_equal(Encoding::BINARY, buffer.encoding)

  # Multibyte chunk: its bytes are copied verbatim.
  buffer.append_as_bytes(S("こんにちは"))
  assert_equal(S("helloこんにちは".b), buffer)
  assert_equal(Encoding::BINARY, buffer.encoding)
end
def test_append_bytes_into_utf8
  # The receiver keeps UTF-8 throughout; only its validity changes.
  buffer = S("")
  assert_equal(Encoding::UTF_8, buffer.encoding)

  # ASCII-only content.
  buffer.append_as_bytes(S("hello"))
  assert_equal(S("hello"), buffer)
  assert_equal(Encoding::UTF_8, buffer.encoding)
  assert_predicate(buffer, :ascii_only?)
  assert_predicate(buffer, :valid_encoding?)

  # Well-formed multibyte content.
  buffer.append_as_bytes(S("こんにちは"))
  assert_equal(S("helloこんにちは"), buffer)
  assert_equal(Encoding::UTF_8, buffer.encoding)
  refute_predicate(buffer, :ascii_only?)
  assert_predicate(buffer, :valid_encoding?)

  # First two bytes of "€": the string is now broken.
  buffer.append_as_bytes(S("\xE2\x82".b))
  assert_equal(S("helloこんにちは\xE2\x82"), buffer)
  assert_equal(Encoding::UTF_8, buffer.encoding)
  refute_predicate(buffer, :valid_encoding?)

  # Final byte of "€": the string is valid again.
  buffer.append_as_bytes(S("\xAC".b))
  assert_equal(S("helloこんにちは€"), buffer)
  assert_equal(Encoding::UTF_8, buffer.encoding)
  assert_predicate(buffer, :valid_encoding?)
end
def test_append_bytes_into_utf32
  # Three single bytes appended to a UTF-32LE string: the encoding is
  # untouched but the result is no longer valid.
  buffer = S("abc".encode(Encoding::UTF_32LE))
  assert_equal(Encoding::UTF_32LE, buffer.encoding)

  buffer.append_as_bytes("def")
  assert_equal(Encoding::UTF_32LE, buffer.encoding)
  refute_predicate(buffer, :valid_encoding?)
end
def test_chilled_string
chilled_string = eval('"chilled"')