string.c: Directly create strings with the correct encoding

While profiling msgpack-ruby I noticed a very substantial amout of time
spent in `rb_enc_associate_index`, called by `rb_utf8_str_new`.

On that benchmark, `rb_utf8_str_new` is 33% of the total runtime,
in big part because it cause GC to trigger often, but even then
`5.3%` of the total runtime is spent in `rb_enc_associate_index`
called by `rb_utf8_str_new`.

After closer inspection, it appears that it's performing a lot of
safety check we can assert we don't need, and other extra useless
operations, because strings are first created and filled as ASCII-8BIT
and then later reassociated to the desired encoding.

By directly allocating the string with the right encoding, it allow
to skip a lot of duplicated and useless operations.

After this change, the time spent in `rb_utf8_str_new` is down
to `28.4%` of total runtime, and most of that is GC.
This commit is contained in:
Jean Boussier 2024-11-13 09:34:45 +01:00
Родитель bfb4783c01
Коммит fae86a701e
3 изменённых файлов: 46 добавлений и 40 удалений

Просмотреть файл

@ -967,6 +967,21 @@ enc_set_index(VALUE obj, int idx)
rb_ivar_set(obj, rb_id_encoding(), INT2NUM(idx));
}
void
rb_enc_raw_set(VALUE obj, rb_encoding *enc)
{
RUBY_ASSERT(enc_capable(obj));
int idx = enc ? ENC_TO_ENCINDEX(enc) : 0;
if (idx < ENCODING_INLINE_MAX) {
ENCODING_SET_INLINED(obj, idx);
return;
}
ENCODING_SET_INLINED(obj, ENCODING_INLINE_MAX);
rb_ivar_set(obj, rb_id_encoding(), INT2NUM(idx));
}
void
rb_enc_set_index(VALUE obj, int idx)
{

Просмотреть файл

@ -28,6 +28,8 @@ int rb_encdb_dummy(const char *name);
void rb_encdb_declare(const char *name);
void rb_enc_set_base(const char *name, const char *orig);
int rb_enc_set_dummy(int index);
void rb_enc_raw_set(VALUE obj, rb_encoding *enc);
PUREFUNC(int rb_data_is_encoding(VALUE obj));
/* vm.c */

Просмотреть файл

@ -988,7 +988,7 @@ empty_str_alloc(VALUE klass)
}
static VALUE
str_new0(VALUE klass, const char *ptr, long len, int termlen)
str_enc_new(VALUE klass, const char *ptr, long len, rb_encoding *enc)
{
VALUE str;
@ -996,12 +996,18 @@ str_new0(VALUE klass, const char *ptr, long len, int termlen)
rb_raise(rb_eArgError, "negative string size (or size too big)");
}
if (enc == NULL) {
enc = rb_ascii8bit_encoding();
}
RUBY_DTRACE_CREATE_HOOK(STRING, len);
int termlen = rb_enc_mbminlen(enc);
if (STR_EMBEDDABLE_P(len, termlen)) {
str = str_alloc_embed(klass, len + termlen);
if (len == 0) {
ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
ENC_CODERANGE_SET(str, rb_enc_asciicompat(enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
}
}
else {
@ -1013,9 +1019,13 @@ str_new0(VALUE klass, const char *ptr, long len, int termlen)
RSTRING(str)->as.heap.ptr =
rb_xmalloc_mul_add_mul(sizeof(char), len, sizeof(char), termlen);
}
rb_enc_raw_set(str, enc);
if (ptr) {
memcpy(RSTRING_PTR(str), ptr, len);
}
STR_SET_LEN(str, len);
TERM_FILL(RSTRING_PTR(str) + len, termlen);
return str;
@ -1024,7 +1034,7 @@ str_new0(VALUE klass, const char *ptr, long len, int termlen)
static VALUE
str_new(VALUE klass, const char *ptr, long len)
{
return str_new0(klass, ptr, len, 1);
return str_enc_new(klass, ptr, len, rb_ascii8bit_encoding());
}
VALUE
@ -1036,29 +1046,19 @@ rb_str_new(const char *ptr, long len)
VALUE
rb_usascii_str_new(const char *ptr, long len)
{
VALUE str = rb_str_new(ptr, len);
ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
return str;
return str_enc_new(rb_cString, ptr, len, rb_usascii_encoding());
}
VALUE
rb_utf8_str_new(const char *ptr, long len)
{
VALUE str = str_new(rb_cString, ptr, len);
rb_enc_associate_index(str, rb_utf8_encindex());
return str;
return str_enc_new(rb_cString, ptr, len, rb_utf8_encoding());
}
VALUE
rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
{
VALUE str;
if (!enc) return rb_str_new(ptr, len);
str = str_new0(rb_cString, ptr, len, rb_enc_mbminlen(enc));
rb_enc_associate(str, enc);
return str;
return str_enc_new(rb_cString, ptr, len, enc);
}
VALUE
@ -1076,17 +1076,13 @@ rb_str_new_cstr(const char *ptr)
VALUE
rb_usascii_str_new_cstr(const char *ptr)
{
VALUE str = rb_str_new_cstr(ptr);
ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
return str;
return rb_enc_str_new_cstr(ptr, rb_usascii_encoding());
}
VALUE
rb_utf8_str_new_cstr(const char *ptr)
{
VALUE str = rb_str_new_cstr(ptr);
rb_enc_associate_index(str, rb_utf8_encindex());
return str;
return rb_enc_str_new_cstr(ptr, rb_utf8_encoding());
}
VALUE
@ -1109,8 +1105,7 @@ str_new_static(VALUE klass, const char *ptr, long len, int encindex)
}
if (!ptr) {
rb_encoding *enc = rb_enc_get_from_index(encindex);
str = str_new0(klass, ptr, len, rb_enc_mbminlen(enc));
str = str_enc_new(klass, ptr, len, rb_enc_from_index(encindex));
}
else {
RUBY_DTRACE_CREATE_HOOK(STRING, len);
@ -1119,8 +1114,8 @@ str_new_static(VALUE klass, const char *ptr, long len, int encindex)
RSTRING(str)->as.heap.ptr = (char *)ptr;
RSTRING(str)->as.heap.aux.capa = len;
RBASIC(str)->flags |= STR_NOFREE;
rb_enc_associate_index(str, encindex);
}
rb_enc_associate_index(str, encindex);
return str;
}
@ -1570,10 +1565,11 @@ str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding)
VALUE str;
long len = RSTRING_LEN(orig);
rb_encoding *enc = copy_encoding ? STR_ENC_GET(orig) : rb_ascii8bit_encoding();
int termlen = copy_encoding ? TERM_LEN(orig) : 1;
if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(len, termlen)) {
str = str_new0(klass, RSTRING_PTR(orig), len, termlen);
str = str_enc_new(klass, RSTRING_PTR(orig), len, enc);
RUBY_ASSERT(STR_EMBED_P(str));
}
else {
@ -1621,7 +1617,7 @@ str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding)
VALUE
rb_str_new_with_class(VALUE obj, const char *ptr, long len)
{
return str_new0(rb_obj_class(obj), ptr, len, TERM_LEN(obj));
return str_enc_new(rb_obj_class(obj), ptr, len, STR_ENC_GET(obj));
}
static VALUE
@ -2083,8 +2079,6 @@ rb_str_s_new(int argc, VALUE *argv, VALUE klass)
encoding = kwargs[0];
capacity = kwargs[1];
int termlen = 1;
if (n == 1) {
orig = StringValue(orig);
}
@ -2100,7 +2094,6 @@ rb_str_s_new(int argc, VALUE *argv, VALUE klass)
if (!UNDEF_P(encoding)) {
enc = rb_to_encoding(encoding);
termlen = rb_enc_mbminlen(enc);
}
// If capacity is nil, we're basically just duping `orig`.
@ -2131,13 +2124,9 @@ rb_str_s_new(int argc, VALUE *argv, VALUE klass)
}
}
VALUE str = str_new0(klass, NULL, capa, termlen);
VALUE str = str_enc_new(klass, NULL, capa, enc);
STR_SET_LEN(str, 0);
TERM_FILL(RSTRING_PTR(str), termlen);
if (enc) {
rb_enc_associate(str, enc);
}
TERM_FILL(RSTRING_PTR(str), enc ? rb_enc_mbmaxlen(enc) : 1);
if (!NIL_P(orig)) {
rb_str_buf_append(str, orig);
@ -2426,7 +2415,7 @@ rb_str_plus(VALUE str1, VALUE str2)
if (len1 > LONG_MAX - len2) {
rb_raise(rb_eArgError, "string size too big");
}
str3 = str_new0(rb_cString, 0, len1+len2, termlen);
str3 = str_enc_new(rb_cString, 0, len1+len2, enc);
ptr3 = RSTRING_PTR(str3);
memcpy(ptr3, ptr1, len1);
memcpy(ptr3+len1, ptr2, len2);
@ -2521,7 +2510,7 @@ rb_str_times(VALUE str, VALUE times)
len *= RSTRING_LEN(str);
termlen = TERM_LEN(str);
str2 = str_new0(rb_cString, 0, len, termlen);
str2 = str_enc_new(rb_cString, 0, len, STR_ENC_GET(str));
ptr2 = RSTRING_PTR(str2);
if (len) {
n = RSTRING_LEN(str);
@ -10887,7 +10876,7 @@ rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
rb_raise(rb_eArgError, "argument too big");
}
len += size;
res = str_new0(rb_cString, 0, len, termlen);
res = str_enc_new(rb_cString, 0, len, enc);
p = RSTRING_PTR(res);
if (flen <= 1) {
memset(p, *f, llen);
@ -10923,7 +10912,7 @@ rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
}
TERM_FILL(p, termlen);
STR_SET_LEN(res, p-RSTRING_PTR(res));
rb_enc_associate(res, enc);
if (argc == 2)
cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
if (cr != ENC_CODERANGE_BROKEN)