string.c: Directly create strings with the correct encoding

While profiling msgpack-ruby I noticed a very substantial amount of time spent in `rb_enc_associate_index`, called by `rb_utf8_str_new`. On that benchmark, `rb_utf8_str_new` is 33% of the total runtime, in large part because it causes GC to trigger often, but even then `5.3%` of the total runtime is spent in `rb_enc_associate_index` called by `rb_utf8_str_new`. After closer inspection, it appears that it performs a lot of safety checks that we can assert we don't need, as well as other useless extra operations, because strings are first created and filled as ASCII-8BIT and only later reassociated with the desired encoding. Directly allocating the string with the right encoding allows us to skip a lot of duplicated and useless operations. After this change, the time spent in `rb_utf8_str_new` is down to `28.4%` of total runtime, and most of that is GC.
2024-11-13 09:34:45 +01:00 · 2024-11-13 09:34:45 +01:00 · fae86a701e
--- a/encoding.c
+++ b/encoding.c
@ -967,6 +967,21 @@ enc_set_index(VALUE obj, int idx)
    rb_ivar_set(obj, rb_id_encoding(), INT2NUM(idx));
 }

+void
+rb_enc_raw_set(VALUE obj, rb_encoding *enc)
+{
+    RUBY_ASSERT(enc_capable(obj));
+
+    int idx = enc ? ENC_TO_ENCINDEX(enc) : 0;
+
+    if (idx < ENCODING_INLINE_MAX) {
+        ENCODING_SET_INLINED(obj, idx);
+        return;
+    }
+    ENCODING_SET_INLINED(obj, ENCODING_INLINE_MAX);
+    rb_ivar_set(obj, rb_id_encoding(), INT2NUM(idx));
+}
+
 void
 rb_enc_set_index(VALUE obj, int idx)
 {
--- a/internal/encoding.h
+++ b/internal/encoding.h
@ -28,6 +28,8 @@ int rb_encdb_dummy(const char *name);
 void rb_encdb_declare(const char *name);
 void rb_enc_set_base(const char *name, const char *orig);
 int rb_enc_set_dummy(int index);
+void rb_enc_raw_set(VALUE obj, rb_encoding *enc);
+
 PUREFUNC(int rb_data_is_encoding(VALUE obj));

 /* vm.c */
--- a/string.c
+++ b/string.c
@ -988,7 +988,7 @@ empty_str_alloc(VALUE klass)
 }

 static VALUE
-str_new0(VALUE klass, const char *ptr, long len, int termlen)
+str_enc_new(VALUE klass, const char *ptr, long len, rb_encoding *enc)
 {
    VALUE str;

@ -996,12 +996,18 @@ str_new0(VALUE klass, const char *ptr, long len, int termlen)
        rb_raise(rb_eArgError, "negative string size (or size too big)");
    }

+    if (enc == NULL) {
+        enc = rb_ascii8bit_encoding();
+    }
+
    RUBY_DTRACE_CREATE_HOOK(STRING, len);

+    int termlen = rb_enc_mbminlen(enc);
+
    if (STR_EMBEDDABLE_P(len, termlen)) {
        str = str_alloc_embed(klass, len + termlen);
        if (len == 0) {
-            ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
+            ENC_CODERANGE_SET(str, rb_enc_asciicompat(enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
        }
    }
    else {
@ -1013,9 +1019,13 @@ str_new0(VALUE klass, const char *ptr, long len, int termlen)
        RSTRING(str)->as.heap.ptr =
            rb_xmalloc_mul_add_mul(sizeof(char), len, sizeof(char), termlen);
    }
+
+    rb_enc_raw_set(str, enc);
+
    if (ptr) {
        memcpy(RSTRING_PTR(str), ptr, len);
    }
+
    STR_SET_LEN(str, len);
    TERM_FILL(RSTRING_PTR(str) + len, termlen);
    return str;
@ -1024,7 +1034,7 @@ str_new0(VALUE klass, const char *ptr, long len, int termlen)
 static VALUE
 str_new(VALUE klass, const char *ptr, long len)
 {
-    return str_new0(klass, ptr, len, 1);
+    return str_enc_new(klass, ptr, len, rb_ascii8bit_encoding());
 }

 VALUE
@ -1036,29 +1046,19 @@ rb_str_new(const char *ptr, long len)
 VALUE
 rb_usascii_str_new(const char *ptr, long len)
 {
-    VALUE str = rb_str_new(ptr, len);
-    ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
-    return str;
+    return str_enc_new(rb_cString, ptr, len, rb_usascii_encoding());
 }

 VALUE
 rb_utf8_str_new(const char *ptr, long len)
 {
-    VALUE str = str_new(rb_cString, ptr, len);
-    rb_enc_associate_index(str, rb_utf8_encindex());
-    return str;
+    return str_enc_new(rb_cString, ptr, len, rb_utf8_encoding());
 }

 VALUE
 rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
 {
-    VALUE str;
-
-    if (!enc) return rb_str_new(ptr, len);
-
-    str = str_new0(rb_cString, ptr, len, rb_enc_mbminlen(enc));
-    rb_enc_associate(str, enc);
-    return str;
+    return str_enc_new(rb_cString, ptr, len, enc);
 }

 VALUE
@ -1076,17 +1076,13 @@ rb_str_new_cstr(const char *ptr)
 VALUE
 rb_usascii_str_new_cstr(const char *ptr)
 {
-    VALUE str = rb_str_new_cstr(ptr);
-    ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
-    return str;
+    return rb_enc_str_new_cstr(ptr, rb_usascii_encoding());
 }

 VALUE
 rb_utf8_str_new_cstr(const char *ptr)
 {
-    VALUE str = rb_str_new_cstr(ptr);
-    rb_enc_associate_index(str, rb_utf8_encindex());
-    return str;
+    return rb_enc_str_new_cstr(ptr, rb_utf8_encoding());
 }

 VALUE
@ -1109,8 +1105,7 @@ str_new_static(VALUE klass, const char *ptr, long len, int encindex)
    }

    if (!ptr) {
-        rb_encoding *enc = rb_enc_get_from_index(encindex);
-        str = str_new0(klass, ptr, len, rb_enc_mbminlen(enc));
+        str = str_enc_new(klass, ptr, len, rb_enc_from_index(encindex));
    }
    else {
        RUBY_DTRACE_CREATE_HOOK(STRING, len);
@ -1119,8 +1114,8 @@ str_new_static(VALUE klass, const char *ptr, long len, int encindex)
        RSTRING(str)->as.heap.ptr = (char *)ptr;
        RSTRING(str)->as.heap.aux.capa = len;
        RBASIC(str)->flags |= STR_NOFREE;
+        rb_enc_associate_index(str, encindex);
    }
-    rb_enc_associate_index(str, encindex);
    return str;
 }

@ -1570,10 +1565,11 @@ str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding)
    VALUE str;

    long len = RSTRING_LEN(orig);
+    rb_encoding *enc = copy_encoding ? STR_ENC_GET(orig) : rb_ascii8bit_encoding();
    int termlen = copy_encoding ? TERM_LEN(orig) : 1;

    if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(len, termlen)) {
-        str = str_new0(klass, RSTRING_PTR(orig), len, termlen);
+        str = str_enc_new(klass, RSTRING_PTR(orig), len, enc);
        RUBY_ASSERT(STR_EMBED_P(str));
    }
    else {
@ -1621,7 +1617,7 @@ str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding)
 VALUE
 rb_str_new_with_class(VALUE obj, const char *ptr, long len)
 {
-    return str_new0(rb_obj_class(obj), ptr, len, TERM_LEN(obj));
+    return str_enc_new(rb_obj_class(obj), ptr, len, STR_ENC_GET(obj));
 }

 static VALUE
@ -2083,8 +2079,6 @@ rb_str_s_new(int argc, VALUE *argv, VALUE klass)
    encoding = kwargs[0];
    capacity = kwargs[1];

-    int termlen = 1;
-
    if (n == 1) {
        orig = StringValue(orig);
    }
@ -2100,7 +2094,6 @@ rb_str_s_new(int argc, VALUE *argv, VALUE klass)

    if (!UNDEF_P(encoding)) {
        enc = rb_to_encoding(encoding);
-        termlen = rb_enc_mbminlen(enc);
    }

    // If capacity is nil, we're basically just duping `orig`.
@ -2131,13 +2124,9 @@ rb_str_s_new(int argc, VALUE *argv, VALUE klass)
        }
    }

-    VALUE str = str_new0(klass, NULL, capa, termlen);
+    VALUE str = str_enc_new(klass, NULL, capa, enc);
    STR_SET_LEN(str, 0);
-    TERM_FILL(RSTRING_PTR(str), termlen);
-
-    if (enc) {
-        rb_enc_associate(str, enc);
-    }
+    TERM_FILL(RSTRING_PTR(str), enc ? rb_enc_mbmaxlen(enc) : 1);

    if (!NIL_P(orig)) {
        rb_str_buf_append(str, orig);
@ -2426,7 +2415,7 @@ rb_str_plus(VALUE str1, VALUE str2)
    if (len1 > LONG_MAX - len2) {
        rb_raise(rb_eArgError, "string size too big");
    }
-    str3 = str_new0(rb_cString, 0, len1+len2, termlen);
+    str3 = str_enc_new(rb_cString, 0, len1+len2, enc);
    ptr3 = RSTRING_PTR(str3);
    memcpy(ptr3, ptr1, len1);
    memcpy(ptr3+len1, ptr2, len2);
@ -2521,7 +2510,7 @@ rb_str_times(VALUE str, VALUE times)

    len *= RSTRING_LEN(str);
    termlen = TERM_LEN(str);
-    str2 = str_new0(rb_cString, 0, len, termlen);
+    str2 = str_enc_new(rb_cString, 0, len, STR_ENC_GET(str));
    ptr2 = RSTRING_PTR(str2);
    if (len) {
        n = RSTRING_LEN(str);
@ -10887,7 +10876,7 @@ rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
       rb_raise(rb_eArgError, "argument too big");
    }
    len += size;
-    res = str_new0(rb_cString, 0, len, termlen);
+    res = str_enc_new(rb_cString, 0, len, enc);
    p = RSTRING_PTR(res);
    if (flen <= 1) {
       memset(p, *f, llen);
@ -10923,7 +10912,7 @@ rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
    }
    TERM_FILL(p, termlen);
    STR_SET_LEN(res, p-RSTRING_PTR(res));
-    rb_enc_associate(res, enc);
+
    if (argc == 2)
        cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
    if (cr != ENC_CODERANGE_BROKEN)