зеркало из https://github.com/github/ruby.git
string.c: Directly create strings with the correct encoding
While profiling msgpack-ruby I noticed a very substantial amount of time spent in `rb_enc_associate_index`, called by `rb_utf8_str_new`. On that benchmark, `rb_utf8_str_new` accounts for 33% of the total runtime, in large part because it causes GC to trigger often, but even then `5.3%` of the total runtime is spent in `rb_enc_associate_index` called by `rb_utf8_str_new`. After closer inspection, it appears that it performs a lot of safety checks that we can assert we don't need, as well as other useless extra operations, because strings are first created and filled as ASCII-8BIT and only later reassociated with the desired encoding. Directly allocating the string with the right encoding allows us to skip a lot of duplicated and useless operations. After this change, the time spent in `rb_utf8_str_new` is down to `28.4%` of the total runtime, and most of that is GC.
This commit is contained in:
Родитель
bfb4783c01
Коммит
fae86a701e
15
encoding.c
15
encoding.c
|
@ -967,6 +967,21 @@ enc_set_index(VALUE obj, int idx)
|
|||
rb_ivar_set(obj, rb_id_encoding(), INT2NUM(idx));
|
||||
}
|
||||
|
||||
void
|
||||
rb_enc_raw_set(VALUE obj, rb_encoding *enc)
|
||||
{
|
||||
RUBY_ASSERT(enc_capable(obj));
|
||||
|
||||
int idx = enc ? ENC_TO_ENCINDEX(enc) : 0;
|
||||
|
||||
if (idx < ENCODING_INLINE_MAX) {
|
||||
ENCODING_SET_INLINED(obj, idx);
|
||||
return;
|
||||
}
|
||||
ENCODING_SET_INLINED(obj, ENCODING_INLINE_MAX);
|
||||
rb_ivar_set(obj, rb_id_encoding(), INT2NUM(idx));
|
||||
}
|
||||
|
||||
void
|
||||
rb_enc_set_index(VALUE obj, int idx)
|
||||
{
|
||||
|
|
|
@ -28,6 +28,8 @@ int rb_encdb_dummy(const char *name);
|
|||
void rb_encdb_declare(const char *name);
|
||||
void rb_enc_set_base(const char *name, const char *orig);
|
||||
int rb_enc_set_dummy(int index);
|
||||
void rb_enc_raw_set(VALUE obj, rb_encoding *enc);
|
||||
|
||||
PUREFUNC(int rb_data_is_encoding(VALUE obj));
|
||||
|
||||
/* vm.c */
|
||||
|
|
69
string.c
69
string.c
|
@ -988,7 +988,7 @@ empty_str_alloc(VALUE klass)
|
|||
}
|
||||
|
||||
static VALUE
|
||||
str_new0(VALUE klass, const char *ptr, long len, int termlen)
|
||||
str_enc_new(VALUE klass, const char *ptr, long len, rb_encoding *enc)
|
||||
{
|
||||
VALUE str;
|
||||
|
||||
|
@ -996,12 +996,18 @@ str_new0(VALUE klass, const char *ptr, long len, int termlen)
|
|||
rb_raise(rb_eArgError, "negative string size (or size too big)");
|
||||
}
|
||||
|
||||
if (enc == NULL) {
|
||||
enc = rb_ascii8bit_encoding();
|
||||
}
|
||||
|
||||
RUBY_DTRACE_CREATE_HOOK(STRING, len);
|
||||
|
||||
int termlen = rb_enc_mbminlen(enc);
|
||||
|
||||
if (STR_EMBEDDABLE_P(len, termlen)) {
|
||||
str = str_alloc_embed(klass, len + termlen);
|
||||
if (len == 0) {
|
||||
ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
|
||||
ENC_CODERANGE_SET(str, rb_enc_asciicompat(enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
|
||||
}
|
||||
}
|
||||
else {
|
||||
|
@ -1013,9 +1019,13 @@ str_new0(VALUE klass, const char *ptr, long len, int termlen)
|
|||
RSTRING(str)->as.heap.ptr =
|
||||
rb_xmalloc_mul_add_mul(sizeof(char), len, sizeof(char), termlen);
|
||||
}
|
||||
|
||||
rb_enc_raw_set(str, enc);
|
||||
|
||||
if (ptr) {
|
||||
memcpy(RSTRING_PTR(str), ptr, len);
|
||||
}
|
||||
|
||||
STR_SET_LEN(str, len);
|
||||
TERM_FILL(RSTRING_PTR(str) + len, termlen);
|
||||
return str;
|
||||
|
@ -1024,7 +1034,7 @@ str_new0(VALUE klass, const char *ptr, long len, int termlen)
|
|||
static VALUE
|
||||
str_new(VALUE klass, const char *ptr, long len)
|
||||
{
|
||||
return str_new0(klass, ptr, len, 1);
|
||||
return str_enc_new(klass, ptr, len, rb_ascii8bit_encoding());
|
||||
}
|
||||
|
||||
VALUE
|
||||
|
@ -1036,29 +1046,19 @@ rb_str_new(const char *ptr, long len)
|
|||
VALUE
|
||||
rb_usascii_str_new(const char *ptr, long len)
|
||||
{
|
||||
VALUE str = rb_str_new(ptr, len);
|
||||
ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
|
||||
return str;
|
||||
return str_enc_new(rb_cString, ptr, len, rb_usascii_encoding());
|
||||
}
|
||||
|
||||
VALUE
|
||||
rb_utf8_str_new(const char *ptr, long len)
|
||||
{
|
||||
VALUE str = str_new(rb_cString, ptr, len);
|
||||
rb_enc_associate_index(str, rb_utf8_encindex());
|
||||
return str;
|
||||
return str_enc_new(rb_cString, ptr, len, rb_utf8_encoding());
|
||||
}
|
||||
|
||||
VALUE
|
||||
rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
|
||||
{
|
||||
VALUE str;
|
||||
|
||||
if (!enc) return rb_str_new(ptr, len);
|
||||
|
||||
str = str_new0(rb_cString, ptr, len, rb_enc_mbminlen(enc));
|
||||
rb_enc_associate(str, enc);
|
||||
return str;
|
||||
return str_enc_new(rb_cString, ptr, len, enc);
|
||||
}
|
||||
|
||||
VALUE
|
||||
|
@ -1076,17 +1076,13 @@ rb_str_new_cstr(const char *ptr)
|
|||
VALUE
|
||||
rb_usascii_str_new_cstr(const char *ptr)
|
||||
{
|
||||
VALUE str = rb_str_new_cstr(ptr);
|
||||
ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
|
||||
return str;
|
||||
return rb_enc_str_new_cstr(ptr, rb_usascii_encoding());
|
||||
}
|
||||
|
||||
VALUE
|
||||
rb_utf8_str_new_cstr(const char *ptr)
|
||||
{
|
||||
VALUE str = rb_str_new_cstr(ptr);
|
||||
rb_enc_associate_index(str, rb_utf8_encindex());
|
||||
return str;
|
||||
return rb_enc_str_new_cstr(ptr, rb_utf8_encoding());
|
||||
}
|
||||
|
||||
VALUE
|
||||
|
@ -1109,8 +1105,7 @@ str_new_static(VALUE klass, const char *ptr, long len, int encindex)
|
|||
}
|
||||
|
||||
if (!ptr) {
|
||||
rb_encoding *enc = rb_enc_get_from_index(encindex);
|
||||
str = str_new0(klass, ptr, len, rb_enc_mbminlen(enc));
|
||||
str = str_enc_new(klass, ptr, len, rb_enc_from_index(encindex));
|
||||
}
|
||||
else {
|
||||
RUBY_DTRACE_CREATE_HOOK(STRING, len);
|
||||
|
@ -1119,8 +1114,8 @@ str_new_static(VALUE klass, const char *ptr, long len, int encindex)
|
|||
RSTRING(str)->as.heap.ptr = (char *)ptr;
|
||||
RSTRING(str)->as.heap.aux.capa = len;
|
||||
RBASIC(str)->flags |= STR_NOFREE;
|
||||
rb_enc_associate_index(str, encindex);
|
||||
}
|
||||
rb_enc_associate_index(str, encindex);
|
||||
return str;
|
||||
}
|
||||
|
||||
|
@ -1570,10 +1565,11 @@ str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding)
|
|||
VALUE str;
|
||||
|
||||
long len = RSTRING_LEN(orig);
|
||||
rb_encoding *enc = copy_encoding ? STR_ENC_GET(orig) : rb_ascii8bit_encoding();
|
||||
int termlen = copy_encoding ? TERM_LEN(orig) : 1;
|
||||
|
||||
if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(len, termlen)) {
|
||||
str = str_new0(klass, RSTRING_PTR(orig), len, termlen);
|
||||
str = str_enc_new(klass, RSTRING_PTR(orig), len, enc);
|
||||
RUBY_ASSERT(STR_EMBED_P(str));
|
||||
}
|
||||
else {
|
||||
|
@ -1621,7 +1617,7 @@ str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding)
|
|||
VALUE
|
||||
rb_str_new_with_class(VALUE obj, const char *ptr, long len)
|
||||
{
|
||||
return str_new0(rb_obj_class(obj), ptr, len, TERM_LEN(obj));
|
||||
return str_enc_new(rb_obj_class(obj), ptr, len, STR_ENC_GET(obj));
|
||||
}
|
||||
|
||||
static VALUE
|
||||
|
@ -2083,8 +2079,6 @@ rb_str_s_new(int argc, VALUE *argv, VALUE klass)
|
|||
encoding = kwargs[0];
|
||||
capacity = kwargs[1];
|
||||
|
||||
int termlen = 1;
|
||||
|
||||
if (n == 1) {
|
||||
orig = StringValue(orig);
|
||||
}
|
||||
|
@ -2100,7 +2094,6 @@ rb_str_s_new(int argc, VALUE *argv, VALUE klass)
|
|||
|
||||
if (!UNDEF_P(encoding)) {
|
||||
enc = rb_to_encoding(encoding);
|
||||
termlen = rb_enc_mbminlen(enc);
|
||||
}
|
||||
|
||||
// If capacity is nil, we're basically just duping `orig`.
|
||||
|
@ -2131,13 +2124,9 @@ rb_str_s_new(int argc, VALUE *argv, VALUE klass)
|
|||
}
|
||||
}
|
||||
|
||||
VALUE str = str_new0(klass, NULL, capa, termlen);
|
||||
VALUE str = str_enc_new(klass, NULL, capa, enc);
|
||||
STR_SET_LEN(str, 0);
|
||||
TERM_FILL(RSTRING_PTR(str), termlen);
|
||||
|
||||
if (enc) {
|
||||
rb_enc_associate(str, enc);
|
||||
}
|
||||
TERM_FILL(RSTRING_PTR(str), enc ? rb_enc_mbmaxlen(enc) : 1);
|
||||
|
||||
if (!NIL_P(orig)) {
|
||||
rb_str_buf_append(str, orig);
|
||||
|
@ -2426,7 +2415,7 @@ rb_str_plus(VALUE str1, VALUE str2)
|
|||
if (len1 > LONG_MAX - len2) {
|
||||
rb_raise(rb_eArgError, "string size too big");
|
||||
}
|
||||
str3 = str_new0(rb_cString, 0, len1+len2, termlen);
|
||||
str3 = str_enc_new(rb_cString, 0, len1+len2, enc);
|
||||
ptr3 = RSTRING_PTR(str3);
|
||||
memcpy(ptr3, ptr1, len1);
|
||||
memcpy(ptr3+len1, ptr2, len2);
|
||||
|
@ -2521,7 +2510,7 @@ rb_str_times(VALUE str, VALUE times)
|
|||
|
||||
len *= RSTRING_LEN(str);
|
||||
termlen = TERM_LEN(str);
|
||||
str2 = str_new0(rb_cString, 0, len, termlen);
|
||||
str2 = str_enc_new(rb_cString, 0, len, STR_ENC_GET(str));
|
||||
ptr2 = RSTRING_PTR(str2);
|
||||
if (len) {
|
||||
n = RSTRING_LEN(str);
|
||||
|
@ -10887,7 +10876,7 @@ rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
|
|||
rb_raise(rb_eArgError, "argument too big");
|
||||
}
|
||||
len += size;
|
||||
res = str_new0(rb_cString, 0, len, termlen);
|
||||
res = str_enc_new(rb_cString, 0, len, enc);
|
||||
p = RSTRING_PTR(res);
|
||||
if (flen <= 1) {
|
||||
memset(p, *f, llen);
|
||||
|
@ -10923,7 +10912,7 @@ rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
|
|||
}
|
||||
TERM_FILL(p, termlen);
|
||||
STR_SET_LEN(res, p-RSTRING_PTR(res));
|
||||
rb_enc_associate(res, enc);
|
||||
|
||||
if (argc == 2)
|
||||
cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
|
||||
if (cr != ENC_CODERANGE_BROKEN)
|
||||
|
|
Загрузка…
Ссылка в новой задаче