[ruby/json] json_string_unescape: Use the returned RString as buffer

Rather than to copy into a buffer to unescape and then copy that
buffer into the final string, we can directly copy into the final
string.

The downside is that if the string contains a lot of escaping, we
end up returning a string that's larger than strictly necessary, but
it's probably fine.

Before:

```
== Parsing twitter.json (567916 bytes)
ruby 3.3.4 (2024-07-09 revision https://github.com/ruby/json/commit/be1089c8ec) +YJIT [arm64-darwin23]
Warming up --------------------------------------
                json    56.000 i/100ms
                  oj    58.000 i/100ms
           oj strict    74.000 i/100ms
          Oj::Parser    76.000 i/100ms
           rapidjson    52.000 i/100ms
Calculating -------------------------------------
                json    556.659 (± 2.9%) i/s    (1.80 ms/i) -      2.800k in   5.034719s
                  oj    604.077 (± 3.8%) i/s    (1.66 ms/i) -      3.016k in   5.001546s
           oj strict    706.942 (± 3.5%) i/s    (1.41 ms/i) -      3.552k in   5.030954s
          Oj::Parser    752.917 (± 3.2%) i/s    (1.33 ms/i) -      3.800k in   5.052707s
           rapidjson    546.470 (± 3.5%) i/s    (1.83 ms/i) -      2.756k in   5.049855s

Comparison:
                json:      556.7 i/s
          Oj::Parser:      752.9 i/s - 1.35x  faster
           oj strict:      706.9 i/s - 1.27x  faster
                  oj:      604.1 i/s - 1.09x  faster
           rapidjson:      546.5 i/s - same-ish: difference falls within error

== Parsing citm_catalog.json (1727030 bytes)
ruby 3.3.4 (2024-07-09 revision https://github.com/ruby/json/commit/be1089c8ec) +YJIT [arm64-darwin23]
Warming up --------------------------------------
                json    29.000 i/100ms
                  oj    32.000 i/100ms
           oj strict    38.000 i/100ms
          Oj::Parser    42.000 i/100ms
           rapidjson    38.000 i/100ms
Calculating -------------------------------------
                json    317.858 (± 3.1%) i/s    (3.15 ms/i) -      1.595k in   5.023245s
                  oj    348.168 (± 2.6%) i/s    (2.87 ms/i) -      1.760k in   5.058431s
           oj strict    394.599 (± 2.8%) i/s    (2.53 ms/i) -      1.976k in   5.012073s
          Oj::Parser    403.771 (± 3.0%) i/s    (2.48 ms/i) -      2.058k in   5.101578s
           rapidjson    383.441 (± 3.7%) i/s    (2.61 ms/i) -      1.938k in   5.061355s

Comparison:
                json:      317.9 i/s
          Oj::Parser:      403.8 i/s - 1.27x  faster
           oj strict:      394.6 i/s - 1.24x  faster
           rapidjson:      383.4 i/s - 1.21x  faster
                  oj:      348.2 i/s - 1.10x  faster
```

After:

```
== Parsing twitter.json (567916 bytes)
ruby 3.3.4 (2024-07-09 revision https://github.com/ruby/json/commit/be1089c8ec) +YJIT [arm64-darwin23]
Warming up --------------------------------------
                json    56.000 i/100ms
                  oj    62.000 i/100ms
           oj strict    73.000 i/100ms
          Oj::Parser    76.000 i/100ms
           rapidjson    54.000 i/100ms
Calculating -------------------------------------
                json    561.009 (± 7.5%) i/s    (1.78 ms/i) -      2.800k in   5.039548s
                  oj    601.124 (± 4.3%) i/s    (1.66 ms/i) -      3.038k in   5.064686s
           oj strict    707.455 (± 3.4%) i/s    (1.41 ms/i) -      3.577k in   5.062540s
          Oj::Parser    751.799 (± 3.1%) i/s    (1.33 ms/i) -      3.800k in   5.059509s
           rapidjson    535.641 (± 3.2%) i/s    (1.87 ms/i) -      2.700k in   5.045816s

Comparison:
                json:      561.0 i/s
          Oj::Parser:      751.8 i/s - 1.34x  faster
           oj strict:      707.5 i/s - 1.26x  faster
                  oj:      601.1 i/s - same-ish: difference falls within error
           rapidjson:      535.6 i/s - same-ish: difference falls within error

== Parsing citm_catalog.json (1727030 bytes)
ruby 3.3.4 (2024-07-09 revision https://github.com/ruby/json/commit/be1089c8ec) +YJIT [arm64-darwin23]
Warming up --------------------------------------
                json    30.000 i/100ms
                  oj    32.000 i/100ms
           oj strict    36.000 i/100ms
          Oj::Parser    42.000 i/100ms
           rapidjson    39.000 i/100ms
Calculating -------------------------------------
                json    313.248 (± 7.3%) i/s    (3.19 ms/i) -      1.560k in   5.014118s
                  oj    341.977 (± 4.1%) i/s    (2.92 ms/i) -      1.728k in   5.063332s
           oj strict    387.062 (± 6.2%) i/s    (2.58 ms/i) -      1.944k in   5.045961s
          Oj::Parser    400.423 (± 4.0%) i/s    (2.50 ms/i) -      2.016k in   5.044513s
           rapidjson    379.046 (± 6.1%) i/s    (2.64 ms/i) -      1.911k in   5.064461s

Comparison:
                json:      313.2 i/s
          Oj::Parser:      400.4 i/s - 1.28x  faster
           oj strict:      387.1 i/s - 1.24x  faster
           rapidjson:      379.0 i/s - 1.21x  faster
                  oj:      342.0 i/s - same-ish: difference falls within error
```

https://github.com/ruby/json/commit/5e1ec4a268
This commit is contained in:
Jean Boussier 2024-10-31 15:22:38 +01:00 коммит произвёл Hiroshi SHIBATA
Родитель b85a7a44fa
Коммит ee4fa4ccee
2 изменённых файлов: 33 добавлений и 67 удалений

Просмотреть файл

@ -1476,10 +1476,8 @@ static inline VALUE build_string(const char *start, const char *end, bool intern
return result;
}
static const size_t MAX_STACK_BUFFER_SIZE = 128;
static VALUE json_string_unescape(char *string, char *stringEnd, bool intern, bool symbolize)
{
VALUE result = Qnil;
size_t bufferSize = stringEnd - string;
char *p = string, *pe = string, *unescape, *bufferStart, *buffer;
int unescape_len;
@ -1490,19 +1488,9 @@ static VALUE json_string_unescape(char *string, char *stringEnd, bool intern, bo
return build_string(string, stringEnd, intern, symbolize);
}
if (bufferSize > MAX_STACK_BUFFER_SIZE) {
# ifdef HAVE_RB_ENC_INTERNED_STR
bufferStart = buffer = ALLOC_N(char, bufferSize ? bufferSize : 1);
# else
bufferStart = buffer = ALLOC_N(char, bufferSize);
# endif
} else {
# ifdef HAVE_RB_ENC_INTERNED_STR
bufferStart = buffer = ALLOCA_N(char, bufferSize ? bufferSize : 1);
# else
bufferStart = buffer = ALLOCA_N(char, bufferSize);
# endif
}
VALUE result = rb_str_buf_new(bufferSize);
rb_enc_associate_index(result, utf8_encindex);
buffer = bufferStart = RSTRING_PTR(result);
while (pe < stringEnd) {
if (*pe == '\\') {
@ -1536,9 +1524,6 @@ static VALUE json_string_unescape(char *string, char *stringEnd, bool intern, bo
break;
case 'u':
if (pe > stringEnd - 4) {
if (bufferSize > MAX_STACK_BUFFER_SIZE) {
ruby_xfree(bufferStart);
}
raise_parse_error("incomplete unicode character escape sequence at '%s'", p);
} else {
uint32_t ch = unescape_unicode((unsigned char *) ++pe);
@ -1556,9 +1541,6 @@ static VALUE json_string_unescape(char *string, char *stringEnd, bool intern, bo
if ((ch & 0xFC00) == 0xD800) {
pe++;
if (pe > stringEnd - 6) {
if (bufferSize > MAX_STACK_BUFFER_SIZE) {
ruby_xfree(bufferStart);
}
raise_parse_error("incomplete surrogate pair at '%s'", p);
}
if (pe[0] == '\\' && pe[1] == 'u') {
@ -1591,18 +1573,19 @@ static VALUE json_string_unescape(char *string, char *stringEnd, bool intern, bo
MEMCPY(buffer, p, char, pe - p);
buffer += pe - p;
}
rb_str_set_len(result, buffer - bufferStart);
result = build_string(bufferStart, buffer, intern, symbolize);
if (bufferSize > MAX_STACK_BUFFER_SIZE) {
ruby_xfree(bufferStart);
if (symbolize) {
result = rb_str_intern(result);
} else if (intern) {
result = rb_funcall(rb_str_freeze(result), i_uminus, 0);
}
return result;
}
#line 1606 "parser.c"
#line 1589 "parser.c"
enum {JSON_string_start = 1};
enum {JSON_string_first_final = 8};
enum {JSON_string_error = 0};
@ -1610,7 +1593,7 @@ enum {JSON_string_error = 0};
enum {JSON_string_en_main = 1};
#line 634 "parser.rl"
#line 617 "parser.rl"
static int
@ -1631,15 +1614,15 @@ static char *JSON_parse_string(JSON_Parser *json, char *p, char *pe, VALUE *resu
VALUE match_string;
#line 1635 "parser.c"
#line 1618 "parser.c"
{
cs = JSON_string_start;
}
#line 654 "parser.rl"
#line 637 "parser.rl"
json->memo = p;
#line 1643 "parser.c"
#line 1626 "parser.c"
{
if ( p == pe )
goto _test_eof;
@ -1664,7 +1647,7 @@ case 2:
goto st0;
goto st2;
tr2:
#line 621 "parser.rl"
#line 604 "parser.rl"
{
*result = json_string_unescape(json->memo + 1, p, json->parsing_name || json-> freeze, json->parsing_name && json->symbolize_names);
if (NIL_P(*result)) {
@ -1674,14 +1657,14 @@ tr2:
{p = (( p + 1))-1;}
}
}
#line 631 "parser.rl"
#line 614 "parser.rl"
{ p--; {p++; cs = 8; goto _out;} }
goto st8;
st8:
if ( ++p == pe )
goto _test_eof8;
case 8:
#line 1685 "parser.c"
#line 1668 "parser.c"
goto st0;
st3:
if ( ++p == pe )
@ -1757,7 +1740,7 @@ case 7:
_out: {}
}
#line 656 "parser.rl"
#line 639 "parser.rl"
if (json->create_additions && RTEST(match_string = json->match_string)) {
VALUE klass;
@ -1954,7 +1937,7 @@ static VALUE cParser_initialize(int argc, VALUE *argv, VALUE self)
}
#line 1958 "parser.c"
#line 1941 "parser.c"
enum {JSON_start = 1};
enum {JSON_first_final = 10};
enum {JSON_error = 0};
@ -1962,7 +1945,7 @@ enum {JSON_error = 0};
enum {JSON_en_main = 1};
#line 866 "parser.rl"
#line 849 "parser.rl"
/*
@ -1980,16 +1963,16 @@ static VALUE cParser_parse(VALUE self)
GET_PARSER;
#line 1984 "parser.c"
#line 1967 "parser.c"
{
cs = JSON_start;
}
#line 883 "parser.rl"
#line 866 "parser.rl"
p = json->source;
pe = p + json->len;
#line 1993 "parser.c"
#line 1976 "parser.c"
{
if ( p == pe )
goto _test_eof;
@ -2023,7 +2006,7 @@ st0:
cs = 0;
goto _out;
tr2:
#line 858 "parser.rl"
#line 841 "parser.rl"
{
char *np = JSON_parse_value(json, p, pe, &result, 0);
if (np == NULL) { p--; {p++; cs = 10; goto _out;} } else {p = (( np))-1;}
@ -2033,7 +2016,7 @@ st10:
if ( ++p == pe )
goto _test_eof10;
case 10:
#line 2037 "parser.c"
#line 2020 "parser.c"
switch( (*p) ) {
case 13: goto st10;
case 32: goto st10;
@ -2122,7 +2105,7 @@ case 9:
_out: {}
}
#line 886 "parser.rl"
#line 869 "parser.rl"
if (cs >= JSON_first_final && p == pe) {
return result;

Просмотреть файл

@ -487,10 +487,8 @@ static inline VALUE build_string(const char *start, const char *end, bool intern
return result;
}
static const size_t MAX_STACK_BUFFER_SIZE = 128;
static VALUE json_string_unescape(char *string, char *stringEnd, bool intern, bool symbolize)
{
VALUE result = Qnil;
size_t bufferSize = stringEnd - string;
char *p = string, *pe = string, *unescape, *bufferStart, *buffer;
int unescape_len;
@ -501,19 +499,9 @@ static VALUE json_string_unescape(char *string, char *stringEnd, bool intern, bo
return build_string(string, stringEnd, intern, symbolize);
}
if (bufferSize > MAX_STACK_BUFFER_SIZE) {
# ifdef HAVE_RB_ENC_INTERNED_STR
bufferStart = buffer = ALLOC_N(char, bufferSize ? bufferSize : 1);
# else
bufferStart = buffer = ALLOC_N(char, bufferSize);
# endif
} else {
# ifdef HAVE_RB_ENC_INTERNED_STR
bufferStart = buffer = ALLOCA_N(char, bufferSize ? bufferSize : 1);
# else
bufferStart = buffer = ALLOCA_N(char, bufferSize);
# endif
}
VALUE result = rb_str_buf_new(bufferSize);
rb_enc_associate_index(result, utf8_encindex);
buffer = bufferStart = RSTRING_PTR(result);
while (pe < stringEnd) {
if (*pe == '\\') {
@ -547,9 +535,6 @@ static VALUE json_string_unescape(char *string, char *stringEnd, bool intern, bo
break;
case 'u':
if (pe > stringEnd - 4) {
if (bufferSize > MAX_STACK_BUFFER_SIZE) {
ruby_xfree(bufferStart);
}
raise_parse_error("incomplete unicode character escape sequence at '%s'", p);
} else {
uint32_t ch = unescape_unicode((unsigned char *) ++pe);
@ -567,9 +552,6 @@ static VALUE json_string_unescape(char *string, char *stringEnd, bool intern, bo
if ((ch & 0xFC00) == 0xD800) {
pe++;
if (pe > stringEnd - 6) {
if (bufferSize > MAX_STACK_BUFFER_SIZE) {
ruby_xfree(bufferStart);
}
raise_parse_error("incomplete surrogate pair at '%s'", p);
}
if (pe[0] == '\\' && pe[1] == 'u') {
@ -602,11 +584,12 @@ static VALUE json_string_unescape(char *string, char *stringEnd, bool intern, bo
MEMCPY(buffer, p, char, pe - p);
buffer += pe - p;
}
rb_str_set_len(result, buffer - bufferStart);
result = build_string(bufferStart, buffer, intern, symbolize);
if (bufferSize > MAX_STACK_BUFFER_SIZE) {
ruby_xfree(bufferStart);
if (symbolize) {
result = rb_str_intern(result);
} else if (intern) {
result = rb_funcall(rb_str_freeze(result), i_uminus, 0);
}
return result;