[ruby/json] json_string_unescape: Use the returned RString as buffer

Rather than to copy into a buffer to unescape and then copy that buffer into the final string, we can directly copy into the final string. The downside is that if the string contains a lot of escaping, we end up returning a string that's larger than strictly necessary, but it's probably fine. Before: ``` == Parsing twitter.json (567916 bytes) ruby 3.3.4 (2024-07-09 revision https://github.com/ruby/json/commit/be1089c8ec) +YJIT [arm64-darwin23] Warming up -------------------------------------- json 56.000 i/100ms oj 58.000 i/100ms oj strict 74.000 i/100ms Oj::Parser 76.000 i/100ms rapidjson 52.000 i/100ms Calculating ------------------------------------- json 556.659 (± 2.9%) i/s (1.80 ms/i) - 2.800k in 5.034719s oj 604.077 (± 3.8%) i/s (1.66 ms/i) - 3.016k in 5.001546s oj strict 706.942 (± 3.5%) i/s (1.41 ms/i) - 3.552k in 5.030954s Oj::Parser 752.917 (± 3.2%) i/s (1.33 ms/i) - 3.800k in 5.052707s rapidjson 546.470 (± 3.5%) i/s (1.83 ms/i) - 2.756k in 5.049855s Comparison: json: 556.7 i/s Oj::Parser: 752.9 i/s - 1.35x faster oj strict: 706.9 i/s - 1.27x faster oj: 604.1 i/s - 1.09x faster rapidjson: 546.5 i/s - same-ish: difference falls within error == Parsing citm_catalog.json (1727030 bytes) ruby 3.3.4 (2024-07-09 revision https://github.com/ruby/json/commit/be1089c8ec) +YJIT [arm64-darwin23] Warming up -------------------------------------- json 29.000 i/100ms oj 32.000 i/100ms oj strict 38.000 i/100ms Oj::Parser 42.000 i/100ms rapidjson 38.000 i/100ms Calculating ------------------------------------- json 317.858 (± 3.1%) i/s (3.15 ms/i) - 1.595k in 5.023245s oj 348.168 (± 2.6%) i/s (2.87 ms/i) - 1.760k in 5.058431s oj strict 394.599 (± 2.8%) i/s (2.53 ms/i) - 1.976k in 5.012073s Oj::Parser 403.771 (± 3.0%) i/s (2.48 ms/i) - 2.058k in 5.101578s rapidjson 383.441 (± 3.7%) i/s (2.61 ms/i) - 1.938k in 5.061355s Comparison: json: 317.9 i/s Oj::Parser: 403.8 i/s - 1.27x faster oj strict: 394.6 i/s - 1.24x faster rapidjson: 383.4 i/s - 1.21x faster oj: 348.2 i/s - 1.10x faster ``` After: ``` == Parsing twitter.json (567916 bytes) ruby 3.3.4 (2024-07-09 revision https://github.com/ruby/json/commit/be1089c8ec) +YJIT [arm64-darwin23] Warming up -------------------------------------- json 56.000 i/100ms oj 62.000 i/100ms oj strict 73.000 i/100ms Oj::Parser 76.000 i/100ms rapidjson 54.000 i/100ms Calculating ------------------------------------- json 561.009 (± 7.5%) i/s (1.78 ms/i) - 2.800k in 5.039548s oj 601.124 (± 4.3%) i/s (1.66 ms/i) - 3.038k in 5.064686s oj strict 707.455 (± 3.4%) i/s (1.41 ms/i) - 3.577k in 5.062540s Oj::Parser 751.799 (± 3.1%) i/s (1.33 ms/i) - 3.800k in 5.059509s rapidjson 535.641 (± 3.2%) i/s (1.87 ms/i) - 2.700k in 5.045816s Comparison: json: 561.0 i/s Oj::Parser: 751.8 i/s - 1.34x faster oj strict: 707.5 i/s - 1.26x faster oj: 601.1 i/s - same-ish: difference falls within error rapidjson: 535.6 i/s - same-ish: difference falls within error == Parsing citm_catalog.json (1727030 bytes) ruby 3.3.4 (2024-07-09 revision https://github.com/ruby/json/commit/be1089c8ec) +YJIT [arm64-darwin23] Warming up -------------------------------------- json 30.000 i/100ms oj 32.000 i/100ms oj strict 36.000 i/100ms Oj::Parser 42.000 i/100ms rapidjson 39.000 i/100ms Calculating ------------------------------------- json 313.248 (± 7.3%) i/s (3.19 ms/i) - 1.560k in 5.014118s oj 341.977 (± 4.1%) i/s (2.92 ms/i) - 1.728k in 5.063332s oj strict 387.062 (± 6.2%) i/s (2.58 ms/i) - 1.944k in 5.045961s Oj::Parser 400.423 (± 4.0%) i/s (2.50 ms/i) - 2.016k in 5.044513s rapidjson 379.046 (± 6.1%) i/s (2.64 ms/i) - 1.911k in 5.064461s Comparison: json: 313.2 i/s Oj::Parser: 400.4 i/s - 1.28x faster oj strict: 387.1 i/s - 1.24x faster rapidjson: 379.0 i/s - 1.21x faster oj: 342.0 i/s - same-ish: difference falls within error ``` https://github.com/ruby/json/commit/5e1ec4a268
2024-10-31 15:22:38 +01:00 · 2024-10-31 15:22:38 +01:00 · ee4fa4ccee
--- a/ext/json/parser/parser.c
+++ b/ext/json/parser/parser.c
@ -1476,10 +1476,8 @@ static inline VALUE build_string(const char *start, const char *end, bool intern
    return result;
 }

-static const size_t MAX_STACK_BUFFER_SIZE = 128;
 static VALUE json_string_unescape(char *string, char *stringEnd, bool intern, bool symbolize)
 {
-    VALUE result = Qnil;
    size_t bufferSize = stringEnd - string;
    char *p = string, *pe = string, *unescape, *bufferStart, *buffer;
    int unescape_len;
@ -1490,19 +1488,9 @@ static VALUE json_string_unescape(char *string, char *stringEnd, bool intern, bo
        return build_string(string, stringEnd, intern, symbolize);
    }

-    if (bufferSize > MAX_STACK_BUFFER_SIZE) {
-# ifdef HAVE_RB_ENC_INTERNED_STR
-      bufferStart = buffer = ALLOC_N(char, bufferSize ? bufferSize : 1);
-# else
-      bufferStart = buffer = ALLOC_N(char, bufferSize);
-# endif
-    } else {
-# ifdef HAVE_RB_ENC_INTERNED_STR
-      bufferStart = buffer = ALLOCA_N(char, bufferSize ? bufferSize : 1);
-# else
-      bufferStart = buffer = ALLOCA_N(char, bufferSize);
-# endif
-    }
+    VALUE result = rb_str_buf_new(bufferSize);
+    rb_enc_associate_index(result, utf8_encindex);
+    buffer = bufferStart = RSTRING_PTR(result);

    while (pe < stringEnd) {
        if (*pe == '\\') {
@ -1536,9 +1524,6 @@ static VALUE json_string_unescape(char *string, char *stringEnd, bool intern, bo
                    break;
                case 'u':
                    if (pe > stringEnd - 4) {
-                      if (bufferSize > MAX_STACK_BUFFER_SIZE) {
-                        ruby_xfree(bufferStart);
-                      }
                      raise_parse_error("incomplete unicode character escape sequence at '%s'", p);
                    } else {
                        uint32_t ch = unescape_unicode((unsigned char *) ++pe);
@ -1556,9 +1541,6 @@ static VALUE json_string_unescape(char *string, char *stringEnd, bool intern, bo
                        if ((ch & 0xFC00) == 0xD800) {
                            pe++;
                            if (pe > stringEnd - 6) {
-                              if (bufferSize > MAX_STACK_BUFFER_SIZE) {
-                                ruby_xfree(bufferStart);
-                              }
                              raise_parse_error("incomplete surrogate pair at '%s'", p);
                            }
                            if (pe[0] == '\\' && pe[1] == 'u') {
@ -1591,18 +1573,19 @@ static VALUE json_string_unescape(char *string, char *stringEnd, bool intern, bo
      MEMCPY(buffer, p, char, pe - p);
      buffer += pe - p;
    }
+    rb_str_set_len(result, buffer - bufferStart);

-    result = build_string(bufferStart, buffer, intern, symbolize);
-
-    if (bufferSize > MAX_STACK_BUFFER_SIZE) {
-      ruby_xfree(bufferStart);
+    if (symbolize) {
+        result = rb_str_intern(result);
+    } else if (intern) {
+        result = rb_funcall(rb_str_freeze(result), i_uminus, 0);
    }

    return result;
 }


-#line 1606 "parser.c"
+#line 1589 "parser.c"
 enum {JSON_string_start = 1};
 enum {JSON_string_first_final = 8};
 enum {JSON_string_error = 0};
@ -1610,7 +1593,7 @@ enum {JSON_string_error = 0};
 enum {JSON_string_en_main = 1};


-#line 634 "parser.rl"
+#line 617 "parser.rl"


 static int
@ -1631,15 +1614,15 @@ static char *JSON_parse_string(JSON_Parser *json, char *p, char *pe, VALUE *resu
    VALUE match_string;


-#line 1635 "parser.c"
+#line 1618 "parser.c"
 	{
 	cs = JSON_string_start;
 	}

-#line 654 "parser.rl"
+#line 637 "parser.rl"
    json->memo = p;

-#line 1643 "parser.c"
+#line 1626 "parser.c"
 	{
 	if ( p == pe )
 		goto _test_eof;
@ -1664,7 +1647,7 @@ case 2:
 		goto st0;
 	goto st2;
 tr2:
-#line 621 "parser.rl"
+#line 604 "parser.rl"
 	{
        *result = json_string_unescape(json->memo + 1, p, json->parsing_name || json-> freeze, json->parsing_name && json->symbolize_names);
        if (NIL_P(*result)) {
@ -1674,14 +1657,14 @@ tr2:
            {p = (( p + 1))-1;}
        }
    }
-#line 631 "parser.rl"
+#line 614 "parser.rl"
 	{ p--; {p++; cs = 8; goto _out;} }
 	goto st8;
 st8:
 	if ( ++p == pe )
 		goto _test_eof8;
 case 8:
-#line 1685 "parser.c"
+#line 1668 "parser.c"
 	goto st0;
 st3:
 	if ( ++p == pe )
@ -1757,7 +1740,7 @@ case 7:
 	_out: {}
 	}

-#line 656 "parser.rl"
+#line 639 "parser.rl"

    if (json->create_additions && RTEST(match_string = json->match_string)) {
          VALUE klass;
@ -1954,7 +1937,7 @@ static VALUE cParser_initialize(int argc, VALUE *argv, VALUE self)
 }


-#line 1958 "parser.c"
+#line 1941 "parser.c"
 enum {JSON_start = 1};
 enum {JSON_first_final = 10};
 enum {JSON_error = 0};
@ -1962,7 +1945,7 @@ enum {JSON_error = 0};
 enum {JSON_en_main = 1};


-#line 866 "parser.rl"
+#line 849 "parser.rl"


 /*
@ -1980,16 +1963,16 @@ static VALUE cParser_parse(VALUE self)
    GET_PARSER;


-#line 1984 "parser.c"
+#line 1967 "parser.c"
 	{
 	cs = JSON_start;
 	}

-#line 883 "parser.rl"
+#line 866 "parser.rl"
    p = json->source;
    pe = p + json->len;

-#line 1993 "parser.c"
+#line 1976 "parser.c"
 	{
 	if ( p == pe )
 		goto _test_eof;
@ -2023,7 +2006,7 @@ st0:
 cs = 0;
 	goto _out;
 tr2:
-#line 858 "parser.rl"
+#line 841 "parser.rl"
 	{
        char *np = JSON_parse_value(json, p, pe, &result, 0);
        if (np == NULL) { p--; {p++; cs = 10; goto _out;} } else {p = (( np))-1;}
@ -2033,7 +2016,7 @@ st10:
 	if ( ++p == pe )
 		goto _test_eof10;
 case 10:
-#line 2037 "parser.c"
+#line 2020 "parser.c"
 	switch( (*p) ) {
 		case 13: goto st10;
 		case 32: goto st10;
@ -2122,7 +2105,7 @@ case 9:
 	_out: {}
 	}

-#line 886 "parser.rl"
+#line 869 "parser.rl"

    if (cs >= JSON_first_final && p == pe) {
        return result;
--- a/ext/json/parser/parser.rl
+++ b/ext/json/parser/parser.rl
@ -487,10 +487,8 @@ static inline VALUE build_string(const char *start, const char *end, bool intern
    return result;
 }

-static const size_t MAX_STACK_BUFFER_SIZE = 128;
 static VALUE json_string_unescape(char *string, char *stringEnd, bool intern, bool symbolize)
 {
-    VALUE result = Qnil;
    size_t bufferSize = stringEnd - string;
    char *p = string, *pe = string, *unescape, *bufferStart, *buffer;
    int unescape_len;
@ -501,19 +499,9 @@ static VALUE json_string_unescape(char *string, char *stringEnd, bool intern, bo
        return build_string(string, stringEnd, intern, symbolize);
    }

-    if (bufferSize > MAX_STACK_BUFFER_SIZE) {
-# ifdef HAVE_RB_ENC_INTERNED_STR
-      bufferStart = buffer = ALLOC_N(char, bufferSize ? bufferSize : 1);
-# else
-      bufferStart = buffer = ALLOC_N(char, bufferSize);
-# endif
-    } else {
-# ifdef HAVE_RB_ENC_INTERNED_STR
-      bufferStart = buffer = ALLOCA_N(char, bufferSize ? bufferSize : 1);
-# else
-      bufferStart = buffer = ALLOCA_N(char, bufferSize);
-# endif
-    }
+    VALUE result = rb_str_buf_new(bufferSize);
+    rb_enc_associate_index(result, utf8_encindex);
+    buffer = bufferStart = RSTRING_PTR(result);

    while (pe < stringEnd) {
        if (*pe == '\\') {
@ -547,9 +535,6 @@ static VALUE json_string_unescape(char *string, char *stringEnd, bool intern, bo
                    break;
                case 'u':
                    if (pe > stringEnd - 4) {
-                      if (bufferSize > MAX_STACK_BUFFER_SIZE) {
-                        ruby_xfree(bufferStart);
-                      }
                      raise_parse_error("incomplete unicode character escape sequence at '%s'", p);
                    } else {
                        uint32_t ch = unescape_unicode((unsigned char *) ++pe);
@ -567,9 +552,6 @@ static VALUE json_string_unescape(char *string, char *stringEnd, bool intern, bo
                        if ((ch & 0xFC00) == 0xD800) {
                            pe++;
                            if (pe > stringEnd - 6) {
-                              if (bufferSize > MAX_STACK_BUFFER_SIZE) {
-                                ruby_xfree(bufferStart);
-                              }
                              raise_parse_error("incomplete surrogate pair at '%s'", p);
                            }
                            if (pe[0] == '\\' && pe[1] == 'u') {
@ -602,11 +584,12 @@ static VALUE json_string_unescape(char *string, char *stringEnd, bool intern, bo
      MEMCPY(buffer, p, char, pe - p);
      buffer += pe - p;
    }
+    rb_str_set_len(result, buffer - bufferStart);

-    result = build_string(bufferStart, buffer, intern, symbolize);
-
-    if (bufferSize > MAX_STACK_BUFFER_SIZE) {
-      ruby_xfree(bufferStart);
+    if (symbolize) {
+        result = rb_str_intern(result);
+    } else if (intern) {
+        result = rb_funcall(rb_str_freeze(result), i_uminus, 0);
    }

    return result;