[flori/json] JSON.dump: avoid redundant UTF-8 validation

Given that we called `rb_enc_str_asciionly_p`, if the string encoding isn't valid UTF-8, we can't know it very cheaply by checking the encoding and coderange that was just computed by Ruby, rather than to do it ourselves. Also Ruby might have already computed that earlier. https://github.com/flori/json/commit/4b04c469d5
2024-09-02 09:41:01 +02:00 · 2024-09-02 09:41:01 +02:00 · 630c681321
--- a/ext/json/generator/generator.c
+++ b/ext/json/generator/generator.c
@ -66,42 +66,6 @@ static const char trailingBytesForUTF8[256] = {
 static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
    0x03C82080UL, 0xFA082080UL, 0x82082080UL };

-/*
- * Utility routine to tell whether a sequence of bytes is legal UTF-8.
- * This must be called with the length pre-determined by the first byte.
- * If not calling this from ConvertUTF8to*, then the length can be set by:
- *  length = trailingBytesForUTF8[*source]+1;
- * and the sequence is illegal right away if there aren't that many bytes
- * available.
- * If presented with a length > 4, this returns 0.  The Unicode
- * definition of UTF-8 goes up to 4-byte sequences.
- */
-static unsigned char isLegalUTF8(const UTF8 *source, unsigned long length)
-{
-    UTF8 a;
-    const UTF8 *srcptr = source+length;
-    switch (length) {
-        default: return 0;
-                 /* Everything else falls through when "1"... */
-        case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return 0;
-        case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return 0;
-        case 2: if ((a = (*--srcptr)) > 0xBF) return 0;
-
-                    switch (*source) {
-                        /* no fall-through in this inner switch */
-                        case 0xE0: if (a < 0xA0) return 0; break;
-                        case 0xED: if (a > 0x9F) return 0; break;
-                        case 0xF0: if (a < 0x90) return 0; break;
-                        case 0xF4: if (a > 0x8F) return 0; break;
-                        default:   if (a < 0x80) return 0;
-                    }
-
-        case 1: if (*source >= 0x80 && *source < 0xC2) return 0;
-    }
-    if (*source > 0xF4) return 0;
-    return 1;
-}
-
 /* Escapes the UTF16 character and stores the result in the buffer buf. */
 static void unicode_escape(char *buf, UTF16 character)
 {
@ -130,17 +94,18 @@ static void convert_UTF8_to_JSON_ASCII(FBuffer *buffer, VALUE string, char scrip
    const UTF8 *sourceEnd = source + RSTRING_LEN(string);
    char buf[6] = { '\\', 'u' };

-    while (source < sourceEnd) {
-        UTF32 ch = 0;
-        unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
-        if (source + extraBytesToRead >= sourceEnd) {
-            rb_raise(rb_path2class("JSON::GeneratorError"),
-                    "partial character in source, but hit end");
-        }
-        if (!isLegalUTF8(source, extraBytesToRead+1)) {
+    int ascii_only = rb_enc_str_asciionly_p(string);
+
+    if (!ascii_only) {
+        if (RB_ENCODING_GET_INLINED(string) != rb_utf8_encindex() || RB_ENC_CODERANGE(string) != RUBY_ENC_CODERANGE_VALID) {
            rb_raise(rb_path2class("JSON::GeneratorError"),
                    "source sequence is illegal/malformed utf-8");
        }
+    }
+
+    while (source < sourceEnd) {
+        UTF32 ch = 0;
+        unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
        /*
         * The cases all fall through. See "Note A" below.
         */
@ -238,6 +203,13 @@ static void convert_UTF8_to_JSON(FBuffer *buffer, VALUE string, char script_safe
    char buf[6] = { '\\', 'u' };
    int ascii_only = rb_enc_str_asciionly_p(string);

+    if (!ascii_only) {
+        if (RB_ENCODING_GET_INLINED(string) != rb_utf8_encindex() || RB_ENC_CODERANGE(string) != RUBY_ENC_CODERANGE_VALID) {
+            rb_raise(rb_path2class("JSON::GeneratorError"),
+                    "source sequence is illegal/malformed utf-8");
+        }
+    }
+
    for (start = 0, end = 0; end < len;) {
        p = ptr + end;
        c = (unsigned char) *p;
@ -309,11 +281,6 @@ static void convert_UTF8_to_JSON(FBuffer *buffer, VALUE string, char script_safe
                                    continue;
                                }
                            }
-
-                            if (!isLegalUTF8((UTF8 *) p, clen)) {
-                                rb_raise(rb_path2class("JSON::GeneratorError"),
-                                        "source sequence is illegal/malformed utf-8");
-                            }
                        }
                        end += clen;
                    }
--- a/ext/json/generator/generator.h
+++ b/ext/json/generator/generator.h
@ -46,7 +46,6 @@ static const int halfShift  = 10; /* used for shifting by 10 bits */
 static const UTF32 halfBase = 0x0010000UL;
 static const UTF32 halfMask = 0x3FFUL;

-static unsigned char isLegalUTF8(const UTF8 *source, unsigned long length);
 static void unicode_escape(char *buf, UTF16 character);
 static void unicode_escape_to_buffer(FBuffer *buffer, char buf[6], UTF16 character);
 static void convert_UTF8_to_JSON_ASCII(FBuffer *buffer, VALUE string, char script_safe);