[flori/json] JSON.dump: avoid redundant UTF-8 validation

Given that we called `rb_enc_str_asciionly_p`, if the string encoding
isn't valid UTF-8, we can't know it very cheaply by checking the
encoding and coderange that was just computed by Ruby, rather than
to do it ourselves.

Also Ruby might have already computed that earlier.

https://github.com/flori/json/commit/4b04c469d5
This commit is contained in:
Jean Boussier 2024-09-02 09:41:01 +02:00 коммит произвёл Hiroshi SHIBATA
Родитель 835ac8a4d7
Коммит 630c681321
2 изменённых файлов: 16 добавлений и 50 удалений

Просмотреть файл

@ -66,42 +66,6 @@ static const char trailingBytesForUTF8[256] = {
static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
0x03C82080UL, 0xFA082080UL, 0x82082080UL };
/*
* Utility routine to tell whether a sequence of bytes is legal UTF-8.
* This must be called with the length pre-determined by the first byte.
* If not calling this from ConvertUTF8to*, then the length can be set by:
* length = trailingBytesForUTF8[*source]+1;
* and the sequence is illegal right away if there aren't that many bytes
* available.
* If presented with a length > 4, this returns 0. The Unicode
* definition of UTF-8 goes up to 4-byte sequences.
*/
static unsigned char isLegalUTF8(const UTF8 *source, unsigned long length)
{
UTF8 a;
const UTF8 *srcptr = source+length;
switch (length) {
default: return 0;
/* Everything else falls through when "1"... */
case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return 0;
case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return 0;
case 2: if ((a = (*--srcptr)) > 0xBF) return 0;
switch (*source) {
/* no fall-through in this inner switch */
case 0xE0: if (a < 0xA0) return 0; break;
case 0xED: if (a > 0x9F) return 0; break;
case 0xF0: if (a < 0x90) return 0; break;
case 0xF4: if (a > 0x8F) return 0; break;
default: if (a < 0x80) return 0;
}
case 1: if (*source >= 0x80 && *source < 0xC2) return 0;
}
if (*source > 0xF4) return 0;
return 1;
}
/* Escapes the UTF16 character and stores the result in the buffer buf. */
static void unicode_escape(char *buf, UTF16 character)
{
@ -130,17 +94,18 @@ static void convert_UTF8_to_JSON_ASCII(FBuffer *buffer, VALUE string, char scrip
const UTF8 *sourceEnd = source + RSTRING_LEN(string);
char buf[6] = { '\\', 'u' };
while (source < sourceEnd) {
UTF32 ch = 0;
unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
if (source + extraBytesToRead >= sourceEnd) {
rb_raise(rb_path2class("JSON::GeneratorError"),
"partial character in source, but hit end");
}
if (!isLegalUTF8(source, extraBytesToRead+1)) {
int ascii_only = rb_enc_str_asciionly_p(string);
if (!ascii_only) {
if (RB_ENCODING_GET_INLINED(string) != rb_utf8_encindex() || RB_ENC_CODERANGE(string) != RUBY_ENC_CODERANGE_VALID) {
rb_raise(rb_path2class("JSON::GeneratorError"),
"source sequence is illegal/malformed utf-8");
}
}
while (source < sourceEnd) {
UTF32 ch = 0;
unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
/*
* The cases all fall through. See "Note A" below.
*/
@ -238,6 +203,13 @@ static void convert_UTF8_to_JSON(FBuffer *buffer, VALUE string, char script_safe
char buf[6] = { '\\', 'u' };
int ascii_only = rb_enc_str_asciionly_p(string);
if (!ascii_only) {
if (RB_ENCODING_GET_INLINED(string) != rb_utf8_encindex() || RB_ENC_CODERANGE(string) != RUBY_ENC_CODERANGE_VALID) {
rb_raise(rb_path2class("JSON::GeneratorError"),
"source sequence is illegal/malformed utf-8");
}
}
for (start = 0, end = 0; end < len;) {
p = ptr + end;
c = (unsigned char) *p;
@ -309,11 +281,6 @@ static void convert_UTF8_to_JSON(FBuffer *buffer, VALUE string, char script_safe
continue;
}
}
if (!isLegalUTF8((UTF8 *) p, clen)) {
rb_raise(rb_path2class("JSON::GeneratorError"),
"source sequence is illegal/malformed utf-8");
}
}
end += clen;
}

Просмотреть файл

@ -46,7 +46,6 @@ static const int halfShift = 10; /* used for shifting by 10 bits */
static const UTF32 halfBase = 0x0010000UL;
static const UTF32 halfMask = 0x3FFUL;
static unsigned char isLegalUTF8(const UTF8 *source, unsigned long length);
static void unicode_escape(char *buf, UTF16 character);
static void unicode_escape_to_buffer(FBuffer *buffer, char buf[6], UTF16 character);
static void convert_UTF8_to_JSON_ASCII(FBuffer *buffer, VALUE string, char script_safe);