2023-09-27 19:24:48 +03:00
|
|
|
#include "prism/util/pm_strpbrk.h"
|
2023-06-20 18:53:02 +03:00
|
|
|
|
2023-10-31 06:23:16 +03:00
|
|
|
/**
|
2024-02-14 01:45:27 +03:00
|
|
|
* Add an invalid multibyte character error to the parser.
|
|
|
|
*/
|
|
|
|
static inline void
|
|
|
|
pm_strpbrk_invalid_multibyte_character(pm_parser_t *parser, const uint8_t *start, const uint8_t *end) {
|
|
|
|
pm_diagnostic_list_append_format(&parser->error_list, start, end, PM_ERR_INVALID_MULTIBYTE_CHARACTER, *start);
|
|
|
|
}
|
|
|
|
|
2024-05-16 19:29:14 +03:00
|
|
|
/**
|
|
|
|
* Set the explicit encoding for the parser to the current encoding.
|
|
|
|
*/
|
|
|
|
static inline void
|
|
|
|
pm_strpbrk_explicit_encoding_set(pm_parser_t *parser, const uint8_t *source, size_t width) {
|
|
|
|
if (parser->explicit_encoding != NULL) {
|
|
|
|
if (parser->explicit_encoding == parser->encoding) {
|
|
|
|
// Okay, we already locked to this encoding.
|
|
|
|
} else if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) {
|
|
|
|
// Not okay, we already found a Unicode escape sequence and this
|
|
|
|
// conflicts.
|
|
|
|
pm_diagnostic_list_append_format(&parser->error_list, source, source + width, PM_ERR_MIXED_ENCODING, parser->encoding->name);
|
|
|
|
} else {
|
|
|
|
// Should not be anything else.
|
|
|
|
assert(false && "unreachable");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
parser->explicit_encoding = parser->encoding;
|
|
|
|
}
|
|
|
|
|
2024-02-14 01:45:27 +03:00
|
|
|
/**
|
|
|
|
* This is the default path.
|
2023-10-31 06:23:16 +03:00
|
|
|
*/
|
2023-08-29 17:48:20 +03:00
|
|
|
static inline const uint8_t *
|
2024-02-14 01:45:27 +03:00
|
|
|
pm_strpbrk_utf8(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) {
|
2023-06-30 21:30:24 +03:00
|
|
|
size_t index = 0;
|
|
|
|
|
|
|
|
while (index < maximum) {
|
2023-08-29 17:48:20 +03:00
|
|
|
if (strchr((const char *) charset, source[index]) != NULL) {
|
2023-06-30 21:30:24 +03:00
|
|
|
return source + index;
|
|
|
|
}
|
|
|
|
|
2024-02-14 01:45:27 +03:00
|
|
|
if (source[index] < 0x80) {
|
|
|
|
index++;
|
|
|
|
} else {
|
|
|
|
size_t width = pm_encoding_utf_8_char_width(source + index, (ptrdiff_t) (maximum - index));
|
2023-06-30 21:30:24 +03:00
|
|
|
|
2024-02-14 01:45:27 +03:00
|
|
|
if (width > 0) {
|
|
|
|
index += width;
|
|
|
|
} else if (!validate) {
|
|
|
|
index++;
|
|
|
|
} else {
|
|
|
|
// At this point we know we have an invalid multibyte character.
|
|
|
|
// We'll walk forward as far as we can until we find the next
|
|
|
|
// valid character so that we don't spam the user with a ton of
|
|
|
|
// the same kind of error.
|
|
|
|
const size_t start = index;
|
|
|
|
|
|
|
|
do {
|
|
|
|
index++;
|
|
|
|
} while (index < maximum && pm_encoding_utf_8_char_width(source + index, (ptrdiff_t) (maximum - index)) == 0);
|
|
|
|
|
|
|
|
pm_strpbrk_invalid_multibyte_character(parser, source + start, source + index);
|
|
|
|
}
|
|
|
|
}
|
2023-06-30 21:30:24 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2023-10-31 06:23:16 +03:00
|
|
|
/**
|
2024-02-14 01:45:27 +03:00
|
|
|
* This is the path when the encoding is ASCII-8BIT.
|
2023-10-31 06:23:16 +03:00
|
|
|
*/
|
2023-08-29 17:48:20 +03:00
|
|
|
static inline const uint8_t *
|
2024-05-16 19:29:14 +03:00
|
|
|
pm_strpbrk_ascii_8bit(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) {
|
2023-06-30 21:30:24 +03:00
|
|
|
size_t index = 0;
|
|
|
|
|
|
|
|
while (index < maximum) {
|
2023-08-29 17:48:20 +03:00
|
|
|
if (strchr((const char *) charset, source[index]) != NULL) {
|
2023-06-30 21:30:24 +03:00
|
|
|
return source + index;
|
|
|
|
}
|
|
|
|
|
2024-05-16 19:29:14 +03:00
|
|
|
if (validate && source[index] >= 0x80) pm_strpbrk_explicit_encoding_set(parser, source, 1);
|
2023-06-30 21:30:24 +03:00
|
|
|
index++;
|
|
|
|
}
|
|
|
|
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2024-02-14 01:45:27 +03:00
|
|
|
/**
|
|
|
|
* This is the slow path that does care about the encoding.
|
|
|
|
*/
|
|
|
|
static inline const uint8_t *
|
|
|
|
pm_strpbrk_multi_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) {
|
|
|
|
size_t index = 0;
|
2024-05-16 19:29:14 +03:00
|
|
|
const pm_encoding_t *encoding = parser->encoding;
|
2024-02-14 01:45:27 +03:00
|
|
|
|
|
|
|
while (index < maximum) {
|
|
|
|
if (strchr((const char *) charset, source[index]) != NULL) {
|
|
|
|
return source + index;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (source[index] < 0x80) {
|
|
|
|
index++;
|
|
|
|
} else {
|
2024-05-16 19:29:14 +03:00
|
|
|
size_t width = encoding->char_width(source + index, (ptrdiff_t) (maximum - index));
|
|
|
|
if (validate) pm_strpbrk_explicit_encoding_set(parser, source, width);
|
2024-02-14 01:45:27 +03:00
|
|
|
|
|
|
|
if (width > 0) {
|
|
|
|
index += width;
|
|
|
|
} else if (!validate) {
|
|
|
|
index++;
|
|
|
|
} else {
|
|
|
|
// At this point we know we have an invalid multibyte character.
|
|
|
|
// We'll walk forward as far as we can until we find the next
|
|
|
|
// valid character so that we don't spam the user with a ton of
|
|
|
|
// the same kind of error.
|
|
|
|
const size_t start = index;
|
|
|
|
|
|
|
|
do {
|
|
|
|
index++;
|
2024-05-16 19:29:14 +03:00
|
|
|
} while (index < maximum && encoding->char_width(source + index, (ptrdiff_t) (maximum - index)) == 0);
|
2024-02-14 01:45:27 +03:00
|
|
|
|
|
|
|
pm_strpbrk_invalid_multibyte_character(parser, source + start, source + index);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* This is the fast path that does not care about the encoding because we know
|
|
|
|
* the encoding only supports single-byte characters.
|
|
|
|
*/
|
|
|
|
static inline const uint8_t *
|
|
|
|
pm_strpbrk_single_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) {
|
|
|
|
size_t index = 0;
|
2024-05-16 19:29:14 +03:00
|
|
|
const pm_encoding_t *encoding = parser->encoding;
|
2024-02-14 01:45:27 +03:00
|
|
|
|
|
|
|
while (index < maximum) {
|
|
|
|
if (strchr((const char *) charset, source[index]) != NULL) {
|
|
|
|
return source + index;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (source[index] < 0x80 || !validate) {
|
|
|
|
index++;
|
|
|
|
} else {
|
2024-05-16 19:29:14 +03:00
|
|
|
size_t width = encoding->char_width(source + index, (ptrdiff_t) (maximum - index));
|
|
|
|
pm_strpbrk_explicit_encoding_set(parser, source, width);
|
2024-02-14 01:45:27 +03:00
|
|
|
|
|
|
|
if (width > 0) {
|
|
|
|
index += width;
|
|
|
|
} else {
|
|
|
|
// At this point we know we have an invalid multibyte character.
|
|
|
|
// We'll walk forward as far as we can until we find the next
|
|
|
|
// valid character so that we don't spam the user with a ton of
|
|
|
|
// the same kind of error.
|
|
|
|
const size_t start = index;
|
|
|
|
|
|
|
|
do {
|
|
|
|
index++;
|
2024-05-16 19:29:14 +03:00
|
|
|
} while (index < maximum && encoding->char_width(source + index, (ptrdiff_t) (maximum - index)) == 0);
|
2024-02-14 01:45:27 +03:00
|
|
|
|
|
|
|
pm_strpbrk_invalid_multibyte_character(parser, source + start, source + index);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2023-10-31 06:23:16 +03:00
|
|
|
/**
|
|
|
|
* Here we have rolled our own version of strpbrk. The standard library strpbrk
|
|
|
|
* has undefined behavior when the source string is not null-terminated. We want
|
|
|
|
* to support strings that are not null-terminated because pm_parse does not
|
|
|
|
* have the contract that the string is null-terminated. (This is desirable
|
|
|
|
* because it means the extension can call pm_parse with the result of a call to
|
|
|
|
* mmap).
|
|
|
|
*
|
|
|
|
* The standard library strpbrk also does not support passing a maximum length
|
|
|
|
* to search. We want to support this for the reason mentioned above, but we
|
|
|
|
* also don't want it to stop on null bytes. Ruby actually allows null bytes
|
|
|
|
* within strings, comments, regular expressions, etc. So we need to be able to
|
|
|
|
* skip past them.
|
|
|
|
*
|
|
|
|
* Finally, we want to support encodings wherein the charset could contain
|
|
|
|
* characters that are trailing bytes of multi-byte characters. For example, in
|
2024-02-14 01:45:27 +03:00
|
|
|
* Shift_JIS, the backslash character can be a trailing byte. In that case we
|
2023-10-31 06:23:16 +03:00
|
|
|
* need to take a slower path and iterate one multi-byte character at a time.
|
|
|
|
*/
|
2023-08-29 17:48:20 +03:00
|
|
|
const uint8_t *
|
2024-02-14 01:45:27 +03:00
|
|
|
pm_strpbrk(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, ptrdiff_t length, bool validate) {
|
2023-06-30 21:30:24 +03:00
|
|
|
if (length <= 0) {
|
|
|
|
return NULL;
|
2024-02-14 01:45:27 +03:00
|
|
|
} else if (!parser->encoding_changed) {
|
|
|
|
return pm_strpbrk_utf8(parser, source, charset, (size_t) length, validate);
|
|
|
|
} else if (parser->encoding == PM_ENCODING_ASCII_8BIT_ENTRY) {
|
2024-05-16 19:29:14 +03:00
|
|
|
return pm_strpbrk_ascii_8bit(parser, source, charset, (size_t) length, validate);
|
2024-02-14 01:45:27 +03:00
|
|
|
} else if (parser->encoding->multibyte) {
|
|
|
|
return pm_strpbrk_multi_byte(parser, source, charset, (size_t) length, validate);
|
2023-06-30 21:30:24 +03:00
|
|
|
} else {
|
2024-02-14 01:45:27 +03:00
|
|
|
return pm_strpbrk_single_byte(parser, source, charset, (size_t) length, validate);
|
2023-06-20 18:53:02 +03:00
|
|
|
}
|
|
|
|
}
|