Use end of char boundary in start_with?

Previously we used the next character following the found prefix to
determine if the match ended on a broken character.

This caused surprising behaviour when a valid character was followed
by a UTF-8 continuation byte.

This commit changes the behaviour to instead look for the end of the
last character in the prefix.

[Bug #19784]

Co-authored-by: ywenc <ywenc@github.com>
Co-authored-by: Nobuyoshi Nakada <nobu@ruby-lang.org>
This commit is contained in:
John Hawthorn 2023-08-31 15:12:47 -07:00
Родитель 2ca0f01015
Коммит d89b15cdce
2 изменённых файлов: 11 добавлений и 2 удалений

Просмотреть файл

@ -125,6 +125,15 @@ at_char_boundary(const char *s, const char *p, const char *e, rb_encoding *enc)
return rb_enc_left_char_head(s, p, e, enc) == p;
}
/*
 * Tell whether `p` is a position that rb_enc_right_char_head() leaves
 * unchanged for the byte range [s, e) in encoding `enc`.
 *
 * The prefix-matching callers pass the start of the string as `s`, the
 * position just past the candidate prefix as `p`, and the end of the
 * string as `e`; a false result is treated by them as "the prefix does
 * not end at a character boundary".
 *
 * Preconditions (checked with RUBY_ASSERT): s <= p and p <= e.
 */
static inline bool
at_char_right_boundary(const char *s, const char *p, const char *e, rb_encoding *enc)
{
    RUBY_ASSERT(s <= p);
    RUBY_ASSERT(p <= e);

    /* If the adjusted head is `p` itself, no adjustment was needed. */
    const char *adjusted = rb_enc_right_char_head(s, p, e, enc);

    return adjusted == p;
}
/* expect tail call optimization */
// YJIT needs this function to never allocate and never raise
static inline VALUE

Просмотреть файл

@ -10472,7 +10472,7 @@ rb_str_start_with(int argc, VALUE *argv, VALUE str)
p = RSTRING_PTR(str);
e = p + slen;
s = p + tlen;
if (!at_char_boundary(p, s, e, enc))
if (!at_char_right_boundary(p, s, e, enc))
continue;
if (memcmp(p, RSTRING_PTR(tmp), tlen) == 0)
return Qtrue;
@ -10554,7 +10554,7 @@ deleted_prefix_length(VALUE str, VALUE prefix)
}
const char *strend = strptr + olen;
const char *after_prefix = strptr + prefixlen;
if (!at_char_boundary(strptr, after_prefix, strend, enc)) {
if (!at_char_right_boundary(strptr, after_prefix, strend, enc)) {
/* prefix does not end at char-boundary */
return 0;
}