Use end of char boundary in start_with?

Previously we used the next character following the found prefix to determine whether the match ended on a broken character. This caused surprising behaviour when a valid character was followed by a UTF-8 continuation byte. This commit changes the behaviour to instead look for the end of the last character in the prefix. [Bug #19784] Co-authored-by: ywenc <ywenc@github.com> Co-authored-by: Nobuyoshi Nakada <nobu@ruby-lang.org>
2023-08-31 15:12:47 -07:00 · 2023-08-31 15:12:47 -07:00 · d89b15cdce
--- a/internal/string.h
+++ b/internal/string.h
@@ -125,6 +125,15 @@ at_char_boundary(const char *s, const char *p, const char *e, rb_encoding *enc)
    return rb_enc_left_char_head(s, p, e, enc) == p;
 }

+static inline bool
+at_char_right_boundary(const char *s, const char *p, const char *e, rb_encoding *enc)
+{
+    RUBY_ASSERT(s <= p); /* callers pass s = start of the string buffer, p = candidate boundary position */
+    RUBY_ASSERT(p <= e); /* callers pass e = end of the string buffer; p may equal e */

+    return rb_enc_right_char_head(s, p, e, enc) == p; /* true only when rb_enc_right_char_head() returns p itself; used by callers to check that a prefix ending at p does not stop inside a character */
+}
+
 /* expect tail call optimization */
 // YJIT needs this function to never allocate and never raise
 static inline VALUE
--- a/string.c
+++ b/string.c
@@ -10472,7 +10472,7 @@ rb_str_start_with(int argc, VALUE *argv, VALUE str)
            p = RSTRING_PTR(str);
            e = p + slen;
            s = p + tlen;
-            if (!at_char_boundary(p, s, e, enc))
+            if (!at_char_right_boundary(p, s, e, enc))
                continue;
            if (memcmp(p, RSTRING_PTR(tmp), tlen) == 0)
                return Qtrue;
@@ -10554,7 +10554,7 @@ deleted_prefix_length(VALUE str, VALUE prefix)
        }
        const char *strend = strptr + olen;
        const char *after_prefix = strptr + prefixlen;
-        if (!at_char_boundary(strptr, after_prefix, strend, enc)) {
+        if (!at_char_right_boundary(strptr, after_prefix, strend, enc)) {
            /* prefix does not end at char-boundary */
            return 0;
        }