From 44b25b872f926ba92bae9ddf2495e7f59d0c96e2 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Wed, 2 Jan 2008 01:49:58 -0800 Subject: [PATCH] utf8_width(): allow non NUL-terminated input The original interface assumed that the input string is always terminated with a NUL, but that wasn't too useful. Signed-off-by: Junio C Hamano --- utf8.c | 84 ++++++++++++++++++++++++++++++++++++---------------------- utf8.h | 4 +-- 2 files changed, 54 insertions(+), 34 deletions(-) diff --git a/utf8.c b/utf8.c index 24d2ec696a..dc3735364f 100644 --- a/utf8.c +++ b/utf8.c @@ -153,74 +153,94 @@ static int git_wcwidth(ucs_char_t ch) /* * Pick one ucs character starting from the location *start points at, * and return it, while updating the *start pointer to point at the - * end of that character. + * end of that character. When remainder_p is not NULL, the location + * holds the number of bytes remaining in the string that we are allowed + * to pick from. Otherwise we are allowed to pick up to the NUL that + * would eventually appear in the string. *remainder_p is also reduced + * by the number of bytes we have consumed. * * If the string was not a valid UTF-8, *start pointer is set to NULL * and the return value is undefined. */ -ucs_char_t pick_one_utf8_char(const char **start) +ucs_char_t pick_one_utf8_char(const char **start, size_t *remainder_p) { unsigned char *s = (unsigned char *)*start; ucs_char_t ch; + size_t remainder, incr; - if (*s < 0x80) { + /* + * A caller that assumes NUL terminated text can choose + * not to bother with the remainder length. We will + * stop at the first NUL. + */ + remainder = (remainder_p ? *remainder_p : 999); + + if (remainder < 1) { + goto invalid; + } else if (*s < 0x80) { /* 0xxxxxxx */ ch = *s; - *start += 1; + incr = 1; } else if ((s[0] & 0xe0) == 0xc0) { /* 110XXXXx 10xxxxxx */ - if ((s[1] & 0xc0) != 0x80 || - /* overlong? */ - (s[0] & 0xfe) == 0xc0) + if (remainder < 2 || + (s[1] & 0xc0) != 0x80 || + (s[0] & 0xfe) == 0xc0) goto invalid; ch = ((s[0] & 0x1f) << 6) | (s[1] & 0x3f); - *start += 2; + incr = 2; } else if ((s[0] & 0xf0) == 0xe0) { /* 1110XXXX 10Xxxxxx 10xxxxxx */ - if ((s[1] & 0xc0) != 0x80 || - (s[2] & 0xc0) != 0x80 || - /* overlong? */ - (s[0] == 0xe0 && (s[1] & 0xe0) == 0x80) || - /* surrogate? */ - (s[0] == 0xed && (s[1] & 0xe0) == 0xa0) || - /* U+FFFE or U+FFFF? */ - (s[0] == 0xef && s[1] == 0xbf && - (s[2] & 0xfe) == 0xbe)) + if (remainder < 3 || + (s[1] & 0xc0) != 0x80 || + (s[2] & 0xc0) != 0x80 || + /* overlong? */ + (s[0] == 0xe0 && (s[1] & 0xe0) == 0x80) || + /* surrogate? */ + (s[0] == 0xed && (s[1] & 0xe0) == 0xa0) || + /* U+FFFE or U+FFFF? */ + (s[0] == 0xef && s[1] == 0xbf && + (s[2] & 0xfe) == 0xbe)) goto invalid; ch = ((s[0] & 0x0f) << 12) | ((s[1] & 0x3f) << 6) | (s[2] & 0x3f); - *start += 3; + incr = 3; } else if ((s[0] & 0xf8) == 0xf0) { /* 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx */ - if ((s[1] & 0xc0) != 0x80 || - (s[2] & 0xc0) != 0x80 || - (s[3] & 0xc0) != 0x80 || - /* overlong? */ - (s[0] == 0xf0 && (s[1] & 0xf0) == 0x80) || - /* > U+10FFFF? */ - (s[0] == 0xf4 && s[1] > 0x8f) || s[0] > 0xf4) + if (remainder < 4 || + (s[1] & 0xc0) != 0x80 || + (s[2] & 0xc0) != 0x80 || + (s[3] & 0xc0) != 0x80 || + /* overlong? */ + (s[0] == 0xf0 && (s[1] & 0xf0) == 0x80) || + /* > U+10FFFF? */ + (s[0] == 0xf4 && s[1] > 0x8f) || s[0] > 0xf4) goto invalid; ch = ((s[0] & 0x07) << 18) | ((s[1] & 0x3f) << 12) | ((s[2] & 0x3f) << 6) | (s[3] & 0x3f); - *start += 4; + incr = 4; } else { invalid: *start = NULL; return 0; } + *start += incr; + if (remainder_p) + *remainder_p = remainder - incr; return ch; } /* * This function returns the number of columns occupied by the character * pointed to by the variable start. The pointer is updated to point at - * the next character. If it was not valid UTF-8, the pointer is set to - * NULL. + * the next character. When remainder_p is not NULL, it points at the + * location that stores the number of remaining bytes we can use to pick + * a character (see pick_one_utf8_char() above). */ -int utf8_width(const char **start) +int utf8_width(const char **start, size_t *remainder_p) { - ucs_char_t ch = pick_one_utf8_char(start); + ucs_char_t ch = pick_one_utf8_char(start, remainder_p); if (!*start) return 0; return git_wcwidth(ch); @@ -233,7 +253,7 @@ int is_utf8(const char *text) text++; continue; } - utf8_width(&text); + utf8_width(&text, NULL); if (!text) return 0; } @@ -293,7 +313,7 @@ int print_wrapped_text(const char *text, int indent, int indent2, int width) continue; } if (assume_utf8) - w += utf8_width(&text); + w += utf8_width(&text, NULL); else { w++; text++; diff --git a/utf8.h b/utf8.h index 4a7f0464c4..98cce1b038 100644 --- a/utf8.h +++ b/utf8.h @@ -3,8 +3,8 @@ typedef unsigned int ucs_char_t; /* assuming 32bit int */ -ucs_char_t pick_one_utf8_char(const char **start); -int utf8_width(const char **start); +ucs_char_t pick_one_utf8_char(const char **start, size_t *remainder_p); +int utf8_width(const char **start, size_t *remainder_p); int is_utf8(const char *text); int is_encoding_utf8(const char *name);