utf8_width(): allow non NUL-terminated input

The original interface assumed that the input string is always terminated with a NUL, but that wasn't too useful. Signed-off-by: Junio C Hamano <gitster@pobox.com>
2008-01-02 01:49:58 -08:00 · 2008-01-02 01:49:58 -08:00 · 44b25b872f
--- a/utf8.c
+++ b/utf8.c
@ -153,74 +153,94 @@ static int git_wcwidth(ucs_char_t ch)
 /*
 * Pick one ucs character starting from the location *start points at,
 * and return it, while updating the *start pointer to point at the
- * end of that character.
+ * end of that character.  When remainder_p is not NULL, the location
+ * holds the number of bytes remaining in the string that we are allowed
+ * to pick from.  Otherwise we are allowed to pick up to the NUL that
+ * would eventually appear in the string.  *remainder_p is also reduced
+ * by the number of bytes we have consumed.
 *
 * If the string was not a valid UTF-8, *start pointer is set to NULL
 * and the return value is undefined.
 */
-ucs_char_t pick_one_utf8_char(const char **start)
+ucs_char_t pick_one_utf8_char(const char **start, size_t *remainder_p)
 {
 	unsigned char *s = (unsigned char *)*start;
 	ucs_char_t ch;
+	size_t remainder, incr;

-	if (*s < 0x80) {
+	/*
+	 * A caller that assumes NUL terminated text can choose
+	 * not to bother with the remainder length.  We will
+	 * stop at the first NUL.
+	 */
+	remainder = (remainder_p ? *remainder_p : 999);
+
+	if (remainder < 1) {
+		goto invalid;
+	} else if (*s < 0x80) {
 		/* 0xxxxxxx */
 		ch = *s;
-		*start += 1;
+		incr = 1;
 	} else if ((s[0] & 0xe0) == 0xc0) {
 		/* 110XXXXx 10xxxxxx */
-		if ((s[1] & 0xc0) != 0x80 ||
-				/* overlong? */
-				(s[0] & 0xfe) == 0xc0)
+		if (remainder < 2 ||
+		    (s[1] & 0xc0) != 0x80 ||
+		    (s[0] & 0xfe) == 0xc0)
 			goto invalid;
 		ch = ((s[0] & 0x1f) << 6) | (s[1] & 0x3f);
-		*start += 2;
+		incr = 2;
 	} else if ((s[0] & 0xf0) == 0xe0) {
 		/* 1110XXXX 10Xxxxxx 10xxxxxx */
-		if ((s[1] & 0xc0) != 0x80 ||
-				(s[2] & 0xc0) != 0x80 ||
-				/* overlong? */
-				(s[0] == 0xe0 && (s[1] & 0xe0) == 0x80) ||
-				/* surrogate? */
-				(s[0] == 0xed && (s[1] & 0xe0) == 0xa0) ||
-				/* U+FFFE or U+FFFF? */
-				(s[0] == 0xef && s[1] == 0xbf &&
-				 (s[2] & 0xfe) == 0xbe))
+		if (remainder < 3 ||
+		    (s[1] & 0xc0) != 0x80 ||
+		    (s[2] & 0xc0) != 0x80 ||
+		    /* overlong? */
+		    (s[0] == 0xe0 && (s[1] & 0xe0) == 0x80) ||
+		    /* surrogate? */
+		    (s[0] == 0xed && (s[1] & 0xe0) == 0xa0) ||
+		    /* U+FFFE or U+FFFF? */
+		    (s[0] == 0xef && s[1] == 0xbf &&
+		     (s[2] & 0xfe) == 0xbe))
 			goto invalid;
 		ch = ((s[0] & 0x0f) << 12) |
 			((s[1] & 0x3f) << 6) | (s[2] & 0x3f);
-		*start += 3;
+		incr = 3;
 	} else if ((s[0] & 0xf8) == 0xf0) {
 		/* 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx */
-		if ((s[1] & 0xc0) != 0x80 ||
-				(s[2] & 0xc0) != 0x80 ||
-				(s[3] & 0xc0) != 0x80 ||
-				/* overlong? */
-				(s[0] == 0xf0 && (s[1] & 0xf0) == 0x80) ||
-				/* > U+10FFFF? */
-				(s[0] == 0xf4 && s[1] > 0x8f) || s[0] > 0xf4)
+		if (remainder < 4 ||
+		    (s[1] & 0xc0) != 0x80 ||
+		    (s[2] & 0xc0) != 0x80 ||
+		    (s[3] & 0xc0) != 0x80 ||
+		    /* overlong? */
+		    (s[0] == 0xf0 && (s[1] & 0xf0) == 0x80) ||
+		    /* > U+10FFFF? */
+		    (s[0] == 0xf4 && s[1] > 0x8f) || s[0] > 0xf4)
 			goto invalid;
 		ch = ((s[0] & 0x07) << 18) | ((s[1] & 0x3f) << 12) |
 			((s[2] & 0x3f) << 6) | (s[3] & 0x3f);
-		*start += 4;
+		incr = 4;
 	} else {
 invalid:
 		*start = NULL;
 		return 0;
 	}

+	*start += incr;
+	if (remainder_p)
+		*remainder_p = remainder - incr;
 	return ch;
 }

 /*
 * This function returns the number of columns occupied by the character
 * pointed to by the variable start. The pointer is updated to point at
- * the next character.  If it was not valid UTF-8, the pointer is set to
- * NULL.
+ * the next character. When remainder_p is not NULL, it points at the
+ * location that stores the number of remaining bytes we can use to pick
+ * a character (see pick_one_utf8_char() above).
 */
-int utf8_width(const char **start)
+int utf8_width(const char **start, size_t *remainder_p)
 {
-	ucs_char_t ch = pick_one_utf8_char(start);
+	ucs_char_t ch = pick_one_utf8_char(start, remainder_p);
 	if (!*start)
 		return 0;
 	return git_wcwidth(ch);
@ -233,7 +253,7 @@ int is_utf8(const char *text)
 			text++;
 			continue;
 		}
-		utf8_width(&text);
+		utf8_width(&text, NULL);
 		if (!text)
 			return 0;
 	}
@ -293,7 +313,7 @@ int print_wrapped_text(const char *text, int indent, int indent2, int width)
 			continue;
 		}
 		if (assume_utf8)
-			w += utf8_width(&text);
+			w += utf8_width(&text, NULL);
 		else {
 			w++;
 			text++;
--- a/utf8.h
+++ b/utf8.h
@ -3,8 +3,8 @@

 typedef unsigned int ucs_char_t;  /* assuming 32bit int */

-ucs_char_t pick_one_utf8_char(const char **start);
-int utf8_width(const char **start);
+ucs_char_t pick_one_utf8_char(const char **start, size_t *remainder_p);
+int utf8_width(const char **start, size_t *remainder_p);
 int is_utf8(const char *text);
 int is_encoding_utf8(const char *name);