Bug 754824 - The highlight is off by a few characters in the search result view when some characters are UTF8 encoded on 4 bytes - Follow-up to only match UTF-16 surrogate halves, r=asuth.

2012-06-28 16:48:14 +02:00 · 2012-06-28 16:48:14 +02:00 · f7e759c416
--- a/chat/protocols/irc/irc.js
+++ b/chat/protocols/irc/irc.js
@@ -834,11 +834,14 @@ ircAccount.prototype = {

    // Count the number of bytes in a UTF-8 encoded string.
    function charCodeToByteCount(c) {
-      // Unicode characters with a code point >  127 are 2 bytes long.
-      // Unicode characters with a code point > 2047 are 3 bytes long.
-      // Unicode characters with a code point >= 32768 are on 4 bytes,
-      // split by JS to 2 UTF16 characters of 2 bytes.
-      return c < 128 ? 1 : (c < 2048 || c >= 32768) ? 2 : 3;
+      // UTF-8 stores:
+      // - code points below U+0080 on 1 byte,
+      // - code points below U+0800 on 2 bytes,
+      // - code units U+D800 through U+DFFF are UTF-16 surrogate halves
+      // (JS splits a character that takes 4 bytes in UTF-8 into two
+      // such halves, so each half is counted as 2 bytes),
+      // - other code points on 3 bytes.
+      return c < 0x80 ? 1 : (c < 0x800 || (c >= 0xD800 && c <= 0xDFFF)) ? 2 : 3;
    }
    let bytes = 0;
    for (let i = 0; i < aStr.length; i++)