From f7e759c4160fd701fa811484d5e63e9ef4f47aaa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Florian=20Qu=C3=A8ze?= Date: Thu, 28 Jun 2012 16:48:14 +0200 Subject: [PATCH] Bug 754824 - The highlight is off by a few characters in the search result view when some characters are UTF8 encoded on 4 bytes - Follow-up to only match UTF-16 surrogate halves, r=asuth. --- chat/protocols/irc/irc.js | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/chat/protocols/irc/irc.js b/chat/protocols/irc/irc.js index 63385657a7..008818edff 100644 --- a/chat/protocols/irc/irc.js +++ b/chat/protocols/irc/irc.js @@ -834,11 +834,14 @@ ircAccount.prototype = { // Count the number of bytes in a UTF-8 encoded string. function charCodeToByteCount(c) { - // Unicode characters with a code point > 127 are 2 bytes long. - // Unicode characters with a code point > 2047 are 3 bytes long. - // Unicode characters with a code point >= 32768 are on 4 bytes, - // split by JS to 2 UTF16 characters of 2 bytes. - return c < 128 ? 1 : (c < 2048 || c >= 32768) ? 2 : 3; + // UTF-8 stores: + // - code points below U+0080 on 1 byte, + // - code points below U+0800 on 2 bytes, + // - code points U+D800 through U+DFFF are UTF-16 surrogate halves + // (they indicate that JS has split a 4-byte UTF-8 character + // into two halves of 2 bytes each), + // - other code points on 3 bytes. + return c < 0x80 ? 1 : (c < 0x800 || (c >= 0xD800 && c <= 0xDFFF)) ? 2 : 3; } let bytes = 0; for (let i = 0; i < aStr.length; i++)