From 3751d2665490eac0595ad0cea32c91577bf9927b Mon Sep 17 00:00:00 2001
From: Jonathan Kew
Date: Thu, 23 Apr 2020 14:18:08 +0000
Subject: [PATCH] Bug 425915 - Use complex line breaker to identify word boundaries in SEAsian languages without interword spaces. r=m_kato

WordBreaker used to have a Thai-only character class whose segmentation
step was an empty stub. This replaces it with kWbClassScriptioContinua,
assigned to Thai, Lao, Khmer, Myanmar, Javanese, Balinese and Sundanese
characters, and asks NS_GetComplexLineBreaks for candidate boundaries
inside a run of such characters in BreakInBetween, FindWord and NextWord.

ClusterIterator now walks the text with NextWord instead of calling
BreakInBetween at every offset, and the Uniscribe backend no longer
reports a break before the first character of the text it is given.

NextWord starts scanning the breaker output at offset 1. A break reported
at offset 0 would make it return aPos unchanged, and the ClusterIterator
loop, which feeds each result back in as the next start position, would
then never advance.

Differential Revision: https://phabricator.services.mozilla.com/D71206
---
 intl/lwbrk/WordBreaker.cpp        | 138 +++++++++++++++++++++++-------
 intl/lwbrk/WordBreaker.h          |   2 +-
 intl/lwbrk/nsUniscribeBreaker.cpp |   4 +-
 layout/generic/nsTextFrame.cpp    |  14 +--
 4 files changed, 118 insertions(+), 40 deletions(-)

diff --git a/intl/lwbrk/WordBreaker.cpp b/intl/lwbrk/WordBreaker.cpp
index 0b7a51870b8f..4504d6827267 100644
--- a/intl/lwbrk/WordBreaker.cpp
+++ b/intl/lwbrk/WordBreaker.cpp
@@ -5,10 +5,13 @@
 
 #include "mozilla/intl/WordBreaker.h"
 #include "mozilla/Preferences.h"
+#include "nsComplexBreaker.h"
+#include "nsUnicodeProperties.h"
 
 using mozilla::intl::WordBreakClass;
 using mozilla::intl::WordBreaker;
 using mozilla::intl::WordRange;
+using mozilla::unicode::GetScriptCode;
 
 /*static*/
 already_AddRefed<WordBreaker> WordBreaker::Create() {
@@ -22,7 +25,20 @@ bool WordBreaker::BreakInBetween(const char16_t* aText1, uint32_t aTextLen1,
 
   if (!aText1 || !aText2 || (0 == aTextLen1) || (0 == aTextLen2)) return false;
 
-  return GetClass(aText1[aTextLen1 - 1]) != GetClass(aText2[0]);
+  uint8_t c1 = GetClass(aText1[aTextLen1 - 1]);
+  uint8_t c2 = GetClass(aText2[0]);
+
+  if (c1 == c2 && kWbClassScriptioContinua == c1) {
+    nsAutoString text(aText1, aTextLen1);
+    text.Append(aText2, aTextLen2);
+    AutoTArray<uint8_t, 256> breakBefore;
+    breakBefore.SetLength(aTextLen1 + aTextLen2);
+    NS_GetComplexLineBreaks(text.get(), text.Length(), breakBefore.Elements());
+    bool ret = breakBefore[aTextLen1];
+    return ret;
+  }
+
+  return (c1 != c2);
 }
 
 #define IS_ASCII(c) (0 == (0xFF80 & (c)))
@@ -40,7 +56,21 @@ bool WordBreaker::BreakInBetween(const char16_t* aText1, uint32_t aTextLen1,
 #define IS_KATAKANA(c) ((0x30A0 <= (c)) && ((c) <= 0x30FF))
 #define IS_HIRAGANA(c) ((0x3040 <= (c)) && ((c) <= 0x309F))
 #define IS_HALFWIDTHKATAKANA(c) ((0xFF60 <= (c)) && ((c) <= 0xFF9F))
-#define IS_THAI(c) (0x0E00 == (0xFF80 & (c)))  // Look at the higest 9 bits
+
+// Return true if aChar belongs to a SEAsian script that is written without
+// word spaces, so we need to use the "complex breaker" to find possible word
+// boundaries. (https://en.wikipedia.org/wiki/Scriptio_continua)
+// (How well this works depends on the level of platform support for finding
+// possible line breaks - or possible word boundaries - in the particular
+// script. Thai, at least, works pretty well on the major desktop OSes. If
+// the script is not supported by the platform, we just won't find any useful
+// boundaries.)
+static bool IsScriptioContinua(char16_t aChar) {
+  Script sc = GetScriptCode(aChar);
+  return sc == Script::THAI || sc == Script::MYANMAR || sc == Script::KHMER ||
+         sc == Script::JAVANESE || sc == Script::BALINESE ||
+         sc == Script::SUNDANESE || sc == Script::LAO;
+}
 
 /* static */
 WordBreakClass WordBreaker::GetClass(char16_t c) {
@@ -54,33 +84,37 @@ WordBreakClass WordBreaker::GetClass(char16_t c) {
     if (IS_ASCII(c)) {
       if (ASCII_IS_SPACE(c)) {
         return kWbClassSpace;
-      } else if (ASCII_IS_ALPHA(c) || ASCII_IS_DIGIT(c) ||
-                 (c == '_' && !sStopAtUnderscore)) {
-        return kWbClassAlphaLetter;
-      } else {
-        return kWbClassPunct;
       }
-    } else if (IS_THAI(c)) {
-      return kWbClassThaiLetter;
-    } else if (c == 0x00A0 /*NBSP*/) {
+      if (ASCII_IS_ALPHA(c) || ASCII_IS_DIGIT(c) ||
+          (c == '_' && !sStopAtUnderscore)) {
+        return kWbClassAlphaLetter;
+      }
+      return kWbClassPunct;
+    }
+    if (c == 0x00A0 /*NBSP*/) {
       return kWbClassSpace;
-    } else {
-      return kWbClassAlphaLetter;
     }
-  } else {
-    if (IS_HAN(c)) {
-      return kWbClassHanLetter;
-    } else if (IS_KATAKANA(c)) {
-      return kWbClassKatakanaLetter;
-    } else if (IS_HIRAGANA(c)) {
-      return kWbClassHiraganaLetter;
-    } else if (IS_HALFWIDTHKATAKANA(c)) {
-      return kWbClassHWKatakanaLetter;
-    } else {
-      return kWbClassAlphaLetter;
+    if (IsScriptioContinua(c)) {
+      return kWbClassScriptioContinua;
     }
+    return kWbClassAlphaLetter;
   }
-  return static_cast<WordBreakClass>(0);
+  if (IS_HAN(c)) {
+    return kWbClassHanLetter;
+  }
+  if (IS_KATAKANA(c)) {
+    return kWbClassKatakanaLetter;
+  }
+  if (IS_HIRAGANA(c)) {
+    return kWbClassHiraganaLetter;
+  }
+  if (IS_HALFWIDTHKATAKANA(c)) {
+    return kWbClassHWKatakanaLetter;
+  }
+  if (IsScriptioContinua(c)) {
+    return kWbClassScriptioContinua;
+  }
+  return kWbClassAlphaLetter;
 }
 
 WordRange WordBreaker::FindWord(const char16_t* aText, uint32_t aTextLen,
@@ -114,10 +148,30 @@ WordRange WordBreaker::FindWord(const char16_t* aText, uint32_t aTextLen,
       break;
     }
   }
-  if (kWbClassThaiLetter == c) {
-    // need to call Thai word breaker from here
-    // we should pass the whole Thai segment to the thai word breaker to find a
+
+  if (kWbClassScriptioContinua == c) {
+    // we pass the whole text segment to the complex word breaker to find a
     // shorter answer
+    AutoTArray<uint8_t, 256> breakBefore;
+    breakBefore.SetLength(range.mEnd - range.mBegin);
+    NS_GetComplexLineBreaks(aText + range.mBegin, range.mEnd - range.mBegin,
+                            breakBefore.Elements());
+
+    // Scan forward
+    for (i = aOffset + 1; i < range.mEnd; i++) {
+      if (breakBefore[i - range.mBegin]) {
+        range.mEnd = i;
+        break;
+      }
+    }
+
+    // Scan backward
+    for (i = aOffset; i > range.mBegin; i--) {
+      if (breakBefore[i - range.mBegin]) {
+        range.mBegin = i;
+        break;
+      }
+    }
   }
   return range;
 }
@@ -126,18 +180,36 @@ int32_t WordBreaker::NextWord(const char16_t* aText, uint32_t aLen,
                               uint32_t aPos) {
   WordBreakClass c1, c2;
   uint32_t cur = aPos;
-  if (cur == aLen) return NS_WORDBREAKER_NEED_MORE_TEXT;
+  if (cur == aLen) {
+    return NS_WORDBREAKER_NEED_MORE_TEXT;
+  }
   c1 = GetClass(aText[cur]);
 
   for (cur++; cur < aLen; cur++) {
     c2 = GetClass(aText[cur]);
-    if (c2 != c1) break;
+    if (c2 != c1) {
+      break;
+    }
   }
-  if (kWbClassThaiLetter == c1) {
-    // need to call Thai word breaker from here
-    // we should pass the whole Thai segment to the thai word breaker to find a
+
+  if (kWbClassScriptioContinua == c1) {
+    // we pass the whole text segment to the complex word breaker to find a
     // shorter answer
+    AutoTArray<uint8_t, 256> breakBefore;
+    breakBefore.SetLength(aLen - aPos);
+    NS_GetComplexLineBreaks(aText + aPos, aLen - aPos, breakBefore.Elements());
+    uint32_t i = 1;  // skip offset 0: returning aPos would not advance
+    while (i < cur - aPos && !breakBefore[i]) {
+      i++;
+    }
+    if (i < cur - aPos) {
+      return aPos + i;
+    }
   }
-  if (cur == aLen) return NS_WORDBREAKER_NEED_MORE_TEXT;
+
+  if (cur == aLen) {
+    return NS_WORDBREAKER_NEED_MORE_TEXT;
+  }
+
   return cur;
 }
diff --git a/intl/lwbrk/WordBreaker.h b/intl/lwbrk/WordBreaker.h
index 0729a5280dab..57cb4b18b784 100644
--- a/intl/lwbrk/WordBreaker.h
+++ b/intl/lwbrk/WordBreaker.h
@@ -26,7 +26,7 @@ enum WordBreakClass : uint8_t {
   kWbClassKatakanaLetter,
   kWbClassHiraganaLetter,
   kWbClassHWKatakanaLetter,
-  kWbClassThaiLetter
+  kWbClassScriptioContinua
 };
 
 class WordBreaker {
diff --git a/intl/lwbrk/nsUniscribeBreaker.cpp b/intl/lwbrk/nsUniscribeBreaker.cpp
index 2d57d52e0fca..d9950be64aa6 100644
--- a/intl/lwbrk/nsUniscribeBreaker.cpp
+++ b/intl/lwbrk/nsUniscribeBreaker.cpp
@@ -47,7 +47,9 @@ void NS_GetComplexLineBreaks(const char16_t* aText, uint32_t aLength,
                     &items[iItem].a, sla.Elements()) < 0)
       return;
 
-    for (uint32_t j = 0; j + startOffset < endOffset; ++j) {
+    // We don't want to set a potential break position at the start of text;
+    // that's the responsibility of a higher level.
+    for (uint32_t j = startOffset ? 0 : 1; j + startOffset < endOffset; ++j) {
       aBreakBefore[j + startOffset] = sla[j].fSoftBreak;
     }
   }
diff --git a/layout/generic/nsTextFrame.cpp b/layout/generic/nsTextFrame.cpp
index 8cd2c2a1654f..34e028148e45 100644
--- a/layout/generic/nsTextFrame.cpp
+++ b/layout/generic/nsTextFrame.cpp
@@ -8025,11 +8025,15 @@ ClusterIterator::ClusterIterator(nsTextFrame* aTextFrame, int32_t aPosition,
     aContext.Insert(str, 0);
   }
   mozilla::intl::WordBreaker* wordBreaker = nsContentUtils::WordBreaker();
-  for (int32_t i = 0; i <= textLen; ++i) {
-    int32_t indexInText = i + textStart;
-    mWordBreaks[i] |= wordBreaker->BreakInBetween(
-        aContext.get(), indexInText, aContext.get() + indexInText,
-        aContext.Length() - indexInText);
+  int32_t nextWord = textStart > 0 ? textStart - 1 : textStart;
+  while (true) {
+    nextWord =
+        wordBreaker->NextWord(aContext.get(), aContext.Length(), nextWord);
+    if (NS_WORDBREAKER_NEED_MORE_TEXT == nextWord ||
+        nextWord > textStart + textLen) {
+      break;
+    }
+    mWordBreaks[nextWord - textStart] = true;
   }
 }
 