Bug 425915 - Use complex line breaker to identify word boundaries in SEAsian languages without interword spaces. r=m_kato

Differential Revision: https://phabricator.services.mozilla.com/D71206
This commit is contained in:
Jonathan Kew 2020-04-23 14:18:08 +00:00
Родитель 5ecbd2583e
Коммит 3751d26654
4 изменённых файлов: 118 добавлений и 40 удалений

Просмотреть файл

@ -5,10 +5,13 @@
#include "mozilla/intl/WordBreaker.h"
#include "mozilla/Preferences.h"
#include "nsComplexBreaker.h"
#include "nsUnicodeProperties.h"
using mozilla::intl::WordBreakClass;
using mozilla::intl::WordBreaker;
using mozilla::intl::WordRange;
using mozilla::unicode::GetScriptCode;
/*static*/
already_AddRefed<WordBreaker> WordBreaker::Create() {
@ -22,7 +25,20 @@ bool WordBreaker::BreakInBetween(const char16_t* aText1, uint32_t aTextLen1,
if (!aText1 || !aText2 || (0 == aTextLen1) || (0 == aTextLen2)) return false;
return GetClass(aText1[aTextLen1 - 1]) != GetClass(aText2[0]);
uint8_t c1 = GetClass(aText1[aTextLen1 - 1]);
uint8_t c2 = GetClass(aText2[0]);
if (c1 == c2 && kWbClassScriptioContinua == c1) {
nsAutoString text(aText1, aTextLen1);
text.Append(aText2, aTextLen2);
AutoTArray<uint8_t, 256> breakBefore;
breakBefore.SetLength(aTextLen1 + aTextLen2);
NS_GetComplexLineBreaks(text.get(), text.Length(), breakBefore.Elements());
bool ret = breakBefore[aTextLen1];
return ret;
}
return (c1 != c2);
}
#define IS_ASCII(c) (0 == (0xFF80 & (c)))
@ -40,7 +56,21 @@ bool WordBreaker::BreakInBetween(const char16_t* aText1, uint32_t aTextLen1,
#define IS_KATAKANA(c) ((0x30A0 <= (c)) && ((c) <= 0x30FF))
#define IS_HIRAGANA(c) ((0x3040 <= (c)) && ((c) <= 0x309F))
#define IS_HALFWIDTHKATAKANA(c) ((0xFF60 <= (c)) && ((c) <= 0xFF9F))
#define IS_THAI(c) (0x0E00 == (0xFF80 & (c))) // Look at the higest 9 bits
// Return true if aChar belongs to a SEAsian script that is written without
// word spaces, so we need to use the "complex breaker" to find possible word
// boundaries. (https://en.wikipedia.org/wiki/Scriptio_continua)
// (How well this works depends on the level of platform support for finding
// possible line breaks - or possible word boundaries - in the particular
// script. Thai, at least, works pretty well on the major desktop OSes. If
// the script is not supported by the platform, we just won't find any useful
// boundaries.)
static bool IsScriptioContinua(char16_t aChar) {
Script sc = GetScriptCode(aChar);
return sc == Script::THAI || sc == Script::MYANMAR || sc == Script::KHMER ||
sc == Script::JAVANESE || sc == Script::BALINESE ||
sc == Script::SUNDANESE || sc == Script::LAO;
}
/* static */
WordBreakClass WordBreaker::GetClass(char16_t c) {
@ -54,33 +84,37 @@ WordBreakClass WordBreaker::GetClass(char16_t c) {
if (IS_ASCII(c)) {
if (ASCII_IS_SPACE(c)) {
return kWbClassSpace;
} else if (ASCII_IS_ALPHA(c) || ASCII_IS_DIGIT(c) ||
(c == '_' && !sStopAtUnderscore)) {
return kWbClassAlphaLetter;
} else {
return kWbClassPunct;
}
} else if (IS_THAI(c)) {
return kWbClassThaiLetter;
} else if (c == 0x00A0 /*NBSP*/) {
if (ASCII_IS_ALPHA(c) || ASCII_IS_DIGIT(c) ||
(c == '_' && !sStopAtUnderscore)) {
return kWbClassAlphaLetter;
}
return kWbClassPunct;
}
if (c == 0x00A0 /*NBSP*/) {
return kWbClassSpace;
} else {
return kWbClassAlphaLetter;
}
} else {
if (IS_HAN(c)) {
return kWbClassHanLetter;
} else if (IS_KATAKANA(c)) {
return kWbClassKatakanaLetter;
} else if (IS_HIRAGANA(c)) {
return kWbClassHiraganaLetter;
} else if (IS_HALFWIDTHKATAKANA(c)) {
return kWbClassHWKatakanaLetter;
} else {
return kWbClassAlphaLetter;
if (IsScriptioContinua(c)) {
return kWbClassScriptioContinua;
}
return kWbClassAlphaLetter;
}
return static_cast<WordBreakClass>(0);
if (IS_HAN(c)) {
return kWbClassHanLetter;
}
if (IS_KATAKANA(c)) {
return kWbClassKatakanaLetter;
}
if (IS_HIRAGANA(c)) {
return kWbClassHiraganaLetter;
}
if (IS_HALFWIDTHKATAKANA(c)) {
return kWbClassHWKatakanaLetter;
}
if (IsScriptioContinua(c)) {
return kWbClassScriptioContinua;
}
return kWbClassAlphaLetter;
}
WordRange WordBreaker::FindWord(const char16_t* aText, uint32_t aTextLen,
@ -114,10 +148,30 @@ WordRange WordBreaker::FindWord(const char16_t* aText, uint32_t aTextLen,
break;
}
}
if (kWbClassThaiLetter == c) {
// need to call Thai word breaker from here
// we should pass the whole Thai segment to the thai word breaker to find a
if (kWbClassScriptioContinua == c) {
// we pass the whole text segment to the complex word breaker to find a
// shorter answer
AutoTArray<uint8_t, 256> breakBefore;
breakBefore.SetLength(range.mEnd - range.mBegin);
NS_GetComplexLineBreaks(aText + range.mBegin, range.mEnd - range.mBegin,
breakBefore.Elements());
// Scan forward
for (i = aOffset + 1; i < range.mEnd; i++) {
if (breakBefore[i - range.mBegin]) {
range.mEnd = i;
break;
}
}
// Scan backward
for (i = aOffset; i > range.mBegin; i--) {
if (breakBefore[i - range.mBegin]) {
range.mBegin = i;
break;
}
}
}
return range;
}
@ -126,18 +180,36 @@ int32_t WordBreaker::NextWord(const char16_t* aText, uint32_t aLen,
uint32_t aPos) {
WordBreakClass c1, c2;
uint32_t cur = aPos;
if (cur == aLen) return NS_WORDBREAKER_NEED_MORE_TEXT;
if (cur == aLen) {
return NS_WORDBREAKER_NEED_MORE_TEXT;
}
c1 = GetClass(aText[cur]);
for (cur++; cur < aLen; cur++) {
c2 = GetClass(aText[cur]);
if (c2 != c1) break;
if (c2 != c1) {
break;
}
}
if (kWbClassThaiLetter == c1) {
// need to call Thai word breaker from here
// we should pass the whole Thai segment to the thai word breaker to find a
if (kWbClassScriptioContinua == c1) {
// we pass the whole text segment to the complex word breaker to find a
// shorter answer
AutoTArray<uint8_t, 256> breakBefore;
breakBefore.SetLength(aLen - aPos);
NS_GetComplexLineBreaks(aText + aPos, aLen - aPos, breakBefore.Elements());
uint32_t i = 0;
while (i < cur - aPos && !breakBefore[i]) {
i++;
}
if (i < cur - aPos) {
return aPos + i;
}
}
if (cur == aLen) return NS_WORDBREAKER_NEED_MORE_TEXT;
if (cur == aLen) {
return NS_WORDBREAKER_NEED_MORE_TEXT;
}
return cur;
}

Просмотреть файл

@ -26,7 +26,7 @@ enum WordBreakClass : uint8_t {
kWbClassKatakanaLetter,
kWbClassHiraganaLetter,
kWbClassHWKatakanaLetter,
kWbClassThaiLetter
kWbClassScriptioContinua
};
class WordBreaker {

Просмотреть файл

@ -47,7 +47,9 @@ void NS_GetComplexLineBreaks(const char16_t* aText, uint32_t aLength,
&items[iItem].a, sla.Elements()) < 0)
return;
for (uint32_t j = 0; j + startOffset < endOffset; ++j) {
// We don't want to set a potential break position at the start of text;
// that's the responsibility of a higher level.
for (uint32_t j = startOffset ? 0 : 1; j + startOffset < endOffset; ++j) {
aBreakBefore[j + startOffset] = sla[j].fSoftBreak;
}
}

Просмотреть файл

@ -8025,11 +8025,15 @@ ClusterIterator::ClusterIterator(nsTextFrame* aTextFrame, int32_t aPosition,
aContext.Insert(str, 0);
}
mozilla::intl::WordBreaker* wordBreaker = nsContentUtils::WordBreaker();
for (int32_t i = 0; i <= textLen; ++i) {
int32_t indexInText = i + textStart;
mWordBreaks[i] |= wordBreaker->BreakInBetween(
aContext.get(), indexInText, aContext.get() + indexInText,
aContext.Length() - indexInText);
int32_t nextWord = textStart > 0 ? textStart - 1 : textStart;
while (true) {
nextWord =
wordBreaker->NextWord(aContext.get(), aContext.Length(), nextWord);
if (NS_WORDBREAKER_NEED_MORE_TEXT == nextWord ||
nextWord > textStart + textLen) {
break;
}
mWordBreaks[nextWord - textStart] = true;
}
}