/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ /* vim: set ts=8 sts=2 et sw=2 tw=80: */ /* This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this file, * You can obtain one at http://mozilla.org/MPL/2.0/. */ /* Classes to iterate over grapheme, word, sentence, or line. */ #include "mozilla/intl/Segmenter.h" #include "mozilla/intl/LineBreaker.h" #include "mozilla/intl/WordBreaker.h" #include "mozilla/intl/UnicodeProperties.h" #include "nsUnicodeProperties.h" #include "nsCharTraits.h" using namespace mozilla::unicode; namespace mozilla::intl { SegmentIteratorUtf16::SegmentIteratorUtf16(Span aText) : mText(aText) {} Maybe SegmentIteratorUtf16::Seek(uint32_t aPos) { if (mPos < aPos) { mPos = aPos; } return Next(); } LineBreakIteratorUtf16::LineBreakIteratorUtf16(Span aText, const LineBreakOptions& aOptions) : SegmentIteratorUtf16(aText), mOptions(aOptions) {} Maybe LineBreakIteratorUtf16::Next() { const int32_t nextPos = LineBreaker::Next(mText.Elements(), mText.Length(), mPos); if (nextPos == NS_LINEBREAKER_NEED_MORE_TEXT) { return Nothing(); } mPos = nextPos; return Some(mPos); } WordBreakIteratorUtf16::WordBreakIteratorUtf16(Span aText) : SegmentIteratorUtf16(aText) {} Maybe WordBreakIteratorUtf16::Next() { const int32_t nextPos = WordBreaker::Next(mText.Elements(), mText.Length(), mPos); if (nextPos == NS_WORDBREAKER_NEED_MORE_TEXT) { return Nothing(); } mPos = nextPos; return Some(mPos); } GraphemeClusterBreakIteratorUtf16::GraphemeClusterBreakIteratorUtf16( Span aText) : SegmentIteratorUtf16(aText) {} enum HSType { HST_NONE = U_HST_NOT_APPLICABLE, HST_L = U_HST_LEADING_JAMO, HST_V = U_HST_VOWEL_JAMO, HST_T = U_HST_TRAILING_JAMO, HST_LV = U_HST_LV_SYLLABLE, HST_LVT = U_HST_LVT_SYLLABLE }; static HSType GetHangulSyllableType(uint32_t aCh) { return HSType(UnicodeProperties::GetIntPropertyValue( aCh, UnicodeProperties::IntProperty::HangulSyllableType)); } Maybe GraphemeClusterBreakIteratorUtf16::Next() { const auto len = mText.Length(); if (mPos >= len) { // The iterator has already reached the end. return Nothing(); } uint32_t ch = mText[mPos++]; if (mPos < len && NS_IS_SURROGATE_PAIR(ch, mText[mPos])) { ch = SURROGATE_TO_UCS4(ch, mText[mPos++]); } else if ((ch & ~0xff) == 0x1100 || (ch >= 0xa960 && ch <= 0xa97f) || (ch >= 0xac00 && ch <= 0xd7ff)) { // Handle conjoining Jamo that make Hangul syllables HSType hangulState = GetHangulSyllableType(ch); while (mPos < len) { ch = mText[mPos]; HSType hangulType = GetHangulSyllableType(ch); switch (hangulType) { case HST_L: case HST_LV: case HST_LVT: if (hangulState == HST_L) { hangulState = hangulType; mPos++; continue; } break; case HST_V: if ((hangulState != HST_NONE) && (hangulState != HST_T) && (hangulState != HST_LVT)) { hangulState = hangulType; mPos++; continue; } break; case HST_T: if (hangulState != HST_NONE && hangulState != HST_L) { hangulState = hangulType; mPos++; continue; } break; default: break; } break; } } const uint32_t kVS16 = 0xfe0f; const uint32_t kZWJ = 0x200d; // UTF-16 surrogate values for Fitzpatrick type modifiers const uint32_t kFitzpatrickHigh = 0xD83C; const uint32_t kFitzpatrickLowFirst = 0xDFFB; const uint32_t kFitzpatrickLowLast = 0xDFFF; bool baseIsEmoji = (GetEmojiPresentation(ch) == EmojiDefault) || (GetEmojiPresentation(ch) == TextDefault && ((mPos < len && mText[mPos] == kVS16) || (mPos + 1 < len && mText[mPos] == kFitzpatrickHigh && mText[mPos + 1] >= kFitzpatrickLowFirst && mText[mPos + 1] <= kFitzpatrickLowLast))); bool prevWasZwj = false; while (mPos < len) { ch = mText[mPos]; size_t chLen = 1; // Check for surrogate pairs; note that isolated surrogates will just // be treated as generic (non-cluster-extending) characters here, // which is fine for cluster-iterating purposes if (mPos < len - 1 && NS_IS_SURROGATE_PAIR(ch, mText[mPos + 1])) { ch = SURROGATE_TO_UCS4(ch, mText[mPos + 1]); chLen = 2; } bool extendCluster = IsClusterExtender(ch) || (baseIsEmoji && prevWasZwj && ((GetEmojiPresentation(ch) == EmojiDefault) || (GetEmojiPresentation(ch) == TextDefault && mPos + chLen < len && mText[mPos + chLen] == kVS16))); if (!extendCluster) { break; } prevWasZwj = (ch == kZWJ); mPos += chLen; } MOZ_ASSERT(mPos <= len, "Next() has overshot the string!"); return Some(mPos); } GraphemeClusterBreakReverseIteratorUtf16:: GraphemeClusterBreakReverseIteratorUtf16(Span aText) : SegmentIteratorUtf16(aText) { mPos = mText.Length(); } Maybe GraphemeClusterBreakReverseIteratorUtf16::Next() { if (mPos == 0) { return Nothing(); } uint32_t ch; do { ch = mText[--mPos]; if (mPos > 0 && NS_IS_SURROGATE_PAIR(mText[mPos - 1], ch)) { ch = SURROGATE_TO_UCS4(mText[--mPos], ch); } if (!IsClusterExtender(ch)) { break; } } while (mPos > 0); // XXX May need to handle conjoining Jamo return Some(mPos); } Maybe GraphemeClusterBreakReverseIteratorUtf16::Seek(uint32_t aPos) { if (mPos > aPos) { mPos = aPos; } return Next(); } Result, ICUError> Segmenter::TryCreate( Span aLocale, const SegmenterOptions& aOptions) { if (aOptions.mGranularity == SegmenterGranularity::Sentence) { // Grapheme and Sentence iterator are not yet implemented. return Err(ICUError::InternalError); } return MakeUnique(aLocale, aOptions); } UniquePtr Segmenter::Segment( Span aText) const { switch (mOptions.mGranularity) { case SegmenterGranularity::Grapheme: return MakeUnique(aText); case SegmenterGranularity::Sentence: MOZ_ASSERT_UNREACHABLE("Unimplemented yet!"); return nullptr; case SegmenterGranularity::Word: return MakeUnique(aText); case SegmenterGranularity::Line: return MakeUnique(aText); } MOZ_ASSERT_UNREACHABLE("All granularities must be handled!"); return nullptr; } } // namespace mozilla::intl