From f921df7ad19f812c795d2fcbf17b0b8145086085 Mon Sep 17 00:00:00 2001 From: Milan Burda Date: Fri, 26 Oct 2018 19:34:57 +0200 Subject: [PATCH] chore: move spellcheck out of chromium_src (#15407) --- BUILD.gn | 1 + .../api/atom_api_spell_check_client.cc | 2 +- .../api/atom_api_spell_check_client.h | 2 +- .../spellchecker/spellcheck_worditerator.cc | 429 ------------------ .../spellchecker/spellcheck_worditerator.h | 199 -------- filenames.gni | 2 - 6 files changed, 3 insertions(+), 632 deletions(-) delete mode 100644 chromium_src/chrome/renderer/spellchecker/spellcheck_worditerator.cc delete mode 100644 chromium_src/chrome/renderer/spellchecker/spellcheck_worditerator.h diff --git a/BUILD.gn b/BUILD.gn index e66e1dfc7d..9782fe4cdb 100644 --- a/BUILD.gn +++ b/BUILD.gn @@ -213,6 +213,7 @@ static_library("electron_lib") { "//components/net_log", "//components/network_session_configurator/common", "//components/prefs", + "//components/spellcheck/renderer", "//components/viz/service", "//content/public/app:both", "//content/public/browser", diff --git a/atom/renderer/api/atom_api_spell_check_client.cc b/atom/renderer/api/atom_api_spell_check_client.cc index 848f0f9282..329884bbfe 100644 --- a/atom/renderer/api/atom_api_spell_check_client.cc +++ b/atom/renderer/api/atom_api_spell_check_client.cc @@ -10,7 +10,7 @@ #include "atom/common/native_mate_converters/string16_converter.h" #include "base/logging.h" #include "base/threading/thread_task_runner_handle.h" -#include "chrome/renderer/spellchecker/spellcheck_worditerator.h" +#include "components/spellcheck/renderer/spellcheck_worditerator.h" #include "native_mate/converter.h" #include "native_mate/dictionary.h" #include "native_mate/function_template.h" diff --git a/atom/renderer/api/atom_api_spell_check_client.h b/atom/renderer/api/atom_api_spell_check_client.h index aa2b4214ad..a4ff1b0f2e 100644 --- a/atom/renderer/api/atom_api_spell_check_client.h +++ b/atom/renderer/api/atom_api_spell_check_client.h @@ -11,7 +11,7 @@ #include "base/callback.h" #include "base/memory/weak_ptr.h" -#include "chrome/renderer/spellchecker/spellcheck_worditerator.h" +#include "components/spellcheck/renderer/spellcheck_worditerator.h" #include "native_mate/scoped_persistent.h" #include "third_party/blink/public/platform/web_spell_check_panel_host_client.h" #include "third_party/blink/public/platform/web_vector.h" diff --git a/chromium_src/chrome/renderer/spellchecker/spellcheck_worditerator.cc b/chromium_src/chrome/renderer/spellchecker/spellcheck_worditerator.cc deleted file mode 100644 index af2a12c3ec..0000000000 --- a/chromium_src/chrome/renderer/spellchecker/spellcheck_worditerator.cc +++ /dev/null @@ -1,429 +0,0 @@ -// Copyright (c) 2012 The Chromium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -// Implements a custom word iterator used for our spellchecker. - -#include "chrome/renderer/spellchecker/spellcheck_worditerator.h" - -#include -#include -#include -#include - -#include "base/i18n/break_iterator.h" -#include "base/logging.h" -#include "base/macros.h" -#include "base/strings/stringprintf.h" -#include "base/strings/utf_string_conversions.h" -#include "third_party/icu/source/common/unicode/normlzr.h" -#include "third_party/icu/source/common/unicode/schriter.h" -#include "third_party/icu/source/common/unicode/uscript.h" -#include "third_party/icu/source/i18n/unicode/ulocdata.h" - -// SpellcheckCharAttribute implementation: - -SpellcheckCharAttribute::SpellcheckCharAttribute() - : script_code_(USCRIPT_LATIN) {} - -SpellcheckCharAttribute::~SpellcheckCharAttribute() {} - -void SpellcheckCharAttribute::SetDefaultLanguage(const std::string& language) { - CreateRuleSets(language); -} - -base::string16 SpellcheckCharAttribute::GetRuleSet( - bool allow_contraction) const { - return allow_contraction ? ruleset_allow_contraction_ - : ruleset_disallow_contraction_; -} - -void SpellcheckCharAttribute::CreateRuleSets(const std::string& language) { - // The template for our custom rule sets, which is based on the word-break - // rules of ICU 4.0: - // . - // The major differences from the original one are listed below: - // * It discards comments in the original rules. - // * It discards characters not needed by our spellchecker (e.g. numbers, - // punctuation characters, Hiraganas, Katakanas, CJK Ideographs, and so on). - // * It allows customization of the $ALetter value (i.e. word characters). - // * It allows customization of the $ALetterPlus value (i.e. whether or not to - // use the dictionary data). - // * It allows choosing whether or not to split a text at contraction - // characters. - // This template only changes the forward-iteration rules. So, calling - // ubrk_prev() returns the same results as the original template. - static const char kRuleTemplate[] = - "!!chain;" - "$CR = [\\p{Word_Break = CR}];" - "$LF = [\\p{Word_Break = LF}];" - "$Newline = [\\p{Word_Break = Newline}];" - "$Extend = [\\p{Word_Break = Extend}];" - "$Format = [\\p{Word_Break = Format}];" - "$Katakana = [\\p{Word_Break = Katakana}];" - // Not all the characters in a given script are ALetter. - // For instance, U+05F4 is MidLetter. So, this may be - // better, but it leads to an empty set error in Thai. - // "$ALetter = [[\\p{script=%s}] & [\\p{Word_Break = ALetter}]];" - "$ALetter = [\\p{script=%s}%s];" - // U+0027 (single quote/apostrophe) is not in MidNumLet any more - // in UAX 29 rev 21 or later. For our purpose, U+0027 - // has to be treated as MidNumLet. ( http://crbug.com/364072 ) - "$MidNumLet = [\\p{Word_Break = MidNumLet} \\u0027];" - "$MidLetter = [\\p{Word_Break = MidLetter}%s];" - "$MidNum = [\\p{Word_Break = MidNum}];" - "$Numeric = [\\p{Word_Break = Numeric}];" - "$ExtendNumLet = [\\p{Word_Break = ExtendNumLet}];" - - "$Control = [\\p{Grapheme_Cluster_Break = Control}]; " - "%s" // ALetterPlus - - "$KatakanaEx = $Katakana ($Extend | $Format)*;" - "$ALetterEx = $ALetterPlus ($Extend | $Format)*;" - "$MidNumLetEx = $MidNumLet ($Extend | $Format)*;" - "$MidLetterEx = $MidLetter ($Extend | $Format)*;" - "$MidNumEx = $MidNum ($Extend | $Format)*;" - "$NumericEx = $Numeric ($Extend | $Format)*;" - "$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;" - - "$Hiragana = [\\p{script=Hiragana}];" - "$Ideographic = [\\p{Ideographic}];" - "$HiraganaEx = $Hiragana ($Extend | $Format)*;" - "$IdeographicEx = $Ideographic ($Extend | $Format)*;" - - "!!forward;" - "$CR $LF;" - "[^$CR $LF $Newline]? ($Extend | $Format)+;" - "$ALetterEx {200};" - "$ALetterEx $ALetterEx {200};" - "%s" // (Allow|Disallow) Contraction - - "!!reverse;" - "$BackALetterEx = ($Format | $Extend)* $ALetterPlus;" - "$BackMidNumLetEx = ($Format | $Extend)* $MidNumLet;" - "$BackNumericEx = ($Format | $Extend)* $Numeric;" - "$BackMidNumEx = ($Format | $Extend)* $MidNum;" - "$BackMidLetterEx = ($Format | $Extend)* $MidLetter;" - "$BackKatakanaEx = ($Format | $Extend)* $Katakana;" - "$BackExtendNumLetEx= ($Format | $Extend)* $ExtendNumLet;" - "$LF $CR;" - "($Format | $Extend)* [^$CR $LF $Newline]?;" - "$BackALetterEx $BackALetterEx;" - "$BackALetterEx ($BackMidLetterEx | $BackMidNumLetEx) $BackALetterEx;" - "$BackNumericEx $BackNumericEx;" - "$BackNumericEx $BackALetterEx;" - "$BackALetterEx $BackNumericEx;" - "$BackNumericEx ($BackMidNumEx | $BackMidNumLetEx) $BackNumericEx;" - "$BackKatakanaEx $BackKatakanaEx;" - "$BackExtendNumLetEx ($BackALetterEx | $BackNumericEx |" - " $BackKatakanaEx | $BackExtendNumLetEx);" - "($BackALetterEx | $BackNumericEx | $BackKatakanaEx)" - " $BackExtendNumLetEx;" - - "!!safe_reverse;" - "($Extend | $Format)+ .?;" - "($MidLetter | $MidNumLet) $BackALetterEx;" - "($MidNum | $MidNumLet) $BackNumericEx;" - - "!!safe_forward;" - "($Extend | $Format)+ .?;" - "($MidLetterEx | $MidNumLetEx) $ALetterEx;" - "($MidNumEx | $MidNumLetEx) $NumericEx;"; - - // Retrieve the script codes used by the given language from ICU. When the - // given language consists of two or more scripts, we just use the first - // script. The size of returned script codes is always < 8. Therefore, we use - // an array of size 8 so we can include all script codes without insufficient - // buffer errors. - UErrorCode error = U_ZERO_ERROR; - UScriptCode script_code[8]; - int scripts = uscript_getCode(language.c_str(), script_code, - arraysize(script_code), &error); - if (U_SUCCESS(error) && scripts >= 1) - script_code_ = script_code[0]; - - // Retrieve the values for $ALetter and $ALetterPlus. We use the dictionary - // only for the languages which need it (i.e. Korean and Thai) to prevent ICU - // from returning dictionary words (i.e. Korean or Thai words) for languages - // which don't need them. - const char* aletter = uscript_getName(script_code_); - if (!aletter) - aletter = "Latin"; - - const char kWithDictionary[] = - "$dictionary = [:LineBreak = Complex_Context:];" - "$ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]];"; - const char kWithoutDictionary[] = "$ALetterPlus = $ALetter;"; - const char* aletter_plus = kWithoutDictionary; - if (script_code_ == USCRIPT_HANGUL || script_code_ == USCRIPT_THAI || - script_code_ == USCRIPT_LAO || script_code_ == USCRIPT_KHMER) - aletter_plus = kWithDictionary; - - // Treat numbers as word characters except for Arabic and Hebrew. - const char* aletter_extra = " [0123456789]"; - if (script_code_ == USCRIPT_HEBREW) - aletter_extra = ""; - else if (script_code_ == USCRIPT_ARABIC) - // When "script=Arabic", it does not include tatweel, which is - // "script=Common" so add it back. Otherwise, it creates unwanted - // word breaks. - aletter_extra = " [\\u0640]"; - - const char kMidLetterExtra[] = ""; - // For Hebrew, treat single/double quoation marks as MidLetter. - const char kMidLetterExtraHebrew[] = "\"'"; - const char* midletter_extra = kMidLetterExtra; - if (script_code_ == USCRIPT_HEBREW) - midletter_extra = kMidLetterExtraHebrew; - - // Create two custom rule-sets: one allows contraction and the other does not. - // We save these strings in UTF-16 so we can use it without conversions. (ICU - // needs UTF-16 strings.) - const char kAllowContraction[] = - "$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};"; - const char kDisallowContraction[] = ""; - - ruleset_allow_contraction_ = base::ASCIIToUTF16( - base::StringPrintf(kRuleTemplate, aletter, aletter_extra, midletter_extra, - aletter_plus, kAllowContraction)); - ruleset_disallow_contraction_ = base::ASCIIToUTF16( - base::StringPrintf(kRuleTemplate, aletter, aletter_extra, midletter_extra, - aletter_plus, kDisallowContraction)); -} - -bool SpellcheckCharAttribute::OutputChar(UChar c, - base::string16* output) const { - // Call the language-specific function if necessary. - // Otherwise, we call the default one. - switch (script_code_) { - case USCRIPT_ARABIC: - return OutputArabic(c, output); - - case USCRIPT_HANGUL: - return OutputHangul(c, output); - - case USCRIPT_HEBREW: - return OutputHebrew(c, output); - - default: - return OutputDefault(c, output); - } -} - -bool SpellcheckCharAttribute::OutputArabic(UChar c, - base::string16* output) const { - // Include non-Arabic characters (which should trigger a spelling error) - // and Arabic characters excluding vowel marks and class "Lm". - // We filter the latter because, while they are "letters", they are - // optional and so don't affect the correctness of the rest of the word. - if (!(0x0600 <= c && c <= 0x06FF) || (u_isalpha(c) && c != 0x0640)) - output->push_back(c); - return true; -} - -bool SpellcheckCharAttribute::OutputHangul(UChar c, - base::string16* output) const { - // Decompose a Hangul character to a Hangul vowel and consonants used by our - // spellchecker. A Hangul character of Unicode is a ligature consisting of a - // Hangul vowel and consonants, e.g. U+AC01 "Gag" consists of U+1100 "G", - // U+1161 "a", and U+11A8 "g". That is, we can treat each Hangul character as - // a point of a cubic linear space consisting of (first consonant, vowel, last - // consonant). Therefore, we can compose a Hangul character from a vowel and - // two consonants with linear composition: - // character = 0xAC00 + - // (first consonant - 0x1100) * 28 * 21 + - // (vowel - 0x1161) * 28 + - // (last consonant - 0x11A7); - // We can also decompose a Hangul character with linear decomposition: - // first consonant = (character - 0xAC00) / 28 / 21; - // vowel = (character - 0xAC00) / 28 % 21; - // last consonant = (character - 0xAC00) % 28; - // This code is copied from Unicode Standard Annex #15 - // and added some comments. - const int kSBase = 0xAC00; // U+AC00: the top of Hangul characters. - const int kLBase = 0x1100; // U+1100: the top of Hangul first consonants. - const int kVBase = 0x1161; // U+1161: the top of Hangul vowels. - const int kTBase = 0x11A7; // U+11A7: the top of Hangul last consonants. - const int kLCount = 19; // The number of Hangul first consonants. - const int kVCount = 21; // The number of Hangul vowels. - const int kTCount = 28; // The number of Hangul last consonants. - const int kNCount = kVCount * kTCount; - const int kSCount = kLCount * kNCount; - - int index = c - kSBase; - if (index < 0 || index >= kSBase + kSCount) { - // This is not a Hangul syllable. Call the default output function since we - // should output this character when it is a Hangul syllable. - return OutputDefault(c, output); - } - - // This is a Hangul character. Decompose this characters into Hangul vowels - // and consonants. - int l = kLBase + index / kNCount; - int v = kVBase + (index % kNCount) / kTCount; - int t = kTBase + index % kTCount; - output->push_back(l); - output->push_back(v); - if (t != kTBase) - output->push_back(t); - return true; -} - -bool SpellcheckCharAttribute::OutputHebrew(UChar c, - base::string16* output) const { - // Discard characters except Hebrew alphabets. We also discard Hebrew niqquds - // to prevent our Hebrew dictionary from marking a Hebrew word including - // niqquds as misspelled. (Same as Arabic vowel marks, we need to check - // niqquds manually and filter them out since their script codes are - // USCRIPT_HEBREW.) - // Pass through ASCII single/double quotation marks and Hebrew Geresh and - // Gershayim. - if ((0x05D0 <= c && c <= 0x05EA) || c == 0x22 || c == 0x27 || c == 0x05F4 || - c == 0x05F3) - output->push_back(c); - return true; -} - -bool SpellcheckCharAttribute::OutputDefault(UChar c, - base::string16* output) const { - // Check the script code of this character and output only if it is the one - // used by the spellchecker language. - UErrorCode status = U_ZERO_ERROR; - UScriptCode script_code = uscript_getScript(c, &status); - if (script_code == script_code_ || script_code == USCRIPT_COMMON) - output->push_back(c); - return true; -} - -// SpellcheckWordIterator implementation: - -SpellcheckWordIterator::SpellcheckWordIterator() - : text_(nullptr), attribute_(nullptr), iterator_() {} - -SpellcheckWordIterator::~SpellcheckWordIterator() { - Reset(); -} - -bool SpellcheckWordIterator::Initialize( - const SpellcheckCharAttribute* attribute, - bool allow_contraction) { - // Create a custom ICU break iterator with empty text used in this object. (We - // allow setting text later so we can re-use this iterator.) - DCHECK(attribute); - const base::string16 rule(attribute->GetRuleSet(allow_contraction)); - - // If there is no rule set, the attributes were invalid. - if (rule.empty()) - return false; - - std::unique_ptr iterator( - new base::i18n::BreakIterator(base::string16(), rule)); - if (!iterator->Init()) { - // Since we're not passing in any text, the only reason this could fail - // is if we fail to parse the rules. Since the rules are hardcoded, - // that would be a bug in this class. - NOTREACHED() << "failed to open iterator (broken rules)"; - return false; - } - iterator_ = std::move(iterator); - - // Set the character attributes so we can normalize the words extracted by - // this iterator. - attribute_ = attribute; - return true; -} - -bool SpellcheckWordIterator::IsInitialized() const { - // Return true iff we have an iterator. - return !!iterator_; -} - -bool SpellcheckWordIterator::SetText(const base::char16* text, size_t length) { - DCHECK(!!iterator_); - - // Set the text to be split by this iterator. - if (!iterator_->SetText(text, length)) { - LOG(ERROR) << "failed to set text"; - return false; - } - - text_ = text; - return true; -} - -SpellcheckWordIterator::WordIteratorStatus SpellcheckWordIterator::GetNextWord( - base::string16* word_string, - int* word_start, - int* word_length) { - DCHECK(!!text_); - - word_string->clear(); - *word_start = 0; - *word_length = 0; - - if (!text_) { - return IS_END_OF_TEXT; - } - - // Find a word that can be checked for spelling or a character that can be - // skipped over. Rather than moving past a skippable character this returns - // IS_SKIPPABLE and defers handling the character to the calling function. - while (iterator_->Advance()) { - const size_t start = iterator_->prev(); - const size_t length = iterator_->pos() - start; - switch (iterator_->GetWordBreakStatus()) { - case base::i18n::BreakIterator::IS_WORD_BREAK: { - if (Normalize(start, length, word_string)) { - *word_start = start; - *word_length = length; - return IS_WORD; - } - break; - } - case base::i18n::BreakIterator::IS_SKIPPABLE_WORD: { - *word_string = iterator_->GetString(); - *word_start = start; - *word_length = length; - return IS_SKIPPABLE; - } - // |iterator_| is RULE_BASED so the break status should never be - // IS_LINE_OR_CHAR_BREAK. - case base::i18n::BreakIterator::IS_LINE_OR_CHAR_BREAK: { - NOTREACHED(); - break; - } - } - } - - // There aren't any more words in the given text. - return IS_END_OF_TEXT; -} - -void SpellcheckWordIterator::Reset() { - iterator_.reset(); -} - -bool SpellcheckWordIterator::Normalize(int input_start, - int input_length, - base::string16* output_string) const { - // We use NFKC (Normalization Form, Compatible decomposition, followed by - // canonical Composition) defined in Unicode Standard Annex #15 to normalize - // this token because it it the most suitable normalization algorithm for our - // spellchecker. Nevertheless, it is not a perfect algorithm for our - // spellchecker and we need manual normalization as well. The normalized - // text does not have to be NUL-terminated since its characters are copied to - // string16, which adds a NUL character when we need. - icu::UnicodeString input(FALSE, &text_[input_start], input_length); - UErrorCode status = U_ZERO_ERROR; - icu::UnicodeString output; - icu::Normalizer::normalize(input, UNORM_NFKC, 0, output, status); - if (status != U_ZERO_ERROR && status != U_STRING_NOT_TERMINATED_WARNING) - return false; - - // Copy the normalized text to the output. - icu::StringCharacterIterator it(output); - for (UChar c = it.first(); c != icu::CharacterIterator::DONE; c = it.next()) - attribute_->OutputChar(c, output_string); - - return !output_string->empty(); -} diff --git a/chromium_src/chrome/renderer/spellchecker/spellcheck_worditerator.h b/chromium_src/chrome/renderer/spellchecker/spellcheck_worditerator.h deleted file mode 100644 index 966137a324..0000000000 --- a/chromium_src/chrome/renderer/spellchecker/spellcheck_worditerator.h +++ /dev/null @@ -1,199 +0,0 @@ -// Copyright (c) 2011 The Chromium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -// Defines an iterator class that enumerates words supported by our spellchecker -// from multi-language text. This class is used for filtering out characters -// not supported by our spellchecker. - -#ifndef CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_ -#define CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_ - -#include - -#include -#include - -#include "base/macros.h" -#include "base/strings/string16.h" -#include "third_party/icu/source/common/unicode/uscript.h" - -namespace base { -namespace i18n { -class BreakIterator; -} // namespace i18n -} // namespace base - -// A class which encapsulates language-specific operations used by -// SpellcheckWordIterator. When we set the spellchecker language, this class -// creates rule sets that filter out the characters not supported by the -// spellchecker. (Please read the comment in the SpellcheckWordIterator class -// about how to use this class.) -class SpellcheckCharAttribute { - public: - SpellcheckCharAttribute(); - ~SpellcheckCharAttribute(); - - // Sets the language of the spellchecker. When this function is called with an - // ISO language code, this function creates the custom rule-sets used by - // the ICU break iterator so it can extract only words used by the language. - // GetRuleSet() returns the rule-sets created in this function. - void SetDefaultLanguage(const std::string& language); - - // Returns a custom rule-set string used by the ICU break iterator. This class - // has two rule-sets, one splits a contraction and the other does not, so we - // can split a concaticated word (e.g. "seven-year-old") into words (e.g. - // "seven", "year", and "old") and check their spellings. The result stirng is - // encoded in UTF-16 since ICU needs UTF-16 strings. - base::string16 GetRuleSet(bool allow_contraction) const; - - // Outputs a character only if it is a word character. (Please read the - // comments in CreateRuleSets() why we need this function.) - bool OutputChar(UChar c, base::string16* output) const; - - private: - // Creates the rule-sets that return words possibly used by the given - // language. Unfortunately, these rule-sets are not perfect and have some - // false-positives. For example, they return combined accent marks even though - // we need English words only. We call OutputCharacter() to filter out such - // false-positive characters. - void CreateRuleSets(const std::string& language); - - // Outputs a character only if it is one used by the given language. These - // functions are called from OutputChar(). - bool OutputArabic(UChar c, base::string16* output) const; - bool OutputHangul(UChar c, base::string16* output) const; - bool OutputHebrew(UChar c, base::string16* output) const; - bool OutputDefault(UChar c, base::string16* output) const; - - // The custom rule-set strings used by ICU break iterator. Since it is not so - // easy to create custom rule-sets from an ISO language code, this class - // saves these rule-set strings created when we set the language. - base::string16 ruleset_allow_contraction_; - base::string16 ruleset_disallow_contraction_; - - // The script code used by this language. - UScriptCode script_code_; - - DISALLOW_COPY_AND_ASSIGN(SpellcheckCharAttribute); -}; - -// A class which extracts words that can be checked for spelling from a -// multi-language string. The ICU word-break iterator does not discard some -// punctuation characters attached to a word. For example, when we set a word -// "_hello_" to a word-break iterator, it just returns "_hello_". Neither does -// it discard characters not used by the language. For example, it returns -// Russian words even though we need English words only. To extract only the -// words that our spellchecker can check their spellings, this class uses custom -// rule-sets created by the SpellcheckCharAttribute class. Also, this class -// normalizes extracted words so our spellchecker can check the spellings of -// words that include ligatures, combined characters, full-width characters, -// etc. This class uses UTF-16 strings as its input and output strings since -// UTF-16 is the native encoding of ICU and avoid unnecessary conversions -// when changing the encoding of this string for our spellchecker. (Chrome can -// use two or more spellcheckers and we cannot assume their encodings.) -// The following snippet is an example that extracts words with this class. -// -// // Creates the language-specific attributes for US English. -// SpellcheckCharAttribute attribute; -// attribute.SetDefaultLanguage("en-US"); -// -// // Set up a SpellcheckWordIterator object which extracts English words, -// // and retrieve them. -// SpellcheckWordIterator iterator; -// base::string16 text(base::UTF8ToUTF16("this is a test.")); -// iterator.Initialize(&attribute, true); -// iterator.SetText(text.c_str(), text_.length()); -// -// base::string16 word; -// int offset; -// int length; -// while (iterator.GetNextWord(&word, &offset, &length)) { -// ... -// } -// -class SpellcheckWordIterator { - public: - enum WordIteratorStatus { - // The end of a sequence of text that the iterator recognizes as characters - // that can form a word. - IS_WORD, - // Non-word characters that the iterator can skip past, such as punctuation, - // whitespace, and characters from another character set. - IS_SKIPPABLE, - // The end of the text that the iterator is going over. - IS_END_OF_TEXT - }; - - SpellcheckWordIterator(); - ~SpellcheckWordIterator(); - - // Initializes a word-iterator object with the language-specific attribute. If - // we need to split contractions and concatenated words, call this function - // with its 'allow_contraction' parameter false. (This function uses lots of - // temporal memory to compile a custom word-break rule into an automaton.) - bool Initialize(const SpellcheckCharAttribute* attribute, - bool allow_contraction); - - // Returns whether this word iterator is initialized. - bool IsInitialized() const; - - // Set text to be iterated. (This text does not have to be NULL-terminated.) - // This function also resets internal state so we can reuse this iterator - // without calling Initialize(). - bool SetText(const base::char16* text, size_t length); - - // Advances |iterator_| through |text_| and gets the current status of the - // word iterator within |text|: - // - // - Returns IS_WORD if the iterator just found the end of a sequence of word - // characters and it was able to normalize the sequence. This stores the - // normalized string into |word_string| and stores the position and length - // into |word_start| and |word_length| respectively. Keep in mind that - // since this function normalizes the output word, the length of - // |word_string| may be different from the |word_length|. Therefore, when - // we call functions that change the input text, such as - // string16::replace(), we need to use |word_start| and |word_length| as - // listed in the following snippet: - // - // while(iterator.GetNextWord(&word, &offset, &length)) - // text.replace(offset, length, word); - // - // - Returns IS_SKIPPABLE if the iterator just found a character that the - // iterator can skip past such as punctuation, whitespace, and characters - // from another character set. This stores the character, position, and - // length into |word_string|, |word_start|, and |word_length| respectively. - // - // - Returns IS_END_OF_TEXT if the iterator has reached the end of |text_|. - SpellcheckWordIterator::WordIteratorStatus - GetNextWord(base::string16* word_string, int* word_start, int* word_length); - - // Releases all the resources attached to this object. - void Reset(); - - private: - // Normalizes a non-terminated string returned from an ICU word-break - // iterator. A word returned from an ICU break iterator may include characters - // not supported by our spellchecker, e.g. ligatures, combining/ characters, - // full-width letters, etc. This function replaces such characters with - // alternative characters supported by our spellchecker. This function also - // calls SpellcheckWordIterator::OutputChar() to filter out false-positive - // characters. - bool Normalize(int input_start, - int input_length, - base::string16* output_string) const; - - // The pointer to the input string from which we are extracting words. - const base::char16* text_; - - // The language-specific attributes used for filtering out non-word - // characters. - const SpellcheckCharAttribute* attribute_; - - // The break iterator. - std::unique_ptr iterator_; - - DISALLOW_COPY_AND_ASSIGN(SpellcheckWordIterator); -}; - -#endif // CHROME_RENDERER_SPELLCHECKER_SPELLCHECK_WORDITERATOR_H_ diff --git a/filenames.gni b/filenames.gni index 922bf283dc..39597c6679 100644 --- a/filenames.gni +++ b/filenames.gni @@ -664,8 +664,6 @@ filenames = { "chromium_src/chrome/browser/process_singleton.h", "chromium_src/chrome/browser/ui/views/frame/global_menu_bar_registrar_x11.cc", "chromium_src/chrome/browser/ui/views/frame/global_menu_bar_registrar_x11.h", - "chromium_src/chrome/renderer/spellchecker/spellcheck_worditerator.cc", - "chromium_src/chrome/renderer/spellchecker/spellcheck_worditerator.h", ] lib_sources_nss = [