diff --git a/build/mac/build_scripts/MozillaBuildList.pm b/build/mac/build_scripts/MozillaBuildList.pm index 9fe16a4865f..edc434e484e 100644 --- a/build/mac/build_scripts/MozillaBuildList.pm +++ b/build/mac/build_scripts/MozillaBuildList.pm @@ -815,6 +815,7 @@ sub BuildClientDist() #LWBRK InstallFromManifest(":mozilla:intl:lwbrk:public:MANIFEST", "$distdirectory:lwbrk:"); + InstallFromManifest(":mozilla:intl:lwbrk:idl:MANIFEST_IDL", "$distdirectory:idl:"); #STRRES InstallFromManifest(":mozilla:intl:strres:public:MANIFEST_IDL", "$distdirectory:idl:"); @@ -1507,6 +1508,7 @@ sub BuildIDLProjects() BuildIDLProject(":mozilla:intl:unicharutil:macbuild:unicharutilIDL.xml", "unicharutil"); BuildIDLProject(":mozilla:intl:uconv:macbuild:uconvIDL.xml", "uconv"); BuildIDLProject(":mozilla:intl:chardet:macbuild:chardetIDL.xml", "chardet"); + BuildIDLProject(":mozilla:intl:lwbrk:macbuild:lwbrkIDL.xml", "lwbrk"); if ($main::options{iiextras}) { diff --git a/intl/build/nsI18nModule.cpp b/intl/build/nsI18nModule.cpp index 37f8a306883..b150016e693 100644 --- a/intl/build/nsI18nModule.cpp +++ b/intl/build/nsI18nModule.cpp @@ -46,6 +46,7 @@ // lwbrk #include "nsLWBrkConstructors.h" +#include "nsSemanticUnitScanner.h" // unicharutil #include "nsUcharUtilConstructors.h" @@ -56,6 +57,9 @@ // locale #include "nsLocaleConstructors.h" + +NS_GENERIC_FACTORY_CONSTRUCTOR(nsSemanticUnitScanner); + static NS_METHOD AddCategoryEntry(const char* category, const char* key, @@ -290,6 +294,8 @@ static nsModuleComponentInfo components[] = // lwbrk { "Line and Word Breaker", NS_LWBRK_CID, NS_LWBRK_CONTRACTID, nsLWBreakerFImpConstructor}, + { "Semantic Unit Scanner", NS_SEMANTICUNITSCANNER_CID, + NS_SEMANTICUNITSCANNER_CONTRACTID, nsSemanticUnitScannerConstructor}, // unicharutil { "Unichar Utility", NS_UNICHARUTIL_CID, diff --git a/intl/lwbrk/Makefile.in b/intl/lwbrk/Makefile.in index a9453f945df..42eae090c11 100644 --- a/intl/lwbrk/Makefile.in +++ b/intl/lwbrk/Makefile.in @@ -26,7 +26,7 @@ VPATH = 
@srcdir@ include $(DEPTH)/config/autoconf.mk -DIRS = public src +DIRS = idl public src ifdef ENABLE_TESTS DIRS += tests diff --git a/intl/lwbrk/idl/MANIFEST_IDL b/intl/lwbrk/idl/MANIFEST_IDL new file mode 100644 index 00000000000..e69de29bb2d diff --git a/intl/lwbrk/idl/Makefile.in b/intl/lwbrk/idl/Makefile.in new file mode 100644 index 00000000000..21d68fda8d1 --- /dev/null +++ b/intl/lwbrk/idl/Makefile.in @@ -0,0 +1,35 @@ +# +# The contents of this file are subject to the Netscape Public +# License Version 1.1 (the "License"); you may not use this file +# except in compliance with the License. You may obtain a copy of +# the License at http://www.mozilla.org/NPL/ +# +# Software distributed under the License is distributed on an "AS +# IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or +# implied. See the License for the specific language governing +# rights and limitations under the License. +# +# The Original Code is mozilla.org code. +# +# The Initial Developer of the Original Code is Netscape +# Communications Corporation. Portions created by Netscape are +# Copyright (C) 1998 Netscape Communications Corporation. All +# Rights Reserved. +# +# Contributor(s): +# + +DEPTH = ../../.. +topsrcdir = @top_srcdir@ +srcdir = @srcdir@ +VPATH = @srcdir@ + +include $(DEPTH)/config/autoconf.mk + +MODULE = lwbrk + +XPIDLSRCS = \ + nsISemanticUnitScanner.idl \ + $(NULL) + +include $(topsrcdir)/config/rules.mk diff --git a/intl/lwbrk/idl/nsISemanticUnitScanner.idl b/intl/lwbrk/idl/nsISemanticUnitScanner.idl new file mode 100644 index 00000000000..b7113e733e9 --- /dev/null +++ b/intl/lwbrk/idl/nsISemanticUnitScanner.idl @@ -0,0 +1,82 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: NPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Netscape Public License + * Version 1.1 (the "License"); you may not use this file except in + * compliance with the License. 
You may obtain a copy of the License at + * http://www.mozilla.org/NPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is mozilla.org code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * Frank Yung-Fong Tang + * + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the NPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the NPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ + +#include "nsISupports.idl" + +%{C++ +// {ADF42751-1CEF-4ad2-AA8E-BCB849D8D31F} +#define NS_SEMANTICUNITSCANNER_CID { 0xadf42751, 0x1cef, 0x4ad2, { 0xaa, 0x8e, 0xbc, 0xb8, 0x49, 0xd8, 0xd3, 0x1f}} +#define NS_SEMANTICUNITSCANNER_CONTRACTID "@mozilla.org/intl/semanticunitscanner;1" +%} + +/** + * Provides a language independent way to break UNICODE + * text into meaningful semantic units (e.g. words). 
+ */ +[scriptable, uuid(9f620be4-e535-11d6-b254-00039310a47a)] +interface nsISemanticUnitScanner : nsISupports { + /** + * start() + * + * Starts up the semantic unit scanner with an optional + * character set, which acts as a hint to optimize the heuristics + * used to determine the language(s) of the processed text. + * + * @param characterSet the character set the text was originally + * encoded in (can be NULL) + */ + void start(in string characterSet); + + /** + * next() + * Get the begin / end offset of the next unit in the current text + * + * @param text the text to be scanned + * @param length the number of characters in the text to be processed + * @param pos the offset in the text at which scanning starts + * @param isLastBuffer whether this buffer is the last one + * @param begin the begin offset of the next unit + * @param end the end offset of the next unit + * @return true if a unit was found, false if the current text has no further unit + */ + boolean next(in wstring text, in long length, in long pos, + in boolean isLastBuffer, + out long begin, out long end ); + +}; diff --git a/intl/lwbrk/macbuild/lwbrkIDL.xml b/intl/lwbrk/macbuild/lwbrkIDL.xml new file mode 100644 index 00000000000..e69de29bb2d diff --git a/intl/lwbrk/src/Makefile.in b/intl/lwbrk/src/Makefile.in index a76175cccb0..3d7db7b5fab 100644 --- a/intl/lwbrk/src/Makefile.in +++ b/intl/lwbrk/src/Makefile.in @@ -42,6 +42,7 @@ CPPSRCS = \ nsJISx4501LineBreaker.cpp \ nsLWBreakerFImp.cpp \ nsSampleWordBreaker.cpp \ + nsSemanticUnitScanner.cpp \ $(NULL) include $(topsrcdir)/config/rules.mk diff --git a/intl/lwbrk/src/nsSampleWordBreaker.cpp b/intl/lwbrk/src/nsSampleWordBreaker.cpp index a674924a9dd..bfa6d1018e0 100644 --- a/intl/lwbrk/src/nsSampleWordBreaker.cpp +++ b/intl/lwbrk/src/nsSampleWordBreaker.cpp @@ -74,18 +74,6 @@ nsresult nsSampleWordBreaker::BreakInBetween( } -// hack -typedef enum { - kWbClassSpace = 0, - kWbClassAlphaLetter, - kWbClassPunct, - kWbClassHanLetter, - kWbClassKatakanaLetter, - kWbClassHiraganaLetter, - kWbClassHWKatakanaLetter, - 
kWbClassThaiLetter -} wb_class; - #define IS_ASCII(c) (0 == ( 0xFF80 & (c))) #define ASCII_IS_ALPHA(c) ((( 'a' <= (c)) && ((c) <= 'z')) || (( 'A' <= (c)) && ((c) <= 'Z'))) #define ASCII_IS_DIGIT(c) (( '0' <= (c)) && ((c) <= '9')) diff --git a/intl/lwbrk/src/nsSampleWordBreaker.h b/intl/lwbrk/src/nsSampleWordBreaker.h index 4aa51be66c1..ca30c1c072b 100644 --- a/intl/lwbrk/src/nsSampleWordBreaker.h +++ b/intl/lwbrk/src/nsSampleWordBreaker.h @@ -41,6 +41,17 @@ #include "nsIWordBreaker.h" +typedef enum { + kWbClassSpace = 0, + kWbClassAlphaLetter, + kWbClassPunct, + kWbClassHanLetter, + kWbClassKatakanaLetter, + kWbClassHiraganaLetter, + kWbClassHWKatakanaLetter, + kWbClassThaiLetter +} wb_class; /* character classes, declared in this header so that nsSemanticUnitScanner.cpp can compare GetClass() results against them; NOTE(review): the unprefixed names are at file scope for every includer of this header */ + class nsSampleWordBreaker : public nsIWordBreaker { NS_DECL_ISUPPORTS diff --git a/intl/lwbrk/src/nsSemanticUnitScanner.cpp b/intl/lwbrk/src/nsSemanticUnitScanner.cpp new file mode 100644 index 00000000000..cc5d2f00599 --- /dev/null +++ b/intl/lwbrk/src/nsSemanticUnitScanner.cpp @@ -0,0 +1,119 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: NPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Netscape Public License + * Version 1.1 (the "License"); you may not use this file except in + * compliance with the License. You may obtain a copy of the License at + * http://www.mozilla.org/NPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is mozilla.org code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. 
+ * + * Contributor(s): + * + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the NPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the NPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ + +#include "nsSemanticUnitScanner.h" +#include "prmem.h" + +NS_IMPL_ISUPPORTS1(nsSemanticUnitScanner, nsISemanticUnitScanner) + +nsSemanticUnitScanner::nsSemanticUnitScanner() : nsSampleWordBreaker() +{ + NS_INIT_ISUPPORTS(); + /* member initializers and constructor code */ +} + +nsSemanticUnitScanner::~nsSemanticUnitScanner() +{ + /* destructor code */ +} + + +/* void start (in string characterSet); */ +NS_IMETHODIMP nsSemanticUnitScanner::Start(const char *characterSet) +{ + // do nothing for now: the characterSet hint is ignored.
+ return NS_OK; +} + +/* boolean next (in wstring text, in long length, in long pos, in boolean isLastBuffer, out long begin, out long end); */ +NS_IMETHODIMP nsSemanticUnitScanner::Next(const PRUnichar *text, PRInt32 length, PRInt32 pos, PRBool isLastBuffer, PRInt32 *begin, PRInt32 *end, PRBool *_retval) +{ + // xxx need to bulletproof and check the input pointers: begin, end and _retval are dereferenced below without a null check + // NOTE(review): pos is only compared against length, so a negative pos reaches text[pos] + + // if we reach the end, just return (no unit: begin == end == pos) + if (pos >= length) { + *begin = pos; + *end = pos; + *_retval = PR_FALSE; + return NS_OK; + } + + PRUint8 char_class = nsSampleWordBreaker::GetClass(text[pos]); + + // if we are in chinese mode, return one han letter at a time + // we should not do this if we are in Japanese or Korean mode (NOTE(review): no mode is checked yet, so this applies to every Han letter) + if (kWbClassHanLetter == char_class) { + *begin = pos; + *end = pos+1; + *_retval = PR_TRUE; + return NS_OK; + } + + PRUint32 next; + PRBool needMoreText; + // find the next "word" + nsresult res = nsSampleWordBreaker::Next(text, (PRUint32) length, (PRUint32) pos, + &next, &needMoreText); + + NS_ASSERTION(NS_SUCCEEDED(res), "nsSampleWordBreaker::Next failed"); + if(NS_FAILED(res)) + return res; + + // if we don't have enough text to make decision, return (NOTE(review): isLastBuffer is not consulted here; confirm that the final unit of the last buffer is not dropped) + if (needMoreText) { + *begin = pos; + *end = pos; + *_retval = PR_FALSE; + return NS_OK; + } + + // if what we got is space or punct, look at the next break + if ( (char_class == kWbClassSpace) || (char_class == kWbClassPunct) ) { + // if the next "word" is not letters, + // call itself recursively with the new pos (NOTE(review): one nested call per skipped space/punct run; a loop would bound stack use) + return Next(text, length, next, isLastBuffer, begin, end, _retval); + } + + // for the rest, return the unit [pos, next) + *begin = pos; + *end = next; + *_retval = PR_TRUE; + return NS_OK; +} + diff --git a/intl/lwbrk/src/nsSemanticUnitScanner.h b/intl/lwbrk/src/nsSemanticUnitScanner.h new file mode 100644 index 00000000000..2b9cef055ca --- /dev/null +++ b/intl/lwbrk/src/nsSemanticUnitScanner.h @@ -0,0 +1,58 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 
2 -*- */ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: NPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Netscape Public License + * Version 1.1 (the "License"); you may not use this file except in + * compliance with the License. You may obtain a copy of the License at + * http://www.mozilla.org/NPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is mozilla.org code. + * + * The Initial Developer of the Original Code is + * Netscape Communications Corporation. + * Portions created by the Initial Developer are Copyright (C) 1998 + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the NPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the NPL, the GPL or the LGPL. 
+ * + * ***** END LICENSE BLOCK ***** */ + +#ifndef nsSemanticUnitScanner_h__ +#define nsSemanticUnitScanner_h__ + +#include "nsSampleWordBreaker.h" +#include "nsISemanticUnitScanner.h" + +/* nsISemanticUnitScanner implementation that reuses nsSampleWordBreaker's GetClass() and Next(). NOTE(review): nsSampleWordBreaker also carries NS_DECL_ISUPPORTS, while NS_IMPL_ISUPPORTS1 in the .cpp only lists nsISemanticUnitScanner - confirm this double inheritance is intended. */ +class nsSemanticUnitScanner : public nsISemanticUnitScanner + , public nsSampleWordBreaker +{ +public: + NS_DECL_ISUPPORTS + NS_DECL_NSISEMANTICUNITSCANNER + + nsSemanticUnitScanner(); + virtual ~nsSemanticUnitScanner(); + /* additional members */ +}; + +#endif \ No newline at end of file diff --git a/intl/macbuild/i18n.xml b/intl/macbuild/i18n.xml index e720e698410..c9b3b20f7c0 100644 --- a/intl/macbuild/i18n.xml +++ b/intl/macbuild/i18n.xml @@ -1243,6 +1243,13 @@ Text Debug + + Name + nsSemanticUnitScanner.cpp + MacOS + Text + Debug + @@ -1435,6 +1442,11 @@ nsEntityConverter.cpp MacOS + + Name + nsSemanticUnitScanner.cpp + MacOS + @@ -2627,6 +2639,13 @@ Text Debug + + Name + nsSemanticUnitScanner.cpp + MacOS + Text + Debug + @@ -2819,6 +2838,11 @@ nsEntityConverter.cpp MacOS + + Name + nsSemanticUnitScanner.cpp + MacOS + @@ -3034,6 +3058,12 @@ rulebrk.c MacOS + + i18n.shlb + Name + nsSemanticUnitScanner.cpp + MacOS + strres