зеркало из https://github.com/mozilla/pjs.git
bug 176528, need a nsISemanticUnitScanner to support intl spam mail filter
Implement a new interface for this purpose. win&linux, p=ftang, r=shanjian, sr=beard mac build, p=nhotta, r=ftang, sr=beard
This commit is contained in:
Родитель
75b79eb6e0
Коммит
3282f4a651
|
@ -815,6 +815,7 @@ sub BuildClientDist()
|
|||
|
||||
#LWBRK
|
||||
InstallFromManifest(":mozilla:intl:lwbrk:public:MANIFEST", "$distdirectory:lwbrk:");
|
||||
InstallFromManifest(":mozilla:intl:lwbrk:idl:MANIFEST_IDL", "$distdirectory:idl:");
|
||||
|
||||
#STRRES
|
||||
InstallFromManifest(":mozilla:intl:strres:public:MANIFEST_IDL", "$distdirectory:idl:");
|
||||
|
@ -1507,6 +1508,7 @@ sub BuildIDLProjects()
|
|||
BuildIDLProject(":mozilla:intl:unicharutil:macbuild:unicharutilIDL.xml", "unicharutil");
|
||||
BuildIDLProject(":mozilla:intl:uconv:macbuild:uconvIDL.xml", "uconv");
|
||||
BuildIDLProject(":mozilla:intl:chardet:macbuild:chardetIDL.xml", "chardet");
|
||||
BuildIDLProject(":mozilla:intl:lwbrk:macbuild:lwbrkIDL.xml", "lwbrk");
|
||||
|
||||
if ($main::options{iiextras})
|
||||
{
|
||||
|
|
|
@ -46,6 +46,7 @@
|
|||
|
||||
// lwbrk
|
||||
#include "nsLWBrkConstructors.h"
|
||||
#include "nsSemanticUnitScanner.h"
|
||||
|
||||
// unicharutil
|
||||
#include "nsUcharUtilConstructors.h"
|
||||
|
@ -56,6 +57,9 @@
|
|||
// locale
|
||||
#include "nsLocaleConstructors.h"
|
||||
|
||||
|
||||
NS_GENERIC_FACTORY_CONSTRUCTOR(nsSemanticUnitScanner);
|
||||
|
||||
static NS_METHOD
|
||||
AddCategoryEntry(const char* category,
|
||||
const char* key,
|
||||
|
@ -290,6 +294,8 @@ static nsModuleComponentInfo components[] =
|
|||
// lwbrk
|
||||
{ "Line and Word Breaker", NS_LWBRK_CID,
|
||||
NS_LWBRK_CONTRACTID, nsLWBreakerFImpConstructor},
|
||||
{ "Semantic Unit Scanner", NS_SEMANTICUNITSCANNER_CID,
|
||||
NS_SEMANTICUNITSCANNER_CONTRACTID, nsSemanticUnitScannerConstructor},
|
||||
|
||||
// unicharutil
|
||||
{ "Unichar Utility", NS_UNICHARUTIL_CID,
|
||||
|
|
|
@ -26,7 +26,7 @@ VPATH = @srcdir@
|
|||
|
||||
include $(DEPTH)/config/autoconf.mk
|
||||
|
||||
DIRS = public src
|
||||
DIRS = idl public src
|
||||
|
||||
ifdef ENABLE_TESTS
|
||||
DIRS += tests
|
||||
|
|
|
@ -0,0 +1,35 @@
|
|||
#
|
||||
# The contents of this file are subject to the Netscape Public
|
||||
# License Version 1.1 (the "License"); you may not use this file
|
||||
# except in compliance with the License. You may obtain a copy of
|
||||
# the License at http://www.mozilla.org/NPL/
|
||||
#
|
||||
# Software distributed under the License is distributed on an "AS
|
||||
# IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
|
||||
# implied. See the License for the specific language governing
|
||||
# rights and limitations under the License.
|
||||
#
|
||||
# The Original Code is mozilla.org code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is Netscape
|
||||
# Communications Corporation. Portions created by Netscape are
|
||||
# Copyright (C) 1998 Netscape Communications Corporation. All
|
||||
# Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
#
|
||||
|
||||
DEPTH = ../../..
|
||||
topsrcdir = @top_srcdir@
|
||||
srcdir = @srcdir@
|
||||
VPATH = @srcdir@
|
||||
|
||||
include $(DEPTH)/config/autoconf.mk
|
||||
|
||||
MODULE = lwbrk
|
||||
|
||||
XPIDLSRCS = \
|
||||
nsISemanticUnitScanner.idl \
|
||||
$(NULL)
|
||||
|
||||
include $(topsrcdir)/config/rules.mk
|
|
@ -0,0 +1,82 @@
|
|||
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
|
||||
/* ***** BEGIN LICENSE BLOCK *****
|
||||
* Version: NPL 1.1/GPL 2.0/LGPL 2.1
|
||||
*
|
||||
* The contents of this file are subject to the Netscape Public License
|
||||
* Version 1.1 (the "License"); you may not use this file except in
|
||||
* compliance with the License. You may obtain a copy of the License at
|
||||
* http://www.mozilla.org/NPL/
|
||||
*
|
||||
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
* for the specific language governing rights and limitations under the
|
||||
* License.
|
||||
*
|
||||
* The Original Code is mozilla.org code.
|
||||
*
|
||||
* The Initial Developer of the Original Code is
|
||||
* Netscape Communications Corporation.
|
||||
* Portions created by the Initial Developer are Copyright (C) 1998
|
||||
* the Initial Developer. All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
* Frank Yung-Fong Tang <ftang@netscape.com>
|
||||
*
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
* of those above. If you wish to allow use of your version of this file only
|
||||
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
* use your version of this file under the terms of the NPL, indicate your
|
||||
* decision by deleting the provisions above and replace them with the notice
|
||||
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
* the provisions above, a recipient may use your version of this file under
|
||||
* the terms of any one of the NPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
#include "nsISupports.idl"
|
||||
|
||||
%{C++
|
||||
// {ADF42751-1CEF-4ad2-AA8E-BCB849D8D31F}
|
||||
#define NS_SEMANTICUNITSCANNER_CID { 0xadf42751, 0x1cef, 0x4ad2, { 0xaa, 0x8e, 0xbc, 0xb8, 0x49, 0xd8, 0xd3, 0x1f}}
|
||||
#define NS_SEMANTICUNITSCANNER_CONTRACTID "@mozilla.org/intl/semanticunitscanner;1"
|
||||
%}
|
||||
|
||||
/**
|
||||
* Provides a language independent way to break UNICODE
|
||||
* text into meaningful semantic units (e.g. words).
|
||||
*/
|
||||
[scriptable, uuid(9f620be4-e535-11d6-b254-00039310a47a)]
|
||||
interface nsISemanticUnitScanner : nsISupports {
|
||||
/**
|
||||
* start()
|
||||
*
|
||||
* Starts up the semantic unit scanner with an optional
|
||||
* character set, which acts as a hint to optimize the heuristics
|
||||
* used to determine the language(s) of the processed text.
|
||||
*
|
||||
* @param characterSet the character set the text was originally
|
||||
* encoded in (can be NULL)
|
||||
*/
|
||||
void start(in string characterSet);
|
||||
|
||||
/**
|
||||
* next()
|
||||
* Get the begin / end offset of the next unit in the current text
|
||||
*
|
||||
* @param text the text to be scanned
|
||||
* @param length the number of characters in the text to be processed
|
||||
* @param pos the current position
|
||||
* @param isLastBuffer true if this buffer is the last one
|
||||
* @param begin the begin offset of the next unit
|
||||
* @param end the end offset of the next unit
|
||||
* @return whether there are more units in the current text
|
||||
*/
|
||||
boolean next(in wstring text, in long length, in long pos,
|
||||
in boolean isLastBuffer,
|
||||
out long begin, out long end );
|
||||
|
||||
};
|
|
@ -42,6 +42,7 @@ CPPSRCS = \
|
|||
nsJISx4501LineBreaker.cpp \
|
||||
nsLWBreakerFImp.cpp \
|
||||
nsSampleWordBreaker.cpp \
|
||||
nsSemanticUnitScanner.cpp \
|
||||
$(NULL)
|
||||
|
||||
include $(topsrcdir)/config/rules.mk
|
||||
|
|
|
@ -74,18 +74,6 @@ nsresult nsSampleWordBreaker::BreakInBetween(
|
|||
}
|
||||
|
||||
|
||||
// hack
|
||||
typedef enum {
|
||||
kWbClassSpace = 0,
|
||||
kWbClassAlphaLetter,
|
||||
kWbClassPunct,
|
||||
kWbClassHanLetter,
|
||||
kWbClassKatakanaLetter,
|
||||
kWbClassHiraganaLetter,
|
||||
kWbClassHWKatakanaLetter,
|
||||
kWbClassThaiLetter
|
||||
} wb_class;
|
||||
|
||||
#define IS_ASCII(c) (0 == ( 0xFF80 & (c)))
|
||||
#define ASCII_IS_ALPHA(c) ((( 'a' <= (c)) && ((c) <= 'z')) || (( 'A' <= (c)) && ((c) <= 'Z')))
|
||||
#define ASCII_IS_DIGIT(c) (( '0' <= (c)) && ((c) <= '9'))
|
||||
|
|
|
@ -41,6 +41,17 @@
|
|||
|
||||
#include "nsIWordBreaker.h"
|
||||
|
||||
/*
 * Character classes used for word breaking. Callers compare the value
 * returned by nsSampleWordBreaker::GetClass() against these constants.
 * NOTE(review): "HW" presumably stands for half-width -- confirm.
 */
typedef enum {
|
||||
kWbClassSpace = 0,
|
||||
kWbClassAlphaLetter,
|
||||
kWbClassPunct,
|
||||
kWbClassHanLetter,
|
||||
kWbClassKatakanaLetter,
|
||||
kWbClassHiraganaLetter,
|
||||
kWbClassHWKatakanaLetter,
|
||||
kWbClassThaiLetter
|
||||
} wb_class;
|
||||
|
||||
class nsSampleWordBreaker : public nsIWordBreaker
|
||||
{
|
||||
NS_DECL_ISUPPORTS
|
||||
|
|
|
@ -0,0 +1,119 @@
|
|||
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
|
||||
/* ***** BEGIN LICENSE BLOCK *****
|
||||
* Version: NPL 1.1/GPL 2.0/LGPL 2.1
|
||||
*
|
||||
* The contents of this file are subject to the Netscape Public License
|
||||
* Version 1.1 (the "License"); you may not use this file except in
|
||||
* compliance with the License. You may obtain a copy of the License at
|
||||
* http://www.mozilla.org/NPL/
|
||||
*
|
||||
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
* for the specific language governing rights and limitations under the
|
||||
* License.
|
||||
*
|
||||
* The Original Code is mozilla.org code.
|
||||
*
|
||||
* The Initial Developer of the Original Code is
|
||||
* Netscape Communications Corporation.
|
||||
* Portions created by the Initial Developer are Copyright (C) 1998
|
||||
* the Initial Developer. All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
*
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
* of those above. If you wish to allow use of your version of this file only
|
||||
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
* use your version of this file under the terms of the NPL, indicate your
|
||||
* decision by deleting the provisions above and replace them with the notice
|
||||
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
* the provisions above, a recipient may use your version of this file under
|
||||
* the terms of any one of the NPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
#include "nsSemanticUnitScanner.h"
|
||||
#include "prmem.h"
|
||||
|
||||
NS_IMPL_ISUPPORTS1(nsSemanticUnitScanner, nsISemanticUnitScanner)
|
||||
|
||||
// Default constructor: runs the nsSampleWordBreaker base constructor and
// invokes NS_INIT_ISUPPORTS(); the class has no other state to set up.
nsSemanticUnitScanner::nsSemanticUnitScanner() : nsSampleWordBreaker()
|
||||
{
|
||||
NS_INIT_ISUPPORTS();
|
||||
/* no additional members to initialize */
|
||||
}
|
||||
|
||||
// Destructor: intentionally empty, there is nothing to release.
nsSemanticUnitScanner::~nsSemanticUnitScanner()
|
||||
{
|
||||
/* nothing to clean up */
|
||||
}
|
||||
|
||||
|
||||
/* void start (in string characterSet); */
|
||||
// Implements nsISemanticUnitScanner::start(). The characterSet hint is
// currently ignored and the call always succeeds.
NS_IMETHODIMP nsSemanticUnitScanner::Start(const char *characterSet)
|
||||
{
|
||||
// Nothing to do for now: the character-set hint is not used yet.
|
||||
return NS_OK;
|
||||
}
|
||||
|
||||
/* void next (in wstring text, in long length, in long pos, out boolean hasMoreUnits, out long begin, out long end); */
|
||||
NS_IMETHODIMP nsSemanticUnitScanner::Next(const PRUnichar *text, PRInt32 length, PRInt32 pos, PRBool isLastBuffer, PRInt32 *begin, PRInt32 *end, PRBool *_retval)
|
||||
{
|
||||
// xxx need to bullet proff and check input pointer
|
||||
// make sure begin, end and _retval is not nsnull here
|
||||
|
||||
// if we reach the end, just return
|
||||
if (pos >= length) {
|
||||
*begin = pos;
|
||||
*end = pos;
|
||||
*_retval = PR_FALSE;
|
||||
return NS_OK;
|
||||
}
|
||||
|
||||
PRUint8 char_class = nsSampleWordBreaker::GetClass(text[pos]);
|
||||
|
||||
// if we are in chinese mode, return on han letter at a time
|
||||
// we should not do this if we are in Japanese or Korena mode
|
||||
if (kWbClassHanLetter == char_class) {
|
||||
*begin = pos;
|
||||
*end = pos+1;
|
||||
*_retval = PR_TRUE;
|
||||
return NS_OK;
|
||||
}
|
||||
|
||||
PRUint32 next;
|
||||
PRBool needMoreText;
|
||||
// find the next "word"
|
||||
nsresult res = nsSampleWordBreaker::Next(text, (PRUint32) length, (PRUint32) pos,
|
||||
&next, &needMoreText);
|
||||
|
||||
NS_ASSERTION(NS_SUCCEEDED(res), "nsSampleWordBreaker::Next failed");
|
||||
if(NS_FAILED(res))
|
||||
return res;
|
||||
|
||||
// if we don't have enough text to make decision, return
|
||||
if (needMoreText) {
|
||||
*begin = pos;
|
||||
*end = pos;
|
||||
*_retval = PR_FALSE;
|
||||
return NS_OK;
|
||||
}
|
||||
|
||||
// if what we got is space or punct, look at the next break
|
||||
if ( (char_class == kWbClassSpace) || (char_class == kWbClassPunct) ) {
|
||||
// if the next "word" is not letters,
|
||||
// call itself recursively with the new pos
|
||||
return Next(text, length, next, isLastBuffer, begin, end, _retval);
|
||||
}
|
||||
|
||||
// for the rest, return
|
||||
*begin = pos;
|
||||
*end = next;
|
||||
*_retval = PR_TRUE;
|
||||
return NS_OK;
|
||||
}
|
||||
|
|
@ -0,0 +1,58 @@
|
|||
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
|
||||
/* ***** BEGIN LICENSE BLOCK *****
|
||||
* Version: NPL 1.1/GPL 2.0/LGPL 2.1
|
||||
*
|
||||
* The contents of this file are subject to the Netscape Public License
|
||||
* Version 1.1 (the "License"); you may not use this file except in
|
||||
* compliance with the License. You may obtain a copy of the License at
|
||||
* http://www.mozilla.org/NPL/
|
||||
*
|
||||
* Software distributed under the License is distributed on an "AS IS" basis,
|
||||
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
|
||||
* for the specific language governing rights and limitations under the
|
||||
* License.
|
||||
*
|
||||
* The Original Code is mozilla.org code.
|
||||
*
|
||||
* The Initial Developer of the Original Code is
|
||||
* Netscape Communications Corporation.
|
||||
* Portions created by the Initial Developer are Copyright (C) 1998
|
||||
* the Initial Developer. All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
*
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the terms of
|
||||
* either the GNU General Public License Version 2 or later (the "GPL"), or
|
||||
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
|
||||
* in which case the provisions of the GPL or the LGPL are applicable instead
|
||||
* of those above. If you wish to allow use of your version of this file only
|
||||
* under the terms of either the GPL or the LGPL, and not to allow others to
|
||||
* use your version of this file under the terms of the NPL, indicate your
|
||||
* decision by deleting the provisions above and replace them with the notice
|
||||
* and other provisions required by the GPL or the LGPL. If you do not delete
|
||||
* the provisions above, a recipient may use your version of this file under
|
||||
* the terms of any one of the NPL, the GPL or the LGPL.
|
||||
*
|
||||
* ***** END LICENSE BLOCK ***** */
|
||||
|
||||
#ifndef nsSemanticUnitScanner_h__
|
||||
#define nsSemanticUnitScanner_h__
|
||||
|
||||
#include "nsSampleWordBreaker.h"
|
||||
#include "nsISemanticUnitScanner.h"
|
||||
|
||||
|
||||
/**
 * Implementation of nsISemanticUnitScanner. It also derives from
 * nsSampleWordBreaker so that the scanner can call that class's GetClass()
 * and Next() directly.
 */
class nsSemanticUnitScanner : public nsISemanticUnitScanner
|
||||
, public nsSampleWordBreaker
|
||||
{
|
||||
public:
|
||||
NS_DECL_ISUPPORTS
|
||||
NS_DECL_NSISEMANTICUNITSCANNER
|
||||
|
||||
nsSemanticUnitScanner();
|
||||
virtual ~nsSemanticUnitScanner();
|
||||
/* no additional members */
|
||||
};
|
||||
|
||||
#endif
|
|
@ -1243,6 +1243,13 @@
|
|||
<FILEKIND>Text</FILEKIND>
|
||||
<FILEFLAGS>Debug</FILEFLAGS>
|
||||
</FILE>
|
||||
<FILE>
|
||||
<PATHTYPE>Name</PATHTYPE>
|
||||
<PATH>nsSemanticUnitScanner.cpp</PATH>
|
||||
<PATHFORMAT>MacOS</PATHFORMAT>
|
||||
<FILEKIND>Text</FILEKIND>
|
||||
<FILEFLAGS>Debug</FILEFLAGS>
|
||||
</FILE>
|
||||
</FILELIST>
|
||||
<LINKORDER>
|
||||
<FILEREF>
|
||||
|
@ -1435,6 +1442,11 @@
|
|||
<PATH>nsEntityConverter.cpp</PATH>
|
||||
<PATHFORMAT>MacOS</PATHFORMAT>
|
||||
</FILEREF>
|
||||
<FILEREF>
|
||||
<PATHTYPE>Name</PATHTYPE>
|
||||
<PATH>nsSemanticUnitScanner.cpp</PATH>
|
||||
<PATHFORMAT>MacOS</PATHFORMAT>
|
||||
</FILEREF>
|
||||
</LINKORDER>
|
||||
</TARGET>
|
||||
<TARGET>
|
||||
|
@ -2627,6 +2639,13 @@
|
|||
<FILEKIND>Text</FILEKIND>
|
||||
<FILEFLAGS>Debug</FILEFLAGS>
|
||||
</FILE>
|
||||
<FILE>
|
||||
<PATHTYPE>Name</PATHTYPE>
|
||||
<PATH>nsSemanticUnitScanner.cpp</PATH>
|
||||
<PATHFORMAT>MacOS</PATHFORMAT>
|
||||
<FILEKIND>Text</FILEKIND>
|
||||
<FILEFLAGS>Debug</FILEFLAGS>
|
||||
</FILE>
|
||||
</FILELIST>
|
||||
<LINKORDER>
|
||||
<FILEREF>
|
||||
|
@ -2819,6 +2838,11 @@
|
|||
<PATH>nsEntityConverter.cpp</PATH>
|
||||
<PATHFORMAT>MacOS</PATHFORMAT>
|
||||
</FILEREF>
|
||||
<FILEREF>
|
||||
<PATHTYPE>Name</PATHTYPE>
|
||||
<PATH>nsSemanticUnitScanner.cpp</PATH>
|
||||
<PATHFORMAT>MacOS</PATHFORMAT>
|
||||
</FILEREF>
|
||||
</LINKORDER>
|
||||
</TARGET>
|
||||
</TARGETLIST>
|
||||
|
@ -3034,6 +3058,12 @@
|
|||
<PATH>rulebrk.c</PATH>
|
||||
<PATHFORMAT>MacOS</PATHFORMAT>
|
||||
</FILEREF>
|
||||
<FILEREF>
|
||||
<TARGETNAME>i18n.shlb</TARGETNAME>
|
||||
<PATHTYPE>Name</PATHTYPE>
|
||||
<PATH>nsSemanticUnitScanner.cpp</PATH>
|
||||
<PATHFORMAT>MacOS</PATHFORMAT>
|
||||
</FILEREF>
|
||||
</GROUP>
|
||||
<GROUP><NAME>strres</NAME>
|
||||
<FILEREF>
|
||||
|
|
Загрузка…
Ссылка в новой задаче