bug 176528, need a nsISemanticUnitScanner to support intl spam mail filter

Implement a new interface for this purpose.
win&linux, p=ftang, r=shanjian, sr=beard
mac build, p=nhotta, r=ftang, sr=beard
This commit is contained in:
shanjian%netscape.com 2002-11-20 21:13:52 +00:00
Родитель 75b79eb6e0
Коммит 3282f4a651
13 изменённых файлов: 345 добавлений и 13 удалений

Просмотреть файл

@ -815,6 +815,7 @@ sub BuildClientDist()
#LWBRK
InstallFromManifest(":mozilla:intl:lwbrk:public:MANIFEST", "$distdirectory:lwbrk:");
InstallFromManifest(":mozilla:intl:lwbrk:idl:MANIFEST_IDL", "$distdirectory:idl:");
#STRRES
InstallFromManifest(":mozilla:intl:strres:public:MANIFEST_IDL", "$distdirectory:idl:");
@ -1507,6 +1508,7 @@ sub BuildIDLProjects()
BuildIDLProject(":mozilla:intl:unicharutil:macbuild:unicharutilIDL.xml", "unicharutil");
BuildIDLProject(":mozilla:intl:uconv:macbuild:uconvIDL.xml", "uconv");
BuildIDLProject(":mozilla:intl:chardet:macbuild:chardetIDL.xml", "chardet");
BuildIDLProject(":mozilla:intl:lwbrk:macbuild:lwbrkIDL.xml", "lwbrk");
if ($main::options{iiextras})
{

Просмотреть файл

@ -46,6 +46,7 @@
// lwbrk
#include "nsLWBrkConstructors.h"
#include "nsSemanticUnitScanner.h"
// unicharutil
#include "nsUcharUtilConstructors.h"
@ -56,6 +57,9 @@
// locale
#include "nsLocaleConstructors.h"
NS_GENERIC_FACTORY_CONSTRUCTOR(nsSemanticUnitScanner);
static NS_METHOD
AddCategoryEntry(const char* category,
const char* key,
@ -290,6 +294,8 @@ static nsModuleComponentInfo components[] =
// lwbrk
{ "Line and Word Breaker", NS_LWBRK_CID,
NS_LWBRK_CONTRACTID, nsLWBreakerFImpConstructor},
{ "Semantic Unit Scanner", NS_SEMANTICUNITSCANNER_CID,
NS_SEMANTICUNITSCANNER_CONTRACTID, nsSemanticUnitScannerConstructor},
// unicharutil
{ "Unichar Utility", NS_UNICHARUTIL_CID,

Просмотреть файл

@ -26,7 +26,7 @@ VPATH = @srcdir@
include $(DEPTH)/config/autoconf.mk
DIRS = public src
DIRS = idl public src
ifdef ENABLE_TESTS
DIRS += tests

Просмотреть файл

Просмотреть файл

@ -0,0 +1,35 @@
#
# The contents of this file are subject to the Netscape Public
# License Version 1.1 (the "License"); you may not use this file
# except in compliance with the License. You may obtain a copy of
# the License at http://www.mozilla.org/NPL/
#
# Software distributed under the License is distributed on an "AS
# IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
# implied. See the License for the specific language governing
# rights and limitations under the License.
#
# The Original Code is mozilla.org code.
#
# The Initial Developer of the Original Code is Netscape
# Communications Corporation. Portions created by Netscape are
# Copyright (C) 1998 Netscape Communications Corporation. All
# Rights Reserved.
#
# Contributor(s):
#
# Relative path from this directory up to the top of the build tree.
DEPTH = ../../..
# The @...@ values below are autoconf-style placeholders
# (presumably substituted when the Makefile is generated -- confirm).
topsrcdir = @top_srcdir@
srcdir = @srcdir@
VPATH = @srcdir@
include $(DEPTH)/config/autoconf.mk
# Module this directory belongs to.
MODULE = lwbrk
# IDL interface definitions provided by this directory.
XPIDLSRCS = \
nsISemanticUnitScanner.idl \
$(NULL)
include $(topsrcdir)/config/rules.mk

Просмотреть файл

@ -0,0 +1,82 @@
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/* ***** BEGIN LICENSE BLOCK *****
* Version: NPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Netscape Public License
* Version 1.1 (the "License"); you may not use this file except in
* compliance with the License. You may obtain a copy of the License at
* http://www.mozilla.org/NPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is mozilla.org code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 1998
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Frank Yung-Fong Tang <ftang@netscape.com>
*
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the NPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the NPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#include "nsISupports.idl"
%{C++
// Class ID and contract ID used to register and look up the
// semantic unit scanner component.
// {ADF42751-1CEF-4ad2-AA8E-BCB849D8D31F}
#define NS_SEMANTICUNITSCANNER_CID { 0xadf42751, 0x1cef, 0x4ad2, { 0xaa, 0x8e, 0xbc, 0xb8, 0x49, 0xd8, 0xd3, 0x1f}}
#define NS_SEMANTICUNITSCANNER_CONTRACTID "@mozilla.org/intl/semanticunitscanner;1"
%}
/**
* Provides a language independent way to break UNICODE
* text into meaningful semantic units (e.g. words).
*/
[scriptable, uuid(9f620be4-e535-11d6-b254-00039310a47a)]
interface nsISemanticUnitScanner : nsISupports {
/**
* start()
*
* Starts up the semantic unit scanner with an optional
* character set, which acts as a hint to optimize the heuristics
* used to determine the language(s) of the processed text.
*
* @param characterSet the character set the text was originally
* encoded in (can be NULL)
*/
void start(in string characterSet);
/**
* next()
*
* Gets the begin / end offsets of the next semantic unit in the
* given text, searching from the current position.
*
* @param text the text to be scanned
* @param length the number of characters in the text to be processed
* @param pos the current position
* @param isLastBuffer true if this buffer is the last one
* @param begin the begin offset of the next unit
* @param end the end offset of the next unit
* @return true if a unit was found, i.e. the current text has more units
*/
boolean next(in wstring text, in long length, in long pos,
in boolean isLastBuffer,
out long begin, out long end );
};

Просмотреть файл

Просмотреть файл

@ -42,6 +42,7 @@ CPPSRCS = \
nsJISx4501LineBreaker.cpp \
nsLWBreakerFImp.cpp \
nsSampleWordBreaker.cpp \
nsSemanticUnitScanner.cpp \
$(NULL)
include $(topsrcdir)/config/rules.mk

Просмотреть файл

@ -74,18 +74,6 @@ nsresult nsSampleWordBreaker::BreakInBetween(
}
// hack
typedef enum {
kWbClassSpace = 0,
kWbClassAlphaLetter,
kWbClassPunct,
kWbClassHanLetter,
kWbClassKatakanaLetter,
kWbClassHiraganaLetter,
kWbClassHWKatakanaLetter,
kWbClassThaiLetter
} wb_class;
#define IS_ASCII(c) (0 == ( 0xFF80 & (c)))
#define ASCII_IS_ALPHA(c) ((( 'a' <= (c)) && ((c) <= 'z')) || (( 'A' <= (c)) && ((c) <= 'Z')))
#define ASCII_IS_DIGIT(c) (( '0' <= (c)) && ((c) <= '9'))

Просмотреть файл

@ -41,6 +41,17 @@
#include "nsIWordBreaker.h"
// Character classes used by the sample word breaker to group characters
// into words. Declared in this header so that code outside
// nsSampleWordBreaker.cpp (e.g. nsSemanticUnitScanner) can compare against
// the value returned by GetClass().
typedef enum {
kWbClassSpace = 0,
kWbClassAlphaLetter,
kWbClassPunct,
kWbClassHanLetter,
kWbClassKatakanaLetter,
kWbClassHiraganaLetter,
kWbClassHWKatakanaLetter, // presumably half-width katakana -- confirm against GetClass()
kWbClassThaiLetter
} wb_class;
class nsSampleWordBreaker : public nsIWordBreaker
{
NS_DECL_ISUPPORTS

Просмотреть файл

@ -0,0 +1,119 @@
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* ***** BEGIN LICENSE BLOCK *****
* Version: NPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Netscape Public License
* Version 1.1 (the "License"); you may not use this file except in
* compliance with the License. You may obtain a copy of the License at
* http://www.mozilla.org/NPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is mozilla.org code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 1998
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
*
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the NPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the NPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#include "nsSemanticUnitScanner.h"
#include "prmem.h"
// XPCOM boilerplate: provides the nsISupports method implementations declared
// by NS_DECL_ISUPPORTS in the class, with nsISemanticUnitScanner as the one
// interface listed.
NS_IMPL_ISUPPORTS1(nsSemanticUnitScanner, nsISemanticUnitScanner)
/* Default constructor: default-constructs the nsSampleWordBreaker base and
 * runs NS_INIT_ISUPPORTS() (presumably the XPCOM reference-count setup --
 * confirm against the macro definition). No other state is initialized. */
nsSemanticUnitScanner::nsSemanticUnitScanner() : nsSampleWordBreaker()
{
NS_INIT_ISUPPORTS();
/* member initializers and constructor code */
}
/* Destructor: intentionally empty; nothing is released here. */
nsSemanticUnitScanner::~nsSemanticUnitScanner()
{
/* destructor code */
}
/* void start (in string characterSet); */
/**
 * Starts the scanner.
 *
 * @param characterSet  character set hint; currently ignored
 * @return always NS_OK
 */
NS_IMETHODIMP nsSemanticUnitScanner::Start(const char *characterSet)
{
// The character set hint is not used yet, so there is nothing to set up.
return NS_OK;
}
/* boolean next (in wstring text, in long length, in long pos, in boolean isLastBuffer, out long begin, out long end); */
/**
 * Finds the next semantic unit in |text| at or after |pos|.
 *
 * Runs of space and punctuation characters are skipped. A Han letter is
 * always returned as a unit of its own; any other run of characters is
 * delimited with nsSampleWordBreaker::Next().
 *
 * @param text          the text to be scanned
 * @param length        the number of characters in |text|
 * @param pos           the offset at which to start looking
 * @param isLastBuffer  PR_TRUE if no further text will follow this buffer.
 *                      In that case a unit that runs up to the end of the
 *                      buffer is complete and is returned; otherwise it is
 *                      held back until more text is available.
 * @param begin         [out] begin offset of the unit found
 * @param end           [out] end offset (exclusive) of the unit found
 * @param _retval       [out] PR_TRUE if a unit was found. PR_FALSE if there
 *                      is none; |*begin| and |*end| are then both set to the
 *                      offset at which scanning stopped.
 * @return NS_OK on success, NS_ERROR_NULL_POINTER if a required pointer is
 *         null, NS_ERROR_ILLEGAL_VALUE if |pos| is negative, or the failure
 *         code reported by nsSampleWordBreaker::Next().
 */
NS_IMETHODIMP nsSemanticUnitScanner::Next(const PRUnichar *text, PRInt32 length, PRInt32 pos, PRBool isLastBuffer, PRInt32 *begin, PRInt32 *end, PRBool *_retval)
{
  // The out parameters are written on every successful path, so reject
  // null pointers up front instead of crashing.
  if (!begin || !end || !_retval)
    return NS_ERROR_NULL_POINTER;

  // A negative position would index before the start of the buffer.
  if (pos < 0)
    return NS_ERROR_ILLEGAL_VALUE;

  // |text| is only dereferenced when there is something left to scan.
  if (pos < length && !text)
    return NS_ERROR_NULL_POINTER;

  // Each iteration looks at the run of characters starting at |pos|.
  // Separator runs (space / punctuation) advance |pos| and loop again;
  // everything else returns from inside the loop.
  while (pos < length) {
    PRUint8 char_class = nsSampleWordBreaker::GetClass(text[pos]);

    // if we are in chinese mode, return one han letter at a time
    // we should not do this if we are in Japanese or Korean mode
    if (kWbClassHanLetter == char_class) {
      *begin = pos;
      *end = pos + 1;
      *_retval = PR_TRUE;
      return NS_OK;
    }

    PRUint32 next;
    PRBool needMoreText;
    // find the next "word"
    nsresult res = nsSampleWordBreaker::Next(text, (PRUint32) length, (PRUint32) pos,
                                             &next, &needMoreText);
    NS_ASSERTION(NS_SUCCEEDED(res), "nsSampleWordBreaker::Next failed");
    if (NS_FAILED(res))
      return res;

    PRBool isSeparator =
      (char_class == kWbClassSpace) || (char_class == kWbClassPunct);

    if (needMoreText) {
      // The breaker cannot tell where this run ends without more text.
      // If this is the last buffer no more text will ever arrive, so a
      // run of letters extends to the end of the buffer and is a unit.
      if (isLastBuffer && !isSeparator) {
        *begin = pos;
        *end = length;
        *_retval = PR_TRUE;
        return NS_OK;
      }
      // Otherwise report "no unit yet" and leave |pos| as the resume point.
      *begin = pos;
      *end = pos;
      *_retval = PR_FALSE;
      return NS_OK;
    }

    if (!isSeparator) {
      *begin = pos;
      *end = (PRInt32) next;
      *_retval = PR_TRUE;
      return NS_OK;
    }

    // what we got is space or punct: skip it and look at the next break
    pos = (PRInt32) next;
  }

  // we reached the end of the text without finding a unit
  *begin = pos;
  *end = pos;
  *_retval = PR_FALSE;
  return NS_OK;
}

Просмотреть файл

@ -0,0 +1,58 @@
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* ***** BEGIN LICENSE BLOCK *****
* Version: NPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Netscape Public License
* Version 1.1 (the "License"); you may not use this file except in
* compliance with the License. You may obtain a copy of the License at
* http://www.mozilla.org/NPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is mozilla.org code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 1998
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
*
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the NPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the NPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#ifndef nsSemanticUnitScanner_h__
#define nsSemanticUnitScanner_h__
#include "nsSampleWordBreaker.h"
#include "nsISemanticUnitScanner.h"
/**
 * Implementation of nsISemanticUnitScanner.
 *
 * Also derives from nsSampleWordBreaker; the implementation file calls the
 * breaker's GetClass() and Next() to classify characters and find word
 * boundaries.
 */
class nsSemanticUnitScanner : public nsISemanticUnitScanner
, public nsSampleWordBreaker
{
public:
// Declarations for the nsISupports methods and for the
// nsISemanticUnitScanner methods (Start and Next are defined in the .cpp).
NS_DECL_ISUPPORTS
NS_DECL_NSISEMANTICUNITSCANNER
nsSemanticUnitScanner();
virtual ~nsSemanticUnitScanner();
/* additional members */
};
#endif

Просмотреть файл

@ -1243,6 +1243,13 @@
<FILEKIND>Text</FILEKIND>
<FILEFLAGS>Debug</FILEFLAGS>
</FILE>
<FILE>
<PATHTYPE>Name</PATHTYPE>
<PATH>nsSemanticUnitScanner.cpp</PATH>
<PATHFORMAT>MacOS</PATHFORMAT>
<FILEKIND>Text</FILEKIND>
<FILEFLAGS>Debug</FILEFLAGS>
</FILE>
</FILELIST>
<LINKORDER>
<FILEREF>
@ -1435,6 +1442,11 @@
<PATH>nsEntityConverter.cpp</PATH>
<PATHFORMAT>MacOS</PATHFORMAT>
</FILEREF>
<FILEREF>
<PATHTYPE>Name</PATHTYPE>
<PATH>nsSemanticUnitScanner.cpp</PATH>
<PATHFORMAT>MacOS</PATHFORMAT>
</FILEREF>
</LINKORDER>
</TARGET>
<TARGET>
@ -2627,6 +2639,13 @@
<FILEKIND>Text</FILEKIND>
<FILEFLAGS>Debug</FILEFLAGS>
</FILE>
<FILE>
<PATHTYPE>Name</PATHTYPE>
<PATH>nsSemanticUnitScanner.cpp</PATH>
<PATHFORMAT>MacOS</PATHFORMAT>
<FILEKIND>Text</FILEKIND>
<FILEFLAGS>Debug</FILEFLAGS>
</FILE>
</FILELIST>
<LINKORDER>
<FILEREF>
@ -2819,6 +2838,11 @@
<PATH>nsEntityConverter.cpp</PATH>
<PATHFORMAT>MacOS</PATHFORMAT>
</FILEREF>
<FILEREF>
<PATHTYPE>Name</PATHTYPE>
<PATH>nsSemanticUnitScanner.cpp</PATH>
<PATHFORMAT>MacOS</PATHFORMAT>
</FILEREF>
</LINKORDER>
</TARGET>
</TARGETLIST>
@ -3034,6 +3058,12 @@
<PATH>rulebrk.c</PATH>
<PATHFORMAT>MacOS</PATHFORMAT>
</FILEREF>
<FILEREF>
<TARGETNAME>i18n.shlb</TARGETNAME>
<PATHTYPE>Name</PATHTYPE>
<PATH>nsSemanticUnitScanner.cpp</PATH>
<PATHFORMAT>MacOS</PATHFORMAT>
</FILEREF>
</GROUP>
<GROUP><NAME>strres</NAME>
<FILEREF>