#12481 libmime to feed more characters for charset detection

r=naoki, r=ducarroz, sr=sspitzer
This commit is contained in:
shanjian%netscape.com 2001-10-24 00:16:01 +00:00
Родитель db72c5cdf7
Коммит e97a3c32b9
4 изменённых файлов: 262 добавлений и 64 удалений

Просмотреть файл

@ -59,6 +59,7 @@
#include "mimebuf.h"
#include "nsMsgI18N.h"
#include "nsMimeTypes.h"
#include "nsICharsetConverterManager2.h"
static NS_DEFINE_CID(kPrefCID, NS_PREF_CID);
static NS_DEFINE_CID(kCharsetConverterManagerCID, NS_ICHARSETCONVERTERMANAGER_CID);
@ -1637,6 +1638,87 @@ void comi18n_destructor()
NS_IF_RELEASE(MimeCharsetConverterClass::mDetector);
}
//detect charset soly based on aBuf. return in aCharset
nsresult
MIME_detect_charset(const char *aBuf, PRInt32 aLength, const char** aCharset)
{
nsresult res;
char theBuffer[128];
CBufDescriptor theBufDecriptor( theBuffer, PR_TRUE, sizeof(theBuffer), 0);
nsCAutoString detector_contractid(theBufDecriptor);
nsXPIDLString detector_name;
nsCOMPtr<nsIStringCharsetDetector> detector;
detector_contractid.Assign(NS_STRCDETECTOR_CONTRACTID_BASE);
nsCOMPtr<nsIPref> prefs(do_GetService(NS_PREF_CONTRACTID, &res));
if (NS_SUCCEEDED(res)) {
if (NS_SUCCEEDED(prefs->GetLocalizedUnicharPref("intl.charset.detector", getter_Copies(detector_name)))) {
detector_contractid.Append(NS_ConvertUCS2toUTF8(detector_name).get());
}
}
if (detector_contractid.Length() > sizeof(NS_STRCDETECTOR_CONTRACTID_BASE)) {
detector = do_CreateInstance(detector_contractid, &res);
if (NS_SUCCEEDED(res)) {
nsDetectionConfident oConfident;
res = detector->DoIt(aBuf, aLength, aCharset, oConfident);
if (NS_SUCCEEDED(res) && (eBestAnswer == oConfident || eSureAnswer == oConfident)) {
return NS_OK;
}
else
*aCharset = nsnull;
}
}
return res;
}
//Get unicode decoder(from inputcharset to unicode) for aInputCharset
nsresult
MIME_get_unicode_decoder(const char* aInputCharset, nsIUnicodeDecoder **aDecoder)
{
nsresult res;
// get charset converters.
nsCOMPtr<nsICharsetConverterManager2> ccm2 =
do_GetService(NS_CHARSETCONVERTERMANAGER_CONTRACTID, &res);
if (NS_SUCCEEDED(res)) {
nsCOMPtr <nsIAtom> charsetAtom;
if (*aInputCharset)
res = ccm2->GetCharsetAtom(NS_ConvertASCIItoUCS2(aInputCharset).get(), getter_AddRefs(charsetAtom));
else
res = ccm2->GetCharsetAtom(NS_LITERAL_STRING("ISO-8859-1").get(), getter_AddRefs(charsetAtom));
// create a decoder (conv to unicode), ok if failed if we do auto detection
if (NS_SUCCEEDED(res))
res = ccm2->GetUnicodeDecoder(charsetAtom, aDecoder);
}
return res;
}
//Get unicode encoder(from unicode to inputcharset) for aOutputCharset
nsresult
MIME_get_unicode_encoder(const char* aOutputCharset, nsIUnicodeEncoder **aEncoder)
{
nsresult res;
// get charset converters.
nsCOMPtr<nsICharsetConverterManager2> ccm2 =
do_GetService(NS_CHARSETCONVERTERMANAGER_CONTRACTID, &res);
if (NS_SUCCEEDED(res)) {
nsCOMPtr <nsIAtom> charsetAtom;
if (*aOutputCharset) {
res = ccm2->GetCharsetAtom(NS_ConvertASCIItoUCS2(aOutputCharset).get(), getter_AddRefs(charsetAtom));
// create a encoder (conv from unicode)
if (NS_SUCCEEDED(res))
res = ccm2->GetUnicodeEncoder(charsetAtom, aEncoder);
}
}
return res;
}
} /* end of extern "C" */
// END PUBLIC INTERFACE

Просмотреть файл

@ -135,6 +135,10 @@ PRInt32 MIME_ConvertCharset(const PRBool autoDetection, const char* from_charset
*/
char * NextChar_UTF8(char *str);
nsresult MIME_detect_charset(const char *aBuf, PRInt32 aLength, const char** aCharset);
nsresult MIME_get_unicode_decoder(const char* aInputCharset, nsIUnicodeDecoder **aDecoder);
nsresult MIME_get_unicode_encoder(const char* aOutputCharset, nsIUnicodeEncoder **aEncoder);
#ifdef __cplusplus
} /* extern "C" */
#endif /* __cplusplus */

Просмотреть файл

@ -28,6 +28,7 @@
#include "mimetext.h"
#include "mimebuf.h"
#include "mimethtm.h"
#include "comi18n.h"
#include "prlog.h"
#include "prmem.h"
@ -50,6 +51,7 @@ static int MimeInlineText_parse_end (MimeObject *, PRBool);
static int MimeInlineText_parse_decoded_buffer (char *, PRInt32, MimeObject *);
static int MimeInlineText_rotate_convert_and_parse_line(char *, PRInt32,
MimeObject *);
static int MimeInlineText_open_dam(MimeObject *obj);
static int
MimeInlineTextClassInitialize(MimeInlineTextClass *clazz)
@ -72,6 +74,7 @@ static int
MimeInlineText_initialize (MimeObject *obj)
{
MimeInlineText *text = (MimeInlineText *) obj;
text->inputAutodetect = PR_FALSE;
/* This is an abstract class; it shouldn't be directly instanciated. */
PR_ASSERT(obj->clazz != (MimeObjectClass *) &mimeInlineTextClass);
@ -107,8 +110,9 @@ MimeInlineText_initialize (MimeObject *obj)
if (!text->charset)
{
//we need to autodetect, but set defaultCharset first
if (obj->options && obj->options->default_charset)
text->charset = nsCRT::strdup(obj->options->default_charset);
text->defaultCharset = nsCRT::strdup(obj->options->default_charset);
else
{
// New change for falling back to a default view charset
@ -125,13 +129,29 @@ MimeInlineText_initialize (MimeObject *obj)
if (!text->defaultCharset)
text->defaultCharset = nsCRT::strdup("");
text->charset = nsCRT::strdup("");
}
text->inputAutodetect = PR_TRUE;
}
}
}
if (text->inputAutodetect)
{
//we need to prepare lineDam for charset detection
text->lineDamBuffer = (char*)PR_Malloc(DAM_MAX_BUFFER_SIZE);
text->lineDamPtrs = (char**)PR_Malloc(DAM_MAX_LINES*sizeof(char*));
text->curDamOffset = 0;
text->lastLineInDam = 0;
if (!text->lineDamBuffer || !text->lineDamPtrs)
{
text->charset = text->defaultCharset;
text->defaultCharset = nsnull;
text->inputAutodetect = PR_FALSE;
PR_FREEIF(text->lineDamBuffer);
PR_FREEIF(text->lineDamPtrs);
}
}
return ((MimeObjectClass*)&MIME_SUPERCLASS)->initialize(obj);
}
@ -151,6 +171,11 @@ MimeInlineText_finalize (MimeObject *obj)
PR_ASSERT(!text->cbuffer);
PR_FREEIF (text->cbuffer);
if (text->inputAutodetect) {
PR_FREEIF(text->lineDamBuffer);
PR_FREEIF(text->lineDamPtrs);
}
((MimeObjectClass*)&MIME_SUPERCLASS)->finalize (obj);
}
@ -161,6 +186,12 @@ MimeInlineText_parse_eof (MimeObject *obj, PRBool abort_p)
if (obj->closed_p) return 0;
NS_ASSERTION(!obj->parsed_p, "obj already parsed");
MimeInlineText *text = (MimeInlineText *) obj;
//we haven't find charset yet? now its the time
if (!text->charset && text->inputAutodetect)
MimeInlineText_open_dam(obj);
/* If there is still data in the ibuffer, that means that the last line of
this part didn't end in a newline; so push it out anyway (this means that
the parse_line method will be called with a string with no trailing
@ -278,6 +309,119 @@ MimeInlineText_parse_decoded_buffer (char *buf, PRInt32 size, MimeObject *obj)
&(text)->cbuffer, &(text)->cbuffer_size) \
: 0)
static int
MimeInlineText_convert_and_parse_line(char *line, PRInt32 length, MimeObject *obj)
{
int status;
char *converted = 0;
PRInt32 converted_len = 0;
MimeInlineText *text = (MimeInlineText *) obj;
//in case of charset autodetection, charset can be override by meta charset
if (text->inputAutodetect) {
if (mime_typep(obj, (MimeObjectClass *) &mimeInlineTextHTMLClass))
{
MimeInlineTextHTML *textHTML = (MimeInlineTextHTML *) obj;
if (textHTML->charset &&
*textHTML->charset &&
nsCRT::strcmp(textHTML->charset, text->charset))
{
//if meta tag specified charset is different from our detected result, use meta charset.
//but we don't want to redo previous lines
MIME_get_unicode_decoder(textHTML->charset, getter_AddRefs(text->inputDecoder));
PR_Free(text->charset);
text->charset = nsCRT::strdup(textHTML->charset);
}
}
}
//initiate decoder if not yet
if (text->inputDecoder == nsnull)
MIME_get_unicode_decoder(text->charset, getter_AddRefs(text->inputDecoder));
if (text->utf8Encoder == nsnull)
MIME_get_unicode_encoder("UTF-8", getter_AddRefs(text->utf8Encoder));
PRBool useInputCharsetConverter = obj->options->m_inputCharsetToUnicodeDecoder && !nsCRT::strcasecmp(text->charset, obj->options->charsetForCachedInputDecoder);
if (useInputCharsetConverter)
status = obj->options->charset_conversion_fn(/*input_autodetect*/PR_FALSE, line, length,
text->charset,
"UTF-8",
&converted,
&converted_len,
obj->options->stream_closure, obj->options->m_inputCharsetToUnicodeDecoder,
obj->options->m_unicodeToUTF8Encoder);
else
status = obj->options->charset_conversion_fn(/*input_autodetect*/PR_FALSE, line, length,
text->charset,
"UTF-8",
&converted,
&converted_len,
obj->options->stream_closure, (nsIUnicodeDecoder*)text->inputDecoder,
(nsIUnicodeEncoder*)text->utf8Encoder);
if (status < 0)
{
PR_FREEIF(converted);
return status;
}
if (converted)
{
line = converted;
length = converted_len;
}
/* Now that the line has been converted, call the subclass's parse_line
method with the decoded data. */
status = obj->clazz->parse_line(line, length, obj);
PR_FREEIF(converted);
return status;
}
//In this function call, all buffered lines in lineDam will be sent to charset detector
// and a charset will be used to parse all those line and following lines in this mime obj.
static int
MimeInlineText_open_dam(MimeObject *obj)
{
MimeInlineText *text = (MimeInlineText *) obj;
const char* detectedCharset;
nsresult res;
int status;
res = MIME_detect_charset(text->lineDamBuffer, text->curDamOffset, &detectedCharset);
if (NS_SUCCEEDED(res) && detectedCharset && *detectedCharset)
text->charset = nsCRT::strdup(detectedCharset);
else
{
PR_ASSERT(!text->charset);
//if autodetection does not lead to a result, use default-charset.
text->charset = text->defaultCharset;
text->defaultCharset = nsnull;
}
for (PRInt32 i = 0; i < text->lastLineInDam-1; i++)
{
status = MimeInlineText_convert_and_parse_line(
text->lineDamPtrs[i],
text->lineDamPtrs[i+1] - text->lineDamPtrs[i],
obj );
}
status = MimeInlineText_convert_and_parse_line(
text->lineDamPtrs[i],
text->lineDamBuffer + text->curDamOffset - text->lineDamPtrs[i],
obj );
PR_Free(text->lineDamPtrs);
PR_Free(text->lineDamBuffer);
text->lineDamPtrs = nsnull;
text->lineDamBuffer = nsnull;
return status;
}
static int
MimeInlineText_rotate_convert_and_parse_line(char *line, PRInt32 length,
@ -285,7 +429,6 @@ MimeInlineText_rotate_convert_and_parse_line(char *line, PRInt32 length,
{
int status;
MimeInlineTextClass *textc = (MimeInlineTextClass *) obj->clazz;
char *converted = 0;
PR_ASSERT(!obj->closed_p);
if (obj->closed_p) return -1;
@ -312,72 +455,30 @@ MimeInlineText_rotate_convert_and_parse_line(char *line, PRInt32 length,
(doConvert)
)
{
PRInt32 converted_len = 0;
const char *input_charset = NULL;
PRBool input_autodetect = PR_FALSE;
MimeInlineText *text = (MimeInlineText *) obj;
//
// Ok, first, check if this is an Inline HTML display, and if so,
// see if we detected a charset via a META tag.
//
if (mime_typep(obj, (MimeObjectClass *) &mimeInlineTextHTMLClass))
//if we don't have a charset yet, and autodetect is on, push line to dam
if (!(text->charset) && text->inputAutodetect)
{
MimeInlineTextHTML *textHTML = (MimeInlineTextHTML *) obj;
input_charset = textHTML->charset;
}
if (!input_charset)
{
if (obj->options->override_charset && obj->options->default_charset && *(obj->options->default_charset))
input_charset = obj->options->default_charset;
else if ( (text) && (text->charset) && (*(text->charset)) )
input_charset = text->charset;
else
{
if (obj->options->default_charset)
input_charset = obj->options->default_charset;
else
input_charset = text->defaultCharset;
input_autodetect = PR_TRUE;
//see if we reach the lineDam buffer limit, if so, there is no need to keep buffering
if (text->lastLineInDam >= DAM_MAX_LINES ||
DAM_MAX_BUFFER_SIZE - text->curDamOffset <= length) {
MimeInlineText_open_dam(obj);
status = MimeInlineText_convert_and_parse_line(line, length, obj);
}
else {
//buffering current line
text->lineDamPtrs[text->lastLineInDam] = text->lineDamBuffer + text->curDamOffset;
nsCRT::memcpy(text->lineDamPtrs[text->lastLineInDam], line, length);
text->lastLineInDam++;
text->curDamOffset += length;
}
}
const char *inputCharset = !nsCRT::strcasecmp(input_charset, "us-ascii") ? "ISO-8859-1" : input_charset;
PRBool useInputCharsetConverter = obj->options->m_inputCharsetToUnicodeDecoder && !nsCRT::strcasecmp(inputCharset, obj->options->charsetForCachedInputDecoder);
if (useInputCharsetConverter)
status = obj->options->charset_conversion_fn(input_autodetect, line, length,
inputCharset,
"UTF-8",
&converted,
&converted_len,
obj->options->stream_closure, obj->options->m_inputCharsetToUnicodeDecoder,
obj->options->m_unicodeToUTF8Encoder);
else
status = obj->options->charset_conversion_fn(input_autodetect, line, length,
input_charset,
"UTF-8",
&converted,
&converted_len,
obj->options->stream_closure, nsnull,
obj->options->m_unicodeToUTF8Encoder);
if (status < 0)
{
PR_FREEIF(converted);
return status;
}
if (converted)
{
line = converted;
length = converted_len;
}
else
status = MimeInlineText_convert_and_parse_line(line, length, obj);
}
else
status = obj->clazz->parse_line(line, length, obj);
/* Now that the line has been converted, call the subclass's parse_line
method with the decoded data. */
status = obj->clazz->parse_line(line, length, obj);
PR_FREEIF(converted);
return status;
}

Просмотреть файл

@ -83,6 +83,9 @@ struct MimeInlineTextClass {
extern MimeInlineTextClass mimeInlineTextClass;
#define DAM_MAX_BUFFER_SIZE 8*1024
#define DAM_MAX_LINES 1024
struct MimeInlineText {
MimeLeaf leaf; /* superclass variables */
char *charset; /* The charset from the content-type of this
@ -91,7 +94,15 @@ struct MimeInlineText {
char *defaultCharset; /* This is a charset to use when all else fails */
char *cbuffer; /* Buffer used for charset conversion. */
PRInt32 cbuffer_size;
nsCOMPtr<nsIUnicodeDecoder> inputDecoder;
nsCOMPtr<nsIUnicodeEncoder> utf8Encoder;
PRBool inputAutodetect;
PRInt32 lastLineInDam;
PRInt32 curDamOffset;
char *lineDamBuffer;
char **lineDamPtrs;
};
#endif /* _MIMETEXT_H_ */