Added support to use the specified document charset when encoding a

document to HTML to Text. The charset information is first encoded
in XIF and then that information is used when interpreting the Unicode
for output.

Added support to output character entity information, which should address
bug 4709.
This commit is contained in:
kostello%netscape.com 1999-04-26 06:16:49 +00:00
Родитель 09c3f2e1b1
Коммит 65091e3762
12 изменённых файлов: 634 добавлений и 114 удалений

Просмотреть файл

@ -36,6 +36,14 @@
#include "nsIParser.h"
#include "nsHTMLEntities.h"
#include "nsIUnicodeEncoder.h"
#include "nsICharsetAlias.h"
#include "nsIServiceManager.h"
#include "nsICharsetConverterManager.h"
static NS_DEFINE_IID(kISupportsIID, NS_ISUPPORTS_IID);
static NS_DEFINE_IID(kIContentSinkIID, NS_ICONTENT_SINK_IID);
static NS_DEFINE_IID(kIHTMLContentSinkIID, NS_IHTML_CONTENT_SINK_IID);
@ -343,6 +351,58 @@ NS_New_HTML_ContentSinkStream(nsIHTMLContentSink** aInstancePtrResult,
return it->QueryInterface(kIHTMLContentSinkIID, (void **)aInstancePtrResult);
}
/**
 * Inits the encoder instance variable for the sink based on the charset.
 *
 * The charset name is mapped to its preferred alias when the alias service
 * can do so; otherwise ISO-8859-1 is used. Resolving the alias is best
 * effort: the encoder lookup is attempted even when the alias service is
 * unavailable, because UnicodeToHTMLString() uses mUnicodeEncoder right
 * after calling this method.
 *
 * @update  gpk 4/21/99
 * @param   aCharset  name (or alias) of the charset to encode output in
 * @return  NS_xxx error result of the encoder lookup; mUnicodeEncoder is
 *          only replaced when an encoder was actually obtained
 */
nsresult nsHTMLContentSinkStream::InitEncoder(const nsString& aCharset)
{
  nsAutoString charsetName = aCharset;

  // Step 1. Resolve the preferred name for the requested charset.
  nsICharsetAlias* calias = nsnull;
  nsresult res = nsServiceManager::GetService(kCharsetAliasCID,
                                              kICharsetAliasIID,
                                              (nsISupports**)&calias);
  NS_ASSERTION( nsnull != calias, "cannot find charet alias");
  if( NS_SUCCEEDED(res) && (nsnull != calias))
  {
    res = calias->GetPreferred(aCharset, charsetName);
    nsServiceManager::ReleaseService(kCharsetAliasCID, calias);
    if(NS_FAILED(res))
    {
      // failed - unknown alias , fallback to ISO-8859-1
      charsetName = "ISO-8859-1";
    }
  }
  else
  {
    // No alias service: still fall back to ISO-8859-1 instead of giving up,
    // so that the sink does not end up without any encoder.
    charsetName = "ISO-8859-1";
  }

  // Step 2. Ask the converter manager for an encoder for that charset.
  nsICharsetConverterManager * ccm = nsnull;
  res = nsServiceManager::GetService(kCharsetConverterManagerCID,
                                     kICharsetConverterManagerIID,
                                     (nsISupports**)&ccm);
  if(NS_SUCCEEDED(res) && (nsnull != ccm))
  {
    nsIUnicodeEncoder * encoder = nsnull;
    res = ccm->GetUnicodeEncoder(&charsetName, &encoder);
    if(NS_SUCCEEDED(res) && (nsnull != encoder))
    {
      // Swap in the new encoder only once we really have one.
      NS_IF_RELEASE(mUnicodeEncoder);
      mUnicodeEncoder = encoder;
    }
    nsServiceManager::ReleaseService(kCharsetConverterManagerCID, ccm);
  }
  return res;
}
/**
* Construct a content sink stream.
* @update gess7/7/98
@ -361,6 +421,7 @@ nsHTMLContentSinkStream::nsHTMLContentSinkStream(PRBool aDoFormat,PRBool aDoHead
mDoHeader = aDoHeader;
mBuffer = nsnull;
mBufferSize = 0;
mUnicodeEncoder = nsnull;
}
/**
@ -381,6 +442,7 @@ nsHTMLContentSinkStream::nsHTMLContentSinkStream(ostream& aStream,PRBool aDoForm
mDoHeader = aDoHeader;
mBuffer = nsnull;
mBufferSize = 0;
mUnicodeEncoder = nsnull;
}
@ -443,9 +505,16 @@ void nsHTMLContentSinkStream::UnicodeToHTMLString(const nsString& aSrc)
const char* entity = nsnull;
PRUint32 offset = 0;
PRUint32 addedLength = 0;
nsAutoString data;
if (mUnicodeEncoder == nsnull)
InitEncoder("");
if (length > 0)
{
// Step 1. Convert anything that maps to character entity to
// the entity value
EnsureBufferSize(length);
for (PRInt32 i = 0; i < length; i++)
{
@ -454,29 +523,31 @@ void nsHTMLContentSinkStream::UnicodeToHTMLString(const nsString& aSrc)
entity = UnicodeToEntity(ch);
if (entity)
{
PRUint32 size = strlen(entity);
addedLength += size;
EnsureBufferSize(length+addedLength+1);
mBuffer[offset++] = '&';
mBuffer[offset] = 0;
strcat(mBuffer,entity);
PRUint32 temp = offset + size;
while (offset < temp)
{
mBuffer[offset] = tolower(mBuffer[offset]);
offset++;
}
mBuffer[offset++] = ';';
mBuffer[offset] = 0;
nsAutoString temp(entity);
temp.ToLowerCase();
data.Append('&');
data.Append(temp);
data.Append(';');
}
else if (ch < 128)
else
{
mBuffer[offset++] = (unsigned char)ch;
mBuffer[offset] = 0;
data.Append(ch);
}
}
}
// Step 2. Run the result through the converter
length = data.Length();
EnsureBufferSize(length);
PRInt32 bufferLength = mBufferSize;
mUnicodeEncoder->Reset();
nsresult result = mUnicodeEncoder->Convert(data, &length, mBuffer, &bufferLength);
mBuffer[bufferLength] = 0;
PRInt32 temp = bufferLength;
if (NS_SUCCEEDED(result))
result = mUnicodeEncoder->Finish(mBuffer,&temp);
}
}
@ -487,6 +558,7 @@ void nsHTMLContentSinkStream::UnicodeToHTMLString(const nsString& aSrc)
* @return
*/
nsHTMLContentSinkStream::~nsHTMLContentSinkStream()
{
  // The stream we were given is not ours to destroy; just drop the pointer.
  mOutput = 0;
  // Give up our reference to the charset encoder, if one was created.
  NS_IF_RELEASE(mUnicodeEncoder);
}
@ -526,8 +598,8 @@ void nsHTMLContentSinkStream::WriteAttributes(const nsIParserNode& aNode,ostream
key.ToUpperCase();
key.ToCString(mBuffer,sizeof(gBuffer)-1);
EnsureBufferSize(key.Length());
key.ToCString(mBuffer,mBufferSize);
aStream << " " << mBuffer << char(kEqual);
mColPos += 1 + strlen(mBuffer) + 1;
@ -993,7 +1065,14 @@ nsHTMLContentSinkStream::AddLeaf(const nsIParserNode& aNode, ostream& aStream){
AddStartTag(aNode,aStream);
mHTMLTagStack[--mHTMLStackPos] = eHTMLTag_unknown;
}
if (type == eHTMLTag_text)
else if (type == eHTMLTag_entity)
{
const nsString& entity = aNode.GetText();
UnicodeToHTMLString(entity);
aStream << '&' << mBuffer << ';';
mColPos += entity.Length() + 2;
}
else if (type == eHTMLTag_text)
{
const nsString& text = aNode.GetText();
if ((mDoFormat == PR_FALSE) || preformatted == PR_TRUE)
@ -1136,9 +1215,25 @@ nsHTMLContentSinkStream::AddComment(const nsIParserNode& aNode){
*/
NS_IMETHODIMP
nsHTMLContentSinkStream::OpenContainer(const nsIParserNode& aNode){
if(mOutput) {
AddStartTag(aNode,*mOutput);
// eHTMLTags tag = (eHTMLTags)aNode.GetNodeType();
if(mOutput)
{
const nsString& name = aNode.GetText();
if (name.Equals("XIF_DOC_INFO"))
{
PRInt32 count=aNode.GetAttributeCount();
for(PRInt32 i=0;i<count;i++)
{
const nsString& key=aNode.GetKeyAt(i);
const nsString& value=aNode.GetValueAt(i);
if (key.Equals("charset"))
InitEncoder(value);
}
}
else
{
AddStartTag(aNode,*mOutput);
}
}
return NS_OK;
}

Просмотреть файл

@ -53,6 +53,8 @@
class ostream;
#endif
class nsIUnicodeEncoder;
class nsHTMLContentSinkStream : public nsIHTMLContentSink {
public:
@ -135,6 +137,7 @@ protected:
void UnicodeToHTMLString(const nsString& aSrc);
nsresult InitEncoder(const nsString& aCharset);
@ -153,6 +156,8 @@ protected:
char* mBuffer;
PRInt32 mBufferSize;
nsIUnicodeEncoder* mUnicodeEncoder;
};
extern NS_HTMLPARS nsresult

Просмотреть файл

@ -33,6 +33,12 @@
#include "nsString.h"
#include "nsIParser.h"
#include "nsHTMLEntities.h"
#include "nsXIFDTD.h"
#include "nsIUnicodeEncoder.h"
#include "nsICharsetAlias.h"
#include "nsIServiceManager.h"
#include "nsICharsetConverterManager.h"
static NS_DEFINE_IID(kISupportsIID, NS_ISUPPORTS_IID);
static NS_DEFINE_IID(kIContentSinkIID, NS_ICONTENT_SINK_IID);
@ -44,6 +50,55 @@ static PRBool IsInline(eHTMLTags aTag);
static PRBool IsBlockLevel(eHTMLTags aTag);
/**
 * Inits the encoder instance variable for the sink based on the charset.
 *
 * The charset name is mapped to its preferred alias when the alias service
 * can do so; otherwise ISO-8859-1 is used. Resolving the alias is best
 * effort: the encoder lookup is attempted even when the alias service is
 * unavailable, because UnicodeToTXTString() uses mUnicodeEncoder right
 * after calling this method.
 *
 * @update  gpk 4/21/99
 * @param   aCharset  name (or alias) of the charset to encode output in
 * @return  NS_xxx error result of the encoder lookup; mUnicodeEncoder is
 *          only replaced when an encoder was actually obtained
 */
nsresult nsHTMLToTXTSinkStream::InitEncoder(const nsString& aCharset)
{
  nsAutoString charsetName = aCharset;

  // Step 1. Resolve the preferred name for the requested charset.
  nsICharsetAlias* calias = nsnull;
  nsresult res = nsServiceManager::GetService(kCharsetAliasCID,
                                              kICharsetAliasIID,
                                              (nsISupports**)&calias);
  NS_ASSERTION( nsnull != calias, "cannot find charet alias");
  if( NS_SUCCEEDED(res) && (nsnull != calias))
  {
    res = calias->GetPreferred(aCharset, charsetName);
    nsServiceManager::ReleaseService(kCharsetAliasCID, calias);
    if(NS_FAILED(res))
    {
      // failed - unknown alias , fallback to ISO-8859-1
      charsetName = "ISO-8859-1";
    }
  }
  else
  {
    // No alias service: still fall back to ISO-8859-1 instead of giving up,
    // so that the sink does not end up without any encoder.
    charsetName = "ISO-8859-1";
  }

  // Step 2. Ask the converter manager for an encoder for that charset.
  nsICharsetConverterManager * ccm = nsnull;
  res = nsServiceManager::GetService(kCharsetConverterManagerCID,
                                     kICharsetConverterManagerIID,
                                     (nsISupports**)&ccm);
  if(NS_SUCCEEDED(res) && (nsnull != ccm))
  {
    nsIUnicodeEncoder * encoder = nsnull;
    res = ccm->GetUnicodeEncoder(&charsetName, &encoder);
    if(NS_SUCCEEDED(res) && (nsnull != encoder))
    {
      // Swap in the new encoder only once we really have one.
      NS_IF_RELEASE(mUnicodeEncoder);
      mUnicodeEncoder = encoder;
    }
    nsServiceManager::ReleaseService(kCharsetConverterManagerCID, ccm);
  }
  return res;
}
@ -117,6 +172,7 @@ nsHTMLToTXTSinkStream::nsHTMLToTXTSinkStream() {
mDoOutput = PR_FALSE;
mBufferSize = 0;
mBuffer = nsnull;
mUnicodeEncoder = nsnull;
}
/**
@ -133,6 +189,7 @@ nsHTMLToTXTSinkStream::nsHTMLToTXTSinkStream(ostream& aStream) {
mDoOutput = PR_FALSE;
mBufferSize = 0;
mBuffer = nsnull;
mUnicodeEncoder = nsnull;
}
@ -145,6 +202,7 @@ nsHTMLToTXTSinkStream::nsHTMLToTXTSinkStream(ostream& aStream) {
nsHTMLToTXTSinkStream::~nsHTMLToTXTSinkStream()
{
  // Give up our reference to the charset encoder, if one was created.
  NS_IF_RELEASE(mUnicodeEncoder);
  // Free the conversion buffer.
  delete [] mBuffer;
  // The stream we were given is not ours to destroy; just drop the pointer.
  mOutput = 0;
}
@ -409,42 +467,40 @@ void nsHTMLToTXTSinkStream::EnsureBufferSize(PRInt32 aNewSize)
}
void nsHTMLToTXTSinkStream::UnicodeToTXTString(const nsString& aSrc)
{
#define CH_NBSP 160
#define CH_QUOT 34
#define CH_AMP 38
#define CH_LT 60
#define CH_GT 62
PRInt32 length = aSrc.Length();
PRUnichar ch;
const char* entity = nsnull;
PRUint32 offset = 0;
PRUint32 addedLength = 0;
nsresult result;
PRInt32 bufferLength;
if (mUnicodeEncoder == nsnull)
InitEncoder("");
if (length > 0)
{
EnsureBufferSize(length);
for (PRInt32 i = 0; i < length; i++)
{
ch = aSrc.CharAt(i);
switch (ch)
{
case CH_QUOT: ch = '"'; break;
case CH_AMP: ch = '&'; break;
case CH_GT: ch = '>'; break;
case CH_LT: ch = '<'; break;
case CH_NBSP: ch = ' '; break;
}
bufferLength = mBufferSize;
mUnicodeEncoder->Reset();
result = mUnicodeEncoder->Convert(aSrc, &length, mBuffer, &bufferLength);
mBuffer[bufferLength] = 0;
PRInt32 temp = bufferLength;
if (NS_SUCCEEDED(result))
result = mUnicodeEncoder->Finish(mBuffer,&temp);
if (ch < 128)
{
mBuffer[offset++] = (unsigned char)ch;
mBuffer[offset] = 0;
}
for (PRInt32 i = 0; i < bufferLength; i++)
{
if (mBuffer[i] == char(CH_NBSP))
mBuffer[i] = ' ';
}
}
}
@ -483,6 +539,18 @@ nsHTMLToTXTSinkStream::AddLeaf(const nsIParserNode& aNode, ostream& aStream)
mStrBuffer.Append(mBuffer);
mColPos += text.Length();
}
else if (type == eHTMLTag_entity)
{
const nsString& text = aNode.GetText();
UnicodeToTXTString(text);
PRInt32 entity = NS_EntityToUnicode(mBuffer);
if (entity < 256)
{
char ch = (char)entity;
aStream << ch;
mColPos++;
}
}
else if (type == eHTMLTag_whitespace)
{
if (PR_TRUE)
@ -551,6 +619,18 @@ NS_IMETHODIMP
nsHTMLToTXTSinkStream::OpenContainer(const nsIParserNode& aNode){
eHTMLTags type = (eHTMLTags)aNode.GetNodeType();
const nsString& name = aNode.GetText();
if (name.Equals("XIF_DOC_INFO"))
{
PRInt32 count=aNode.GetAttributeCount();
for(PRInt32 i=0;i<count;i++)
{
const nsString& key=aNode.GetKeyAt(i);
const nsString& value=aNode.GetValueAt(i);
if (key.Equals("charset"))
InitEncoder(value);
}
}
if (type == eHTMLTag_body)
mDoOutput = PR_TRUE;

Просмотреть файл

@ -51,6 +51,9 @@
class ostream;
#endif
class nsIUnicodeEncoder;
class nsHTMLToTXTSinkStream : public nsIHTMLContentSink {
public:
@ -117,6 +120,7 @@ protected:
void EnsureBufferSize(PRInt32 aNewSize);
void UnicodeToTXTString(const nsString& aSrc);
nsresult InitEncoder(const nsString& aCharset);
protected:
@ -127,7 +131,8 @@ protected:
char* mBuffer;
PRInt32 mBufferSize;
nsString mStrBuffer;
nsString mStrBuffer;
nsIUnicodeEncoder* mUnicodeEncoder;
};

Просмотреть файл

@ -43,6 +43,8 @@ static NS_DEFINE_IID(kClassIID, NS_XIF_DTD_IID);
static const char* kNullToken = "Error: Null token given";
static const char* kInvalidTagStackPos = "Error: invalid tag stack position";
static const char* kXIFDocHeader= "<!DOCTYPE xif>";
static const char* kXIFDocInfo= "document_info";
static const char* kXIFCharset= "charset";
struct nsXIFTagEntry {
@ -73,7 +75,10 @@ nsXIFTagEntry gXIFTagTable[] =
{"css_stylerule", eXIFTag_css_stylerule},
{"css_stylesheet", eXIFTag_css_stylesheet},
{"document_info", eXIFTag_document_info},
{"encode", eXIFTag_encode},
{"entity", eXIFTag_entity},
{"import", eXIFTag_import},
@ -343,6 +348,7 @@ nsXIFDTD::nsXIFDTD() : nsIDTD(){
mInContent=PR_FALSE;
mLowerCaseAttributes=PR_TRUE;
mLowerCaseTags=PR_TRUE;
mCharset = "";
}
/**
@ -395,15 +401,38 @@ eAutoDetectResult nsXIFDTD::CanParse(nsString& aContentType, nsString& aCommand,
if(aContentType.Equals(kXIFTextContentType)){
result=ePrimaryDetect;
}
else {
else
{
if(kNotFound!=aBuffer.Find(kXIFDocHeader)) {
PRInt32 offset = aBuffer.Find("<section>");
if (offset != -1)
aBuffer.Cut(0,offset);
aContentType= kXIFTextContentType;
result=ePrimaryDetect;
}
}
nsString charset ="ISO-8859-1";
PRInt32 offset;
offset = aBuffer.Find(kXIFDocInfo);
if(kNotFound!=offset)
{
offset = aBuffer.Find(kXIFCharset);
if (kNotFound!=offset)
{
PRInt32 start = aBuffer.Find('"',offset);
PRInt32 end = aBuffer.Find('"',start+1);
if ((start != kNotFound) && (end != kNotFound))
{
charset = "";
for (PRInt32 i = start+1; i < end; i++)
{
PRUnichar ch = aBuffer[i];
charset.Append(ch);
}
}
}
}
mCharset = charset;
return result;
}
@ -638,6 +667,11 @@ nsresult nsXIFDTD::HandleStartToken(CToken* aToken) {
result = OpenContainer(node);
break;
case eXIFTag_entity:
StartTopOfStack();
ProcessEntityTag(node);
break;
case eXIFTag_content:
StartTopOfStack();
mInContent = PR_TRUE;
@ -647,6 +681,10 @@ nsresult nsXIFDTD::HandleStartToken(CToken* aToken) {
ProcessEncodeTag(node);
break;
case eXIFTag_document_info:
ProcessDocumentInfoTag(node);
break;
case eXIFTag_attr:
AddAttribute(node);
@ -1355,8 +1393,8 @@ void nsXIFDTD::BeginStartTag(const nsIParserNode& aNode)
if (type == eXIFTag_container)
PushHTMLTag(tag,tagName);
CToken* token = new CStartToken(tagName);
nsCParserNode* node = new nsCParserNode(token);
// CToken* token = new CStartToken(tagName);
// nsCParserNode* node = new nsCParserNode(token);
PushNodeAndToken(tagName);
break;
}
@ -1629,6 +1667,38 @@ void nsXIFDTD::ProcessEncodeTag(const nsIParserNode& aNode)
}
void nsXIFDTD::ProcessEntityTag(const nsIParserNode& aNode)
{
nsString value;
if (GetAttribute(aNode,nsString("value"),value))
{
CEntityToken* entity = new CEntityToken(value);
nsCParserNode node((CToken*)entity);
mSink->AddLeaf(node);
}
}
/**
 * Handles the document_info tag. When the tag has a "charset" attribute,
 * a node named "XIF_DOC_INFO" is pushed and the charset is attached to it
 * as an attribute. (The sink streams' OpenContainer() checks for that node
 * name and passes its "charset" attribute to InitEncoder().)
 *
 * @param aNode  parser node of the document_info tag
 */
void nsXIFDTD::ProcessDocumentInfoTag(const nsIParserNode& aNode)
{
  nsString value;
  nsString key("charset");

  if (GetAttribute(aNode,key,value))
  {
    PushNodeAndToken(nsString("XIF_DOC_INFO"));
    nsIParserNode* top = PeekNode();
    if (top != nsnull)
    {
      // Create the attribute token only when there is a node to take it,
      // so it is not leaked if the node stack turns out to be empty.
      CAttributeToken* attribute = new CAttributeToken(key,value);
      ((nsCParserNode*)top)->AddAttribute(attribute);
    }
  }
}
/*** CSS Methods ****/
void nsXIFDTD::BeginCSSStyleSheet(const nsIParserNode& aNode)

Просмотреть файл

@ -71,7 +71,9 @@ enum eXIFTags
eXIFTag_css_stylesheet,
eXIFTag_doctype,
eXIFTag_encode,
eXIFTag_document_info,
eXIFTag_encode,
eXIFTag_entity,
eXIFTag_import,
eXIFTag_leaf,
eXIFTag_link,
@ -490,6 +492,8 @@ private:
private:
void ProcessEncodeTag(const nsIParserNode& aNode);
void ProcessEntityTag(const nsIParserNode& aNode);
void ProcessDocumentInfoTag(const nsIParserNode& aNode);
void BeginCSSStyleSheet(const nsIParserNode& aNode);
void EndCSSStyleSheet(const nsIParserNode& aNode);
@ -556,6 +560,7 @@ protected:
PRBool mLowerCaseTags;
PRBool mLowerCaseAttributes;
nsITokenizer* mTokenizer;
nsString mCharset;
};

Просмотреть файл

@ -36,6 +36,14 @@
#include "nsIParser.h"
#include "nsHTMLEntities.h"
#include "nsIUnicodeEncoder.h"
#include "nsICharsetAlias.h"
#include "nsIServiceManager.h"
#include "nsICharsetConverterManager.h"
static NS_DEFINE_IID(kISupportsIID, NS_ISUPPORTS_IID);
static NS_DEFINE_IID(kIContentSinkIID, NS_ICONTENT_SINK_IID);
static NS_DEFINE_IID(kIHTMLContentSinkIID, NS_IHTML_CONTENT_SINK_IID);
@ -343,6 +351,58 @@ NS_New_HTML_ContentSinkStream(nsIHTMLContentSink** aInstancePtrResult,
return it->QueryInterface(kIHTMLContentSinkIID, (void **)aInstancePtrResult);
}
/**
 * Inits the encoder instance variable for the sink based on the charset.
 *
 * The charset name is mapped to its preferred alias when the alias service
 * can do so; otherwise ISO-8859-1 is used. Resolving the alias is best
 * effort: the encoder lookup is attempted even when the alias service is
 * unavailable, because UnicodeToHTMLString() uses mUnicodeEncoder right
 * after calling this method.
 *
 * @update  gpk 4/21/99
 * @param   aCharset  name (or alias) of the charset to encode output in
 * @return  NS_xxx error result of the encoder lookup; mUnicodeEncoder is
 *          only replaced when an encoder was actually obtained
 */
nsresult nsHTMLContentSinkStream::InitEncoder(const nsString& aCharset)
{
  nsAutoString charsetName = aCharset;

  // Step 1. Resolve the preferred name for the requested charset.
  nsICharsetAlias* calias = nsnull;
  nsresult res = nsServiceManager::GetService(kCharsetAliasCID,
                                              kICharsetAliasIID,
                                              (nsISupports**)&calias);
  NS_ASSERTION( nsnull != calias, "cannot find charet alias");
  if( NS_SUCCEEDED(res) && (nsnull != calias))
  {
    res = calias->GetPreferred(aCharset, charsetName);
    nsServiceManager::ReleaseService(kCharsetAliasCID, calias);
    if(NS_FAILED(res))
    {
      // failed - unknown alias , fallback to ISO-8859-1
      charsetName = "ISO-8859-1";
    }
  }
  else
  {
    // No alias service: still fall back to ISO-8859-1 instead of giving up,
    // so that the sink does not end up without any encoder.
    charsetName = "ISO-8859-1";
  }

  // Step 2. Ask the converter manager for an encoder for that charset.
  nsICharsetConverterManager * ccm = nsnull;
  res = nsServiceManager::GetService(kCharsetConverterManagerCID,
                                     kICharsetConverterManagerIID,
                                     (nsISupports**)&ccm);
  if(NS_SUCCEEDED(res) && (nsnull != ccm))
  {
    nsIUnicodeEncoder * encoder = nsnull;
    res = ccm->GetUnicodeEncoder(&charsetName, &encoder);
    if(NS_SUCCEEDED(res) && (nsnull != encoder))
    {
      // Swap in the new encoder only once we really have one.
      NS_IF_RELEASE(mUnicodeEncoder);
      mUnicodeEncoder = encoder;
    }
    nsServiceManager::ReleaseService(kCharsetConverterManagerCID, ccm);
  }
  return res;
}
/**
* Construct a content sink stream.
* @update gess7/7/98
@ -361,6 +421,7 @@ nsHTMLContentSinkStream::nsHTMLContentSinkStream(PRBool aDoFormat,PRBool aDoHead
mDoHeader = aDoHeader;
mBuffer = nsnull;
mBufferSize = 0;
mUnicodeEncoder = nsnull;
}
/**
@ -381,6 +442,7 @@ nsHTMLContentSinkStream::nsHTMLContentSinkStream(ostream& aStream,PRBool aDoForm
mDoHeader = aDoHeader;
mBuffer = nsnull;
mBufferSize = 0;
mUnicodeEncoder = nsnull;
}
@ -443,9 +505,16 @@ void nsHTMLContentSinkStream::UnicodeToHTMLString(const nsString& aSrc)
const char* entity = nsnull;
PRUint32 offset = 0;
PRUint32 addedLength = 0;
nsAutoString data;
if (mUnicodeEncoder == nsnull)
InitEncoder("");
if (length > 0)
{
// Step 1. Convert anything that maps to character entity to
// the entity value
EnsureBufferSize(length);
for (PRInt32 i = 0; i < length; i++)
{
@ -454,29 +523,31 @@ void nsHTMLContentSinkStream::UnicodeToHTMLString(const nsString& aSrc)
entity = UnicodeToEntity(ch);
if (entity)
{
PRUint32 size = strlen(entity);
addedLength += size;
EnsureBufferSize(length+addedLength+1);
mBuffer[offset++] = '&';
mBuffer[offset] = 0;
strcat(mBuffer,entity);
PRUint32 temp = offset + size;
while (offset < temp)
{
mBuffer[offset] = tolower(mBuffer[offset]);
offset++;
}
mBuffer[offset++] = ';';
mBuffer[offset] = 0;
nsAutoString temp(entity);
temp.ToLowerCase();
data.Append('&');
data.Append(temp);
data.Append(';');
}
else if (ch < 128)
else
{
mBuffer[offset++] = (unsigned char)ch;
mBuffer[offset] = 0;
data.Append(ch);
}
}
}
// Step 2. Run the result through the converter
length = data.Length();
EnsureBufferSize(length);
PRInt32 bufferLength = mBufferSize;
mUnicodeEncoder->Reset();
nsresult result = mUnicodeEncoder->Convert(data, &length, mBuffer, &bufferLength);
mBuffer[bufferLength] = 0;
PRInt32 temp = bufferLength;
if (NS_SUCCEEDED(result))
result = mUnicodeEncoder->Finish(mBuffer,&temp);
}
}
@ -487,6 +558,7 @@ void nsHTMLContentSinkStream::UnicodeToHTMLString(const nsString& aSrc)
* @return
*/
nsHTMLContentSinkStream::~nsHTMLContentSinkStream()
{
  // The stream we were given is not ours to destroy; just drop the pointer.
  mOutput = 0;
  // Give up our reference to the charset encoder, if one was created.
  NS_IF_RELEASE(mUnicodeEncoder);
}
@ -526,8 +598,8 @@ void nsHTMLContentSinkStream::WriteAttributes(const nsIParserNode& aNode,ostream
key.ToUpperCase();
key.ToCString(mBuffer,sizeof(gBuffer)-1);
EnsureBufferSize(key.Length());
key.ToCString(mBuffer,mBufferSize);
aStream << " " << mBuffer << char(kEqual);
mColPos += 1 + strlen(mBuffer) + 1;
@ -993,7 +1065,14 @@ nsHTMLContentSinkStream::AddLeaf(const nsIParserNode& aNode, ostream& aStream){
AddStartTag(aNode,aStream);
mHTMLTagStack[--mHTMLStackPos] = eHTMLTag_unknown;
}
if (type == eHTMLTag_text)
else if (type == eHTMLTag_entity)
{
const nsString& entity = aNode.GetText();
UnicodeToHTMLString(entity);
aStream << '&' << mBuffer << ';';
mColPos += entity.Length() + 2;
}
else if (type == eHTMLTag_text)
{
const nsString& text = aNode.GetText();
if ((mDoFormat == PR_FALSE) || preformatted == PR_TRUE)
@ -1136,9 +1215,25 @@ nsHTMLContentSinkStream::AddComment(const nsIParserNode& aNode){
*/
NS_IMETHODIMP
nsHTMLContentSinkStream::OpenContainer(const nsIParserNode& aNode){
if(mOutput) {
AddStartTag(aNode,*mOutput);
// eHTMLTags tag = (eHTMLTags)aNode.GetNodeType();
if(mOutput)
{
const nsString& name = aNode.GetText();
if (name.Equals("XIF_DOC_INFO"))
{
PRInt32 count=aNode.GetAttributeCount();
for(PRInt32 i=0;i<count;i++)
{
const nsString& key=aNode.GetKeyAt(i);
const nsString& value=aNode.GetValueAt(i);
if (key.Equals("charset"))
InitEncoder(value);
}
}
else
{
AddStartTag(aNode,*mOutput);
}
}
return NS_OK;
}

Просмотреть файл

@ -53,6 +53,8 @@
class ostream;
#endif
class nsIUnicodeEncoder;
class nsHTMLContentSinkStream : public nsIHTMLContentSink {
public:
@ -135,6 +137,7 @@ protected:
void UnicodeToHTMLString(const nsString& aSrc);
nsresult InitEncoder(const nsString& aCharset);
@ -153,6 +156,8 @@ protected:
char* mBuffer;
PRInt32 mBufferSize;
nsIUnicodeEncoder* mUnicodeEncoder;
};
extern NS_HTMLPARS nsresult

Просмотреть файл

@ -33,6 +33,12 @@
#include "nsString.h"
#include "nsIParser.h"
#include "nsHTMLEntities.h"
#include "nsXIFDTD.h"
#include "nsIUnicodeEncoder.h"
#include "nsICharsetAlias.h"
#include "nsIServiceManager.h"
#include "nsICharsetConverterManager.h"
static NS_DEFINE_IID(kISupportsIID, NS_ISUPPORTS_IID);
static NS_DEFINE_IID(kIContentSinkIID, NS_ICONTENT_SINK_IID);
@ -44,6 +50,55 @@ static PRBool IsInline(eHTMLTags aTag);
static PRBool IsBlockLevel(eHTMLTags aTag);
/**
 * Inits the encoder instance variable for the sink based on the charset.
 *
 * The charset name is mapped to its preferred alias when the alias service
 * can do so; otherwise ISO-8859-1 is used. Resolving the alias is best
 * effort: the encoder lookup is attempted even when the alias service is
 * unavailable, because UnicodeToTXTString() uses mUnicodeEncoder right
 * after calling this method.
 *
 * @update  gpk 4/21/99
 * @param   aCharset  name (or alias) of the charset to encode output in
 * @return  NS_xxx error result of the encoder lookup; mUnicodeEncoder is
 *          only replaced when an encoder was actually obtained
 */
nsresult nsHTMLToTXTSinkStream::InitEncoder(const nsString& aCharset)
{
  nsAutoString charsetName = aCharset;

  // Step 1. Resolve the preferred name for the requested charset.
  nsICharsetAlias* calias = nsnull;
  nsresult res = nsServiceManager::GetService(kCharsetAliasCID,
                                              kICharsetAliasIID,
                                              (nsISupports**)&calias);
  NS_ASSERTION( nsnull != calias, "cannot find charet alias");
  if( NS_SUCCEEDED(res) && (nsnull != calias))
  {
    res = calias->GetPreferred(aCharset, charsetName);
    nsServiceManager::ReleaseService(kCharsetAliasCID, calias);
    if(NS_FAILED(res))
    {
      // failed - unknown alias , fallback to ISO-8859-1
      charsetName = "ISO-8859-1";
    }
  }
  else
  {
    // No alias service: still fall back to ISO-8859-1 instead of giving up,
    // so that the sink does not end up without any encoder.
    charsetName = "ISO-8859-1";
  }

  // Step 2. Ask the converter manager for an encoder for that charset.
  nsICharsetConverterManager * ccm = nsnull;
  res = nsServiceManager::GetService(kCharsetConverterManagerCID,
                                     kICharsetConverterManagerIID,
                                     (nsISupports**)&ccm);
  if(NS_SUCCEEDED(res) && (nsnull != ccm))
  {
    nsIUnicodeEncoder * encoder = nsnull;
    res = ccm->GetUnicodeEncoder(&charsetName, &encoder);
    if(NS_SUCCEEDED(res) && (nsnull != encoder))
    {
      // Swap in the new encoder only once we really have one.
      NS_IF_RELEASE(mUnicodeEncoder);
      mUnicodeEncoder = encoder;
    }
    nsServiceManager::ReleaseService(kCharsetConverterManagerCID, ccm);
  }
  return res;
}
@ -117,6 +172,7 @@ nsHTMLToTXTSinkStream::nsHTMLToTXTSinkStream() {
mDoOutput = PR_FALSE;
mBufferSize = 0;
mBuffer = nsnull;
mUnicodeEncoder = nsnull;
}
/**
@ -133,6 +189,7 @@ nsHTMLToTXTSinkStream::nsHTMLToTXTSinkStream(ostream& aStream) {
mDoOutput = PR_FALSE;
mBufferSize = 0;
mBuffer = nsnull;
mUnicodeEncoder = nsnull;
}
@ -145,6 +202,7 @@ nsHTMLToTXTSinkStream::nsHTMLToTXTSinkStream(ostream& aStream) {
nsHTMLToTXTSinkStream::~nsHTMLToTXTSinkStream()
{
  // Give up our reference to the charset encoder, if one was created.
  NS_IF_RELEASE(mUnicodeEncoder);
  // Free the conversion buffer.
  delete [] mBuffer;
  // The stream we were given is not ours to destroy; just drop the pointer.
  mOutput = 0;
}
@ -409,42 +467,40 @@ void nsHTMLToTXTSinkStream::EnsureBufferSize(PRInt32 aNewSize)
}
void nsHTMLToTXTSinkStream::UnicodeToTXTString(const nsString& aSrc)
{
#define CH_NBSP 160
#define CH_QUOT 34
#define CH_AMP 38
#define CH_LT 60
#define CH_GT 62
PRInt32 length = aSrc.Length();
PRUnichar ch;
const char* entity = nsnull;
PRUint32 offset = 0;
PRUint32 addedLength = 0;
nsresult result;
PRInt32 bufferLength;
if (mUnicodeEncoder == nsnull)
InitEncoder("");
if (length > 0)
{
EnsureBufferSize(length);
for (PRInt32 i = 0; i < length; i++)
{
ch = aSrc.CharAt(i);
switch (ch)
{
case CH_QUOT: ch = '"'; break;
case CH_AMP: ch = '&'; break;
case CH_GT: ch = '>'; break;
case CH_LT: ch = '<'; break;
case CH_NBSP: ch = ' '; break;
}
bufferLength = mBufferSize;
mUnicodeEncoder->Reset();
result = mUnicodeEncoder->Convert(aSrc, &length, mBuffer, &bufferLength);
mBuffer[bufferLength] = 0;
PRInt32 temp = bufferLength;
if (NS_SUCCEEDED(result))
result = mUnicodeEncoder->Finish(mBuffer,&temp);
if (ch < 128)
{
mBuffer[offset++] = (unsigned char)ch;
mBuffer[offset] = 0;
}
for (PRInt32 i = 0; i < bufferLength; i++)
{
if (mBuffer[i] == char(CH_NBSP))
mBuffer[i] = ' ';
}
}
}
@ -483,6 +539,18 @@ nsHTMLToTXTSinkStream::AddLeaf(const nsIParserNode& aNode, ostream& aStream)
mStrBuffer.Append(mBuffer);
mColPos += text.Length();
}
else if (type == eHTMLTag_entity)
{
const nsString& text = aNode.GetText();
UnicodeToTXTString(text);
PRInt32 entity = NS_EntityToUnicode(mBuffer);
if (entity < 256)
{
char ch = (char)entity;
aStream << ch;
mColPos++;
}
}
else if (type == eHTMLTag_whitespace)
{
if (PR_TRUE)
@ -551,6 +619,18 @@ NS_IMETHODIMP
nsHTMLToTXTSinkStream::OpenContainer(const nsIParserNode& aNode){
eHTMLTags type = (eHTMLTags)aNode.GetNodeType();
const nsString& name = aNode.GetText();
if (name.Equals("XIF_DOC_INFO"))
{
PRInt32 count=aNode.GetAttributeCount();
for(PRInt32 i=0;i<count;i++)
{
const nsString& key=aNode.GetKeyAt(i);
const nsString& value=aNode.GetValueAt(i);
if (key.Equals("charset"))
InitEncoder(value);
}
}
if (type == eHTMLTag_body)
mDoOutput = PR_TRUE;

Просмотреть файл

@ -51,6 +51,9 @@
class ostream;
#endif
class nsIUnicodeEncoder;
class nsHTMLToTXTSinkStream : public nsIHTMLContentSink {
public:
@ -117,6 +120,7 @@ protected:
void EnsureBufferSize(PRInt32 aNewSize);
void UnicodeToTXTString(const nsString& aSrc);
nsresult InitEncoder(const nsString& aCharset);
protected:
@ -127,7 +131,8 @@ protected:
char* mBuffer;
PRInt32 mBufferSize;
nsString mStrBuffer;
nsString mStrBuffer;
nsIUnicodeEncoder* mUnicodeEncoder;
};

Просмотреть файл

@ -43,6 +43,8 @@ static NS_DEFINE_IID(kClassIID, NS_XIF_DTD_IID);
static const char* kNullToken = "Error: Null token given";
static const char* kInvalidTagStackPos = "Error: invalid tag stack position";
static const char* kXIFDocHeader= "<!DOCTYPE xif>";
static const char* kXIFDocInfo= "document_info";
static const char* kXIFCharset= "charset";
struct nsXIFTagEntry {
@ -73,7 +75,10 @@ nsXIFTagEntry gXIFTagTable[] =
{"css_stylerule", eXIFTag_css_stylerule},
{"css_stylesheet", eXIFTag_css_stylesheet},
{"document_info", eXIFTag_document_info},
{"encode", eXIFTag_encode},
{"entity", eXIFTag_entity},
{"import", eXIFTag_import},
@ -343,6 +348,7 @@ nsXIFDTD::nsXIFDTD() : nsIDTD(){
mInContent=PR_FALSE;
mLowerCaseAttributes=PR_TRUE;
mLowerCaseTags=PR_TRUE;
mCharset = "";
}
/**
@ -395,15 +401,38 @@ eAutoDetectResult nsXIFDTD::CanParse(nsString& aContentType, nsString& aCommand,
if(aContentType.Equals(kXIFTextContentType)){
result=ePrimaryDetect;
}
else {
else
{
if(kNotFound!=aBuffer.Find(kXIFDocHeader)) {
PRInt32 offset = aBuffer.Find("<section>");
if (offset != -1)
aBuffer.Cut(0,offset);
aContentType= kXIFTextContentType;
result=ePrimaryDetect;
}
}
nsString charset ="ISO-8859-1";
PRInt32 offset;
offset = aBuffer.Find(kXIFDocInfo);
if(kNotFound!=offset)
{
offset = aBuffer.Find(kXIFCharset);
if (kNotFound!=offset)
{
PRInt32 start = aBuffer.Find('"',offset);
PRInt32 end = aBuffer.Find('"',start+1);
if ((start != kNotFound) && (end != kNotFound))
{
charset = "";
for (PRInt32 i = start+1; i < end; i++)
{
PRUnichar ch = aBuffer[i];
charset.Append(ch);
}
}
}
}
mCharset = charset;
return result;
}
@ -638,6 +667,11 @@ nsresult nsXIFDTD::HandleStartToken(CToken* aToken) {
result = OpenContainer(node);
break;
case eXIFTag_entity:
StartTopOfStack();
ProcessEntityTag(node);
break;
case eXIFTag_content:
StartTopOfStack();
mInContent = PR_TRUE;
@ -647,6 +681,10 @@ nsresult nsXIFDTD::HandleStartToken(CToken* aToken) {
ProcessEncodeTag(node);
break;
case eXIFTag_document_info:
ProcessDocumentInfoTag(node);
break;
case eXIFTag_attr:
AddAttribute(node);
@ -1355,8 +1393,8 @@ void nsXIFDTD::BeginStartTag(const nsIParserNode& aNode)
if (type == eXIFTag_container)
PushHTMLTag(tag,tagName);
CToken* token = new CStartToken(tagName);
nsCParserNode* node = new nsCParserNode(token);
// CToken* token = new CStartToken(tagName);
// nsCParserNode* node = new nsCParserNode(token);
PushNodeAndToken(tagName);
break;
}
@ -1629,6 +1667,38 @@ void nsXIFDTD::ProcessEncodeTag(const nsIParserNode& aNode)
}
void nsXIFDTD::ProcessEntityTag(const nsIParserNode& aNode)
{
nsString value;
if (GetAttribute(aNode,nsString("value"),value))
{
CEntityToken* entity = new CEntityToken(value);
nsCParserNode node((CToken*)entity);
mSink->AddLeaf(node);
}
}
/**
 * Handles the document_info tag. When the tag has a "charset" attribute,
 * a node named "XIF_DOC_INFO" is pushed and the charset is attached to it
 * as an attribute. (The sink streams' OpenContainer() checks for that node
 * name and passes its "charset" attribute to InitEncoder().)
 *
 * @param aNode  parser node of the document_info tag
 */
void nsXIFDTD::ProcessDocumentInfoTag(const nsIParserNode& aNode)
{
  nsString value;
  nsString key("charset");

  if (GetAttribute(aNode,key,value))
  {
    PushNodeAndToken(nsString("XIF_DOC_INFO"));
    nsIParserNode* top = PeekNode();
    if (top != nsnull)
    {
      // Create the attribute token only when there is a node to take it,
      // so it is not leaked if the node stack turns out to be empty.
      CAttributeToken* attribute = new CAttributeToken(key,value);
      ((nsCParserNode*)top)->AddAttribute(attribute);
    }
  }
}
/*** CSS Methods ****/
void nsXIFDTD::BeginCSSStyleSheet(const nsIParserNode& aNode)

Просмотреть файл

@ -71,7 +71,9 @@ enum eXIFTags
eXIFTag_css_stylesheet,
eXIFTag_doctype,
eXIFTag_encode,
eXIFTag_document_info,
eXIFTag_encode,
eXIFTag_entity,
eXIFTag_import,
eXIFTag_leaf,
eXIFTag_link,
@ -490,6 +492,8 @@ private:
private:
void ProcessEncodeTag(const nsIParserNode& aNode);
void ProcessEntityTag(const nsIParserNode& aNode);
void ProcessDocumentInfoTag(const nsIParserNode& aNode);
void BeginCSSStyleSheet(const nsIParserNode& aNode);
void EndCSSStyleSheet(const nsIParserNode& aNode);
@ -556,6 +560,7 @@ protected:
PRBool mLowerCaseTags;
PRBool mLowerCaseAttributes;
nsITokenizer* mTokenizer;
nsString mCharset;
};