From 889faf818b4739f94157c0117190a07928fcfc75 Mon Sep 17 00:00:00 2001 From: "sayrer%gmail.com" Date: Sat, 6 May 2006 04:13:20 +0000 Subject: [PATCH] b=325080. add atom 0.3 and better html handling to feed processor. r+a=ben --- toolkit/components/build/Makefile.in | 6 + toolkit/components/build/nsToolkitCompsCID.h | 5 + .../components/build/nsToolkitCompsModule.cpp | 14 ++ toolkit/components/feeds/public/Makefile.in | 7 +- .../feeds/public/nsIFeedTextConstruct.idl | 3 +- .../public/nsIScriptableUnescapeHTML.idl | 51 +++++ toolkit/components/feeds/src/FeedProcessor.js | 188 +++++++++++++++--- toolkit/components/feeds/src/Makefile.in | 22 +- .../feeds/src/nsScriptableUnescapeHTML.cpp | 99 +++++++++ .../feeds/src/nsScriptableUnescapeHTML.h | 49 +++++ toolkit/components/feeds/test/test.js | 1 + .../test/xml/rfc4287/entry_html_cdata.xml | 28 +++ .../feeds/test/xml/rfc4287/entry_parent.xml | 2 +- .../feeds/test/xml/rfc4287/entry_title.xml | 2 +- .../xml/rfc4287/feed_atom_rights_xhtml.xml | 2 +- .../test/xml/rfc4287/feed_rights_xhtml.xml | 2 +- .../rfc4287/feed_rights_xhtml_nested_divs.xml | 2 +- .../feeds/test/xml/rfc4287/feed_subtitle.xml | 2 +- .../test/xml/rfc4287/feed_tantek_title.xml | 46 +++++ .../test/xml/rfc4287/feed_title_xhtml.xml | 2 +- .../feeds/test/xml/rss2/feed_description.xml | 2 +- .../rss2/feed_subtitle_markup_stripped.xml | 4 +- .../test/xml/rss2/item_content_encoded.xml | 2 +- .../feeds/test/xml/rss2/item_description.xml | 2 +- .../test/xml/rss2/item_description_2.xml | 2 +- .../test/xml/rss2/item_description_cdata.xml | 2 +- .../rss2/item_description_decode_entities.xml | 21 ++ .../feeds/test/xml/rss2/item_plain_desc.xml | 2 +- 28 files changed, 525 insertions(+), 45 deletions(-) create mode 100644 toolkit/components/feeds/public/nsIScriptableUnescapeHTML.idl create mode 100644 toolkit/components/feeds/src/nsScriptableUnescapeHTML.cpp create mode 100644 toolkit/components/feeds/src/nsScriptableUnescapeHTML.h create mode 100644 
toolkit/components/feeds/test/xml/rfc4287/entry_html_cdata.xml create mode 100644 toolkit/components/feeds/test/xml/rfc4287/feed_tantek_title.xml create mode 100644 toolkit/components/feeds/test/xml/rss2/item_description_decode_entities.xml diff --git a/toolkit/components/build/Makefile.in b/toolkit/components/build/Makefile.in index a5cae4b19ca4..e4bcefaf06b5 100644 --- a/toolkit/components/build/Makefile.in +++ b/toolkit/components/build/Makefile.in @@ -86,6 +86,7 @@ REQUIRES = \ xuldoc \ alerts \ url-classifier \ + feeds \ $(NULL) EXPORTS = nsToolkitCompsCID.h @@ -98,6 +99,7 @@ LOCAL_INCLUDES = \ -I$(srcdir)/../typeaheadfind/src \ -I$(srcdir)/../alerts/src \ -I$(srcdir)/../url-classifier/src \ + -I$(srcdir)/../feeds/src \ $(NULL) SHARED_LIBRARY_LIBS = \ @@ -119,6 +121,10 @@ ifdef MOZ_URL_CLASSIFIER SHARED_LIBRARY_LIBS += $(DIST)/lib/$(LIB_PREFIX)urlclassifier_s.$(LIB_SUFFIX) endif +ifdef MOZ_FEEDS +SHARED_LIBRARY_LIBS += $(DIST)/lib/$(LIB_PREFIX)feed_s.$(LIB_SUFFIX) +endif + ifndef MOZ_SUITE # XXX Suite isn't ready to build this just yet SHARED_LIBRARY_LIBS += ../typeaheadfind/src/$(LIB_PREFIX)fastfind_s.$(LIB_SUFFIX) diff --git a/toolkit/components/build/nsToolkitCompsCID.h b/toolkit/components/build/nsToolkitCompsCID.h index a428ee5e8168..57383cfdadb9 100644 --- a/toolkit/components/build/nsToolkitCompsCID.h +++ b/toolkit/components/build/nsToolkitCompsCID.h @@ -77,6 +77,8 @@ #define NS_URLCLASSIFIERDBSERVICE_CONTRACTID \ "@mozilla.org/url-classifier/dbservice;1" +#define NS_SCRIPTABLEUNESCAPEHTML_CONTRACTID "@mozilla.org/feed-unescapehtml;1" + ///////////////////////////////////////////////////////////////////////////// // {A0CCAAF8-09DA-44D8-B250-9AC3E93C8117} @@ -120,3 +122,6 @@ #define NS_URLCLASSIFIERDBSERVICE_CID \ { 0x5eb7c3c1, 0xec1f, 0x4007, { 0x87, 0xcc, 0xee, 0xfb, 0x37, 0xd6, 0x8c, 0xe6} } + +#define NS_SCRIPTABLEUNESCAPEHTML_CID \ +{ 0x10f2f5f0, 0xf103, 0x4901, { 0x98, 0x0f, 0xba, 0x11, 0xbd, 0x70, 0xd6, 0x0d} } diff --git 
a/toolkit/components/build/nsToolkitCompsModule.cpp b/toolkit/components/build/nsToolkitCompsModule.cpp index bafdf60fef07..17280a95aac2 100644 --- a/toolkit/components/build/nsToolkitCompsModule.cpp +++ b/toolkit/components/build/nsToolkitCompsModule.cpp @@ -58,6 +58,10 @@ #include "nsUrlClassifierDBService.h" #endif +#ifdef MOZ_FEEDS +#include "nsScriptableUnescapeHTML.h" +#endif + ///////////////////////////////////////////////////////////////////////////// NS_GENERIC_FACTORY_CONSTRUCTOR_INIT(nsAppStartup, Init) @@ -82,6 +86,10 @@ NS_GENERIC_FACTORY_SINGLETON_CONSTRUCTOR(nsUrlClassifierDBService, nsUrlClassifierDBService::GetInstance) #endif +#ifdef MOZ_FEEDS +NS_GENERIC_FACTORY_CONSTRUCTOR(nsScriptableUnescapeHTML) +#endif + ///////////////////////////////////////////////////////////////////////////// static const nsModuleComponentInfo components[] = @@ -125,6 +133,12 @@ static const nsModuleComponentInfo components[] = NS_URLCLASSIFIERDBSERVICE_CONTRACTID, nsUrlClassifierDBServiceConstructor }, #endif +#ifdef MOZ_FEEDS + { "Unescape HTML", + NS_SCRIPTABLEUNESCAPEHTML_CID, + NS_SCRIPTABLEUNESCAPEHTML_CONTRACTID, + nsScriptableUnescapeHTMLConstructor }, +#endif }; NS_IMPL_NSGETMODULE(nsToolkitCompsModule, components) diff --git a/toolkit/components/feeds/public/Makefile.in b/toolkit/components/feeds/public/Makefile.in index 56c6a8d46e8b..f8f8b113cd61 100644 --- a/toolkit/components/feeds/public/Makefile.in +++ b/toolkit/components/feeds/public/Makefile.in @@ -49,6 +49,9 @@ XPIDLSRCS = nsIFeedProcessor.idl \ nsIFeedListener.idl \ nsIFeed.idl \ nsIFeedContainer.idl \ - nsIFeedEntry.idl + nsIFeedEntry.idl \ + nsIFeedTextConstruct.idl \ + nsIScriptableUnescapeHTML.idl \ + $(NULL) -include $(topsrcdir)/config/rules.mk \ No newline at end of file +include $(topsrcdir)/config/rules.mk diff --git a/toolkit/components/feeds/public/nsIFeedTextConstruct.idl b/toolkit/components/feeds/public/nsIFeedTextConstruct.idl index b042ebf953bd..1bb4602b8180 100644 --- 
a/toolkit/components/feeds/public/nsIFeedTextConstruct.idl +++ b/toolkit/components/feeds/public/nsIFeedTextConstruct.idl @@ -70,4 +70,5 @@ interface nsIFeedTextConstruct : nsISupports * The content of the text construct. */ attribute AString text; -} +}; + diff --git a/toolkit/components/feeds/public/nsIScriptableUnescapeHTML.idl b/toolkit/components/feeds/public/nsIScriptableUnescapeHTML.idl new file mode 100644 index 000000000000..f1206f5f6665 --- /dev/null +++ b/toolkit/components/feeds/public/nsIScriptableUnescapeHTML.idl @@ -0,0 +1,51 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* ***** BEGIN LICENSE BLOCK ***** +* Version: MPL 1.1/GPL 2.0/LGPL 2.1 +* +* The contents of this file are subject to the Mozilla Public License Version +* 1.1 (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* http://www.mozilla.org/MPL/ +* +* Software distributed under the License is distributed on an "AS IS" basis, +* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +* for the specific language governing rights and limitations under the +* License. +* +* The Original Code is mozilla.org code. +* +* The Initial Developer of the Original Code is Robert Sayre. +* Portions created by the Initial Developer are Copyright (C) 2006 +* the Initial Developer. All Rights Reserved. +* +* Contributor(s): +* +* Alternatively, the contents of this file may be used under the terms of +* either the GNU General Public License Version 2 or later (the "GPL"), or +* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +* in which case the provisions of the GPL or the LGPL are applicable instead +* of those above. 
If you wish to allow use of your version of this file only +* under the terms of either the GPL or the LGPL, and not to allow others to +* use your version of this file under the terms of the MPL, indicate your +* decision by deleting the provisions above and replace them with the notice +* and other provisions required by the GPL or the LGPL. If you do not delete +* the provisions above, a recipient may use your version of this file under +* the terms of any one of the MPL, the GPL or the LGPL. +* +* ***** END LICENSE BLOCK ***** */ + +#include "nsISupports.idl" + +/** + * A utility class that unescapes HTML strings. + */ +[scriptable, uuid(0ff58de6-2460-4095-9ff9-9756efedc756)] +interface nsIScriptableUnescapeHTML : nsISupports +{ + /** + * Converts all entities to Unicode. + * + * @param src The HTML string to unescape. + */ + AString unescape(in AString src); +}; diff --git a/toolkit/components/feeds/src/FeedProcessor.js b/toolkit/components/feeds/src/FeedProcessor.js index 377e50c3e528..08da9b76cdb8 100644 --- a/toolkit/components/feeds/src/FeedProcessor.js +++ b/toolkit/components/feeds/src/FeedProcessor.js @@ -48,8 +48,11 @@ const IO_CONTRACTID = "@mozilla.org/network/io-service;1" const BAG_CONTRACTID = "@mozilla.org/hash-property-bag;1" const ARRAY_CONTRACTID = "@mozilla.org/array;1"; const SAX_CONTRACTID = "@mozilla.org/saxparser/xmlreader;1"; +const UNESCAPE_CONTRACTID = "@mozilla.org/feed-unescapehtml;1"; var gIoService = Cc[IO_CONTRACTID].getService(Ci.nsIIOService); +var gUnescapeHTML = Cc[UNESCAPE_CONTRACTID]. 
+ getService(Ci.nsIScriptableUnescapeHTML); /***** Some general utils *****/ function strToURI(link, base) { @@ -80,6 +83,17 @@ function isIID(a, iid) { return rv; } +function isIFeedTextConstruct(a) { + var rv = false; + try { + a.QueryInterface(Ci.nsIFeedTextConstruct); + rv = true; + } + catch(e) { + } + return rv; +} + function isIArray(a) { return isIID(a, Ci.nsIArray); } @@ -92,6 +106,20 @@ function stripTags(someHTML) { return someHTML.replace(/<[^>]+>/g,""); } +function plainTextFromTextConstruct(textConstruct) { + if (textConstruct != null && + isIFeedTextConstruct(textConstruct)) { + var text = textConstruct.text; + if (textConstruct.type != "text") { + text = gUnescapeHTML.unescape(stripTags(text)); + } + return text; + } + + // it was not a textConstruct, just a string + return textConstruct; +} + function xmlEscape(s) { s = s.replace(/&/g, "&"); s = s.replace(/>/g, ">"); @@ -201,6 +229,7 @@ function W3CToIETFDate(dateString) { // namespace map var gNamespaces = { "http://www.w3.org/2005/Atom":"atom", + "http://purl.org/atom/ns#":"atom03", "http://purl.org/rss/1.0/modules/content/":"content", "http://purl.org/dc/elements/1.1/":"dc", "http://www.w3.org/1999/02/22-rdf-syntax-ns#":"rdf", @@ -224,7 +253,9 @@ var gKnownTextElements = ["title","link","description","language","copyright", "atom:logo", "atom:published", "atom:updated", "wfw:comment", "wfw:commentRss", "wiki:version", "wiki:status", "wiki:importance","wiki:diff", - "wiki:history","content:encoded", "atom:icon"]; + "wiki:history","content:encoded", "atom:icon", + "atom03:title", "atom03:summary", "atom03:content", + "atom03:tagline", "atom:title"]; function FeedResult() {} FeedResult.prototype = { @@ -250,17 +281,33 @@ FeedResult.prototype = { function Feed() { this._sub = null; + this._title = null; this.items = []; + this.link = null; } Feed.prototype = { subtitle: function Feed_subtitle(doStripTags) { - return doStripTags ? 
stripTags(this._sub) : this._sub; + if (this._sub == null) + return null; + + if (doStripTags) + return plainTextFromTextConstruct(this._sub); + + if (isIID(this._sub, Ci.nsIFeedTextConstruct)) + return this._sub.text; + + return this._sub; + }, + + get title() { + return plainTextFromTextConstruct(this._title); }, searchLists: { - _sub: ["description","dc:description","rss1:description","atom:subtitle"], - items: ["items","entries"], - title: ["title","rss1:title","atom:title"], + _sub: ["description","dc:description","rss1:description", + "atom03:tagline","atom:subtitle"], + items: ["items","atom03_entries","entries"], + _title: ["title","rss1:title", "atom03:title","atom:title"], link: [["link",strToURI],["rss1:link",strToURI]], categories: ["categories", "dc:subject"], cloud: ["cloud"], @@ -292,31 +339,51 @@ Feed.prototype = { function Entry() { this._summary = null; this._content = null; + this._title = null; this.fields = Cc["@mozilla.org/hash-property-bag;1"]. createInstance(Ci.nsIWritablePropertyBag2); + this.link = null; } Entry.prototype = { fields: null, + get title() { + return plainTextFromTextConstruct(this._title); + }, summary: function Entry_summary(doStripTags) { if (this._summary == null) return null; - return doStripTags ? stripTags(this._summary) : this._summary; + + if (doStripTags) + return plainTextFromTextConstruct(this._summary); + + if (isIID(this._summary, Ci.nsIFeedTextConstruct)) + return this._summary.text; + + return this._summary; }, content: function Entry_content(doStripTags) { + if (this._content == null) return null; - return doStripTags ? 
stripTags(this._content) : this._content; + + if (doStripTags) + return plainTextFromTextConstruct(this._content); + + if (isIID(this._content, Ci.nsIFeedTextConstruct)) + return this._content.text; + + return this._content; }, enclosures: null, mediaContent: null, searchLists: { - title: ["title","rss1:title","atom:title"], + _title: ["title","rss1:title","atom03:title","atom:title"], link: [["link",strToURI],["rss1:link",strToURI]], - _summary: ["description", "rss1:description", - "dc:description", "atom:summary"], - _content: ["content:encoded", "atom:content"], + _summary: ["description", "rss1:description", "dc:description", + "atom03:summary", "atom:summary"], + _content: ["content:encoded","atom03:content","atom:content"] }, normalize: function Feed_normalize() { @@ -338,7 +405,7 @@ function TextConstruct() { this.lang = null; this.base = null; this.type = "text"; - this.text = ""; + this.text = null; } TextConstruct.prototype = { @@ -625,7 +692,8 @@ ExtensionHandler.prototype = { if (this._depth == 0) { if (this._isSimple) { this._processor.returnFromExtHandler(this._uri, this._localName, - trimString(this._buf)); + trimString(this._buf), + this._attrs); } else { this._processor.returnFromExtHandler(null,null,null); @@ -694,8 +762,19 @@ function FeedProcessor() { this.listener = null; // These elements can contain (X)HTML or plain text. 
- this._textConstructs = ["atom:title", "atom:summary", "atom:rights", - "atom:content", "atom:subtitle"]; + // We keep a table here that contains their default treatment + this._textConstructs = {"atom:title":"text", + "atom:summary":"text", + "atom:rights":"text", + "atom:content":"text", + "atom:subtitle":"text", + "description":"html", + "rss1:description":"html", + "content:encoded":"html", + "atom03:title":"text", + "atom03:tagline":"text", + "atom03:summary":"text", + "atom03:content":"text"}; this._stack = []; this._trans = { @@ -707,8 +786,11 @@ function FeedProcessor() { // verify that until we hit a rss1:channel element. "rdf:RDF": new WrapperElementInfo("RDF"), - //If we hit a Atom 1.0 element, treat as Atom 1.0. + // If we hit a Atom 1.0 element, treat as Atom 1.0. "atom:feed": new FeedElementInfo("Atom", "atom"), + + // Treat as Atom 0.3 + "atom03:feed": new FeedElementInfo("Atom03", "atom03"), }, /********* RSS2 **********/ @@ -766,6 +848,21 @@ function FeedProcessor() { "atom:contributor": new ElementInfo("contributor", null, null, true), "atom:link": new ElementInfo("links", null, null, true), }, + + /********* ATOM 0.3 **********/ + "IN_ATOM03": { + "atom03:author": new ElementInfo("author", null, null, true), + "atom03:link": new ElementInfo("links", null, null, true), + "atom03:entry": new ElementInfo("atom03_entries", Cc[ENTRY_CONTRACTID], + null, true) + }, + + "IN_ATOM03_ENTRIES": { + "atom03:author": new ElementInfo("author", null, null, true), + "atom03:link": new ElementInfo("links", null, null, true), + "atom03:entry": new ElementInfo("atom03_entries", Cc[ENTRY_CONTRACTID], + null, true) + } } } @@ -920,10 +1017,10 @@ FeedProcessor.prototype = { // The Atom spec explicitly says the div is not part of the content, // and explicitly allows whitespace collapsing. 
// - if (this._result.version == "atom" && - arrayContains(this._textConstructs, key)) { - var type = attributes.getValue("","type"); - if (type == "xhtml") { + if ((this._result.version == "atom" || this._result.version == "atom03") && + this._textConstructs[key] != null) { + var type = attributes.getValueFromName("","type"); + if (type != null && type.indexOf("xhtml") >= 0) { this._xhtmlHandler = new XHTMLHandler(this, (this._result.version == "atom")); this._reader.contentHandler = this._xhtmlHandler; @@ -1119,7 +1216,8 @@ FeedProcessor.prototype = { // unknown element values are returned here. See startElement above // for how this works. - returnFromExtHandler: function FP_returnExt(uri, localName, chars) { + returnFromExtHandler: + function FP_returnExt(uri, localName, chars, attributes) { --this._depth; // take control of the SAX events @@ -1143,10 +1241,44 @@ FeedProcessor.prototype = { container = container.queryElementAt(container.length - 1, Ci.nsIWritablePropertyBag2); } - - // Assign the property + + // Make the buffer our new property var prefix = gNamespaces[uri] ? gNamespaces[uri] + ":" : ""; - container.setPropertyAsAString(prefix+localName, chars); + var propName = prefix + localName; + + // But, it could be something containing HTML. If so, + // we need to know about that. + if (this._textConstructs[propName] != null && + (this._result.version.indexOf("rss") == -1 || + this._handlerStack[this._depth].containerClass != null)) { + var newProp = Cc[TEXTCONSTRUCT_CONTRACTID]. 
+ createInstance(Ci.nsIFeedTextConstruct); + newProp.text = chars; + // Look up the default type in our table + var type = this._textConstructs[propName]; + var typeAttribute = attributes.getValueFromName("","type"); + if (this._result.version == "atom" && typeAttribute != null) { + type = typeAttribute; + } + else if (this._result.version == "atom03" && typeAttribute != null) { + if (typeAttribute.toLowerCase().indexOf("xhtml") >= 0) { + type = "xhtml"; + } + else if (typeAttribute.toLowerCase().indexOf("html") >= 0) { + type = "html"; + } + else if (typeAttribute.toLowerCase().indexOf("text") >= 0) { + type = "text"; + } + } + + newProp.type = type; + container.setPropertyAsInterface(propName, newProp); + } + else { + container.setPropertyAsAString(propName, chars); + } + }, // Sometimes, we'll hand off SAX handling duties to an XHTMLHandler @@ -1166,7 +1298,11 @@ FeedProcessor.prototype = { // Assign the property var prefix = gNamespaces[uri] ? gNamespaces[uri] + ":" : ""; - container.setPropertyAsAString(prefix + localName, chars); + var newProp = newProp = Cc[TEXTCONSTRUCT_CONTRACTID]. + createInstance(Ci.nsIFeedTextConstruct); + newProp.text = chars; + newProp.type = "xhtml"; + container.setPropertyAsInterface(prefix + localName, newProp); // XHTML will cause us to peek too far. The XHTML handler will // send us an end element to call. 
RFC4287-valid feeds allow a @@ -1288,7 +1424,7 @@ var Module = { // Entry cr.unregisterFactoryLocation(ENTRY_CLASSID, location); // Text Construct - cr.unregisterFactoryLocation(TEXTCONSTUCT_CLASSID, location); + cr.unregisterFactoryLocation(TEXTCONSTRUCT_CLASSID, location); }, canUnload: function(cm) { diff --git a/toolkit/components/feeds/src/Makefile.in b/toolkit/components/feeds/src/Makefile.in index b1704d52a05e..683f1004643b 100644 --- a/toolkit/components/feeds/src/Makefile.in +++ b/toolkit/components/feeds/src/Makefile.in @@ -42,6 +42,26 @@ VPATH = @srcdir@ include $(DEPTH)/config/autoconf.mk +MODULE = feeds +LIBRARY_NAME = feed_s +MOZILLA_INTERNAL_API = 1 +FORCE_STATIC_LIB = 1 +LIBXUL_LIBRARY = 1 + +REQUIRES = \ + xpcom \ + necko \ + string \ + js \ + dom \ + htmlparser \ + content \ + layout \ + $(NULL) + +CPPSRCS = nsScriptableUnescapeHTML.cpp \ + $(NULL) + EXTRA_COMPONENTS = FeedProcessor.js -include $(topsrcdir)/config/rules.mk \ No newline at end of file +include $(topsrcdir)/config/rules.mk diff --git a/toolkit/components/feeds/src/nsScriptableUnescapeHTML.cpp b/toolkit/components/feeds/src/nsScriptableUnescapeHTML.cpp new file mode 100644 index 000000000000..7a8a5c5debc1 --- /dev/null +++ b/toolkit/components/feeds/src/nsScriptableUnescapeHTML.cpp @@ -0,0 +1,99 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is mozilla.org code. 
+ * + * The Initial Developer of the Original Code is Robert Sayre. + * Portions created by the Initial Developer are Copyright (C) 2006 + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ + +#include "nsString.h" +#include "nsCRT.h" +#include "nsISupportsArray.h" +#include "nsIComponentManager.h" +#include "nsCOMPtr.h" +#include "nsXPCOM.h" +#include "nsISupportsPrimitives.h" +#include "nsXPIDLString.h" + +#include "nsIParser.h" +#include "nsIDTD.h" +#include "nsNetCID.h" +#include "nsNetUtil.h" +#include "nsParserCIID.h" +#include "nsParserCIID.h" +#include "nsIContentSink.h" +#include "nsIHTMLToTextSink.h" +#include "nsIDocumentEncoder.h" + +#include "nsIScriptableUnescapeHTML.h" +#include "nsScriptableUnescapeHTML.h" + +NS_IMPL_ISUPPORTS1(nsScriptableUnescapeHTML, nsIScriptableUnescapeHTML) + +static NS_DEFINE_CID(kCParserCID, NS_PARSER_CID); + +// From /widget/HTMLConverter +// +// Takes HTML and converts it to plain text but in unicode. 
+// +NS_IMETHODIMP +nsScriptableUnescapeHTML::Unescape(const nsAString & aFromStr, + nsAString & aToStr) +{ + // create the parser to do the conversion. + aToStr.SetLength(0); + nsresult rv; + nsCOMPtr parser = do_CreateInstance(kCParserCID, &rv); + if ( !parser ) + return rv; + + // convert it! + nsCOMPtr sink; + + sink = do_CreateInstance(NS_PLAINTEXTSINK_CONTRACTID); + NS_ENSURE_TRUE(sink, NS_ERROR_FAILURE); + + nsCOMPtr textSink(do_QueryInterface(sink)); + NS_ENSURE_TRUE(textSink, NS_ERROR_FAILURE); + + textSink->Initialize(&aToStr, nsIDocumentEncoder::OutputSelectionOnly + | nsIDocumentEncoder::OutputAbsoluteLinks, 0); + + parser->SetContentSink(sink); + + parser->Parse(aFromStr, 0, NS_LITERAL_CSTRING("text/html"), + PR_TRUE, eDTDMode_fragment); + + return NS_OK; +} + + + diff --git a/toolkit/components/feeds/src/nsScriptableUnescapeHTML.h b/toolkit/components/feeds/src/nsScriptableUnescapeHTML.h new file mode 100644 index 000000000000..bea14c5a2e4b --- /dev/null +++ b/toolkit/components/feeds/src/nsScriptableUnescapeHTML.h @@ -0,0 +1,49 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is mozilla.org code. + * + * The Initial Developer of the Original Code is Robert Sayre. + * Portions created by the Initial Developer are Copyright (C) 2006 + * the Initial Developer. All Rights Reserved. 
+ * + * Contributor(s): + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ + +#ifndef nsScriptableHTMLUnescape_h__ +#define nsScriptableHTMLUnescape_h__ + +#include "nsIScriptableUnescapeHTML.h" + +class nsScriptableUnescapeHTML : public nsIScriptableUnescapeHTML +{ +public: + NS_DECL_ISUPPORTS + NS_DECL_NSISCRIPTABLEUNESCAPEHTML +}; + +#endif // nsScriptableHTMLUnescape_h__ diff --git a/toolkit/components/feeds/test/test.js b/toolkit/components/feeds/test/test.js index 63ab75c4a693..9f7cfacf7bae 100644 --- a/toolkit/components/feeds/test/test.js +++ b/toolkit/components/feeds/test/test.js @@ -57,6 +57,7 @@ TestListener.prototype = { (isIID(feed, Components.interfaces.nsIFeed)); try { if(!eval(testcase.expect)){ + print(testcase.path + ": \n"); print("FAILED! 
Test was: \"" + testcase.desc + "\" |\n" + testcase.expect + '|\n'); }else{ passed += 1; diff --git a/toolkit/components/feeds/test/xml/rfc4287/entry_html_cdata.xml b/toolkit/components/feeds/test/xml/rfc4287/entry_html_cdata.xml new file mode 100644 index 000000000000..fa32f3dac8e8 --- /dev/null +++ b/toolkit/components/feeds/test/xml/rfc4287/entry_html_cdata.xml @@ -0,0 +1,28 @@ + + + +http://atomtests.philringnalda.com/tests/item/title/html-cdata.atom +Atom item title html cdata +2005-12-18T00:13:00Z + + Phil Ringnalda + http://weblog.philringnalda.com/ + + + + http://atomtests.philringnalda.com/tests/item/title/html-cdata.atom/1 + <![CDATA[<title>]]> + 2005-12-18T00:13:00Z + An item with a type="html" title consisting of a less-than +character, the word 'title' and a greater-than character, where +the character entity reference for the less-than is escaped by being +in a CDATA section. + + + + diff --git a/toolkit/components/feeds/test/xml/rfc4287/entry_parent.xml b/toolkit/components/feeds/test/xml/rfc4287/entry_parent.xml index dee638d76260..f9cc28cbe8b8 100644 --- a/toolkit/components/feeds/test/xml/rfc4287/entry_parent.xml +++ b/toolkit/components/feeds/test/xml/rfc4287/entry_parent.xml @@ -2,7 +2,7 @@ test rights' +Expect: feed.fields.getProperty('atom:rights') != null --> diff --git a/toolkit/components/feeds/test/xml/rfc4287/feed_rights_xhtml.xml b/toolkit/components/feeds/test/xml/rfc4287/feed_rights_xhtml.xml index 8bc26835044e..422c6fb49251 100644 --- a/toolkit/components/feeds/test/xml/rfc4287/feed_rights_xhtml.xml +++ b/toolkit/components/feeds/test/xml/rfc4287/feed_rights_xhtml.xml @@ -2,7 +2,7 @@ diff --git a/toolkit/components/feeds/test/xml/rfc4287/feed_rights_xhtml_nested_divs.xml b/toolkit/components/feeds/test/xml/rfc4287/feed_rights_xhtml_nested_divs.xml index 0bcca177cec4..ebad24ac3bc3 100644 --- a/toolkit/components/feeds/test/xml/rfc4287/feed_rights_xhtml_nested_divs.xml +++ 
b/toolkit/components/feeds/test/xml/rfc4287/feed_rights_xhtml_nested_divs.xml @@ -2,7 +2,7 @@ diff --git a/toolkit/components/feeds/test/xml/rfc4287/feed_subtitle.xml b/toolkit/components/feeds/test/xml/rfc4287/feed_subtitle.xml index cdbbd552a76b..268b38dd2806 100644 --- a/toolkit/components/feeds/test/xml/rfc4287/feed_subtitle.xml +++ b/toolkit/components/feeds/test/xml/rfc4287/feed_subtitle.xml @@ -2,7 +2,7 @@ diff --git a/toolkit/components/feeds/test/xml/rfc4287/feed_tantek_title.xml b/toolkit/components/feeds/test/xml/rfc4287/feed_tantek_title.xml new file mode 100644 index 000000000000..7ce4a468a6c1 --- /dev/null +++ b/toolkit/components/feeds/test/xml/rfc4287/feed_tantek_title.xml @@ -0,0 +1,46 @@ + + + + + <div xmlns="http://www.w3.org/1999/xhtml">Tantek's Updates</div> + + + + http://tantek.com/updates.atom + + Tantek + http://tantek.com/ + + 2006-05-02T20:13:00-07:00 + + 2006-04-22T00:00:00-07:00 + 2006-04-22T00:00:00-07:00 + + http://www.makezine.com/faire/ + Make Faire + + + + + \ No newline at end of file diff --git a/toolkit/components/feeds/test/xml/rfc4287/feed_title_xhtml.xml b/toolkit/components/feeds/test/xml/rfc4287/feed_title_xhtml.xml index 3a3783d93f9d..2c546a33bfe9 100644 --- a/toolkit/components/feeds/test/xml/rfc4287/feed_title_xhtml.xml +++ b/toolkit/components/feeds/test/xml/rfc4287/feed_title_xhtml.xml @@ -2,7 +2,7 @@ diff --git a/toolkit/components/feeds/test/xml/rss2/feed_description.xml b/toolkit/components/feeds/test/xml/rss2/feed_description.xml index 6d9f4c454a28..c66dcaeded09 100644 --- a/toolkit/components/feeds/test/xml/rss2/feed_description.xml +++ b/toolkit/components/feeds/test/xml/rss2/feed_description.xml @@ -2,7 +2,7 @@ diff --git a/toolkit/components/feeds/test/xml/rss2/feed_subtitle_markup_stripped.xml b/toolkit/components/feeds/test/xml/rss2/feed_subtitle_markup_stripped.xml index 8a8250175421..1808f281bbde 100644 --- a/toolkit/components/feeds/test/xml/rss2/feed_subtitle_markup_stripped.xml +++ 
b/toolkit/components/feeds/test/xml/rss2/feed_subtitle_markup_stripped.xml @@ -1,8 +1,8 @@ diff --git a/toolkit/components/feeds/test/xml/rss2/item_content_encoded.xml b/toolkit/components/feeds/test/xml/rss2/item_content_encoded.xml index 674817ecab41..de7a1c9d3718 100644 --- a/toolkit/components/feeds/test/xml/rss2/item_content_encoded.xml +++ b/toolkit/components/feeds/test/xml/rss2/item_content_encoded.xml @@ -2,7 +2,7 @@ diff --git a/toolkit/components/feeds/test/xml/rss2/item_description.xml b/toolkit/components/feeds/test/xml/rss2/item_description.xml index 91d792b11de2..c553730e4601 100644 --- a/toolkit/components/feeds/test/xml/rss2/item_description.xml +++ b/toolkit/components/feeds/test/xml/rss2/item_description.xml @@ -2,7 +2,7 @@ diff --git a/toolkit/components/feeds/test/xml/rss2/item_description_2.xml b/toolkit/components/feeds/test/xml/rss2/item_description_2.xml index 9900dab0f674..ce2cd9a05ad9 100644 --- a/toolkit/components/feeds/test/xml/rss2/item_description_2.xml +++ b/toolkit/components/feeds/test/xml/rss2/item_description_2.xml @@ -2,7 +2,7 @@ diff --git a/toolkit/components/feeds/test/xml/rss2/item_description_cdata.xml b/toolkit/components/feeds/test/xml/rss2/item_description_cdata.xml index e68a688d893c..6f50325fb52b 100644 --- a/toolkit/components/feeds/test/xml/rss2/item_description_cdata.xml +++ b/toolkit/components/feeds/test/xml/rss2/item_description_cdata.xml @@ -2,7 +2,7 @@ diff --git a/toolkit/components/feeds/test/xml/rss2/item_description_decode_entities.xml b/toolkit/components/feeds/test/xml/rss2/item_description_decode_entities.xml new file mode 100644 index 000000000000..43927bc3094e --- /dev/null +++ b/toolkit/components/feeds/test/xml/rss2/item_description_decode_entities.xml @@ -0,0 +1,21 @@ + + + + + +http://example.org +jbb@dallas.example.com (Joe Bob Briggs) +test +bar + + + <b>test D&eacute;sol&eacute;e</b> + + + + \ No newline at end of file diff --git a/toolkit/components/feeds/test/xml/rss2/item_plain_desc.xml 
b/toolkit/components/feeds/test/xml/rss2/item_plain_desc.xml index 0271d5f6fab5..61ecf47a2367 100644 --- a/toolkit/components/feeds/test/xml/rss2/item_plain_desc.xml +++ b/toolkit/components/feeds/test/xml/rss2/item_plain_desc.xml @@ -2,7 +2,7 @@