From 670ce54d8ac848b1ef517679691bb79078db5155 Mon Sep 17 00:00:00 2001 From: "annie.sullivan%gmail.com" Date: Fri, 27 Jan 2006 21:58:25 +0000 Subject: [PATCH] Patch from sayrer@gmail.com Change text search method, handle html/xhtml, trim whitespace bug=321036 r=annie.sullivan@gmail.com --- browser/components/places/src/Makefile.in | 1 + .../places/src/nsBookmarksFeedHandler.cpp | 172 ++++++++++++++---- 2 files changed, 137 insertions(+), 36 deletions(-) diff --git a/browser/components/places/src/Makefile.in b/browser/components/places/src/Makefile.in index 1f29fa6f25fa..66cc43b26e16 100644 --- a/browser/components/places/src/Makefile.in +++ b/browser/components/places/src/Makefile.in @@ -72,6 +72,7 @@ REQUIRES = xpcom \ xpconnect \ js \ nkcache \ + widget \ $(NULL) LOCAL_INCLUDES = -I$(srcdir)/../../build diff --git a/browser/components/places/src/nsBookmarksFeedHandler.cpp b/browser/components/places/src/nsBookmarksFeedHandler.cpp index 755be3dc4532..452197498196 100755 --- a/browser/components/places/src/nsBookmarksFeedHandler.cpp +++ b/browser/components/places/src/nsBookmarksFeedHandler.cpp @@ -21,6 +21,7 @@ * * Contributor(s): * Annie Sullivan (modified to work with places) + * Robert Sayre * * Alternatively, the contents of this file may be used under the terms of * either the GNU General Public License Version 2 or later (the "GPL"), or @@ -70,6 +71,17 @@ #include "nsIDOMCharacterData.h" #include "nsIDOMNodeList.h" #include "nsIDOM3Node.h" +#include "nsIDOMDocumentTraversal.h" +#include "nsIDOMTreeWalker.h" +#include "nsIDOMNodeFilter.h" +#include "nsIDOMDocumentFragment.h" +#include "nsIParser.h" +#include "nsParserCIID.h" +#include "nsIFragmentContentSink.h" +#include "nsIContentSink.h" +#include "nsIDocument.h" + +static NS_DEFINE_CID(kCParserCID, NS_PARSER_CID); // These are defined in nsLivemarkService.cpp extern nsIRDFResource *kLMRDF_type; @@ -135,7 +147,8 @@ protected: // helpers NS_METHOD HandleRDFItem (nsIRDFDataSource *aDS, nsIRDFResource *itemResource, nsIRDFResource *aLinkResource, nsIRDFResource *aTitleResource); - NS_METHOD FindTextInNode (nsIDOMNode *aParentNode, nsAString &aString); + NS_METHOD FindTextInChildNodes (nsIDOMNode *aNode, nsAString &aString); + NS_METHOD ParseHTMLFragment(nsAString &aFragString, nsIDocument* aTargetDocument, nsIDOMNode **outNode); PRBool IsLinkValid(const PRUnichar *aURI); @@ -412,44 +425,79 @@ nsLivemarkLoadListener::TryParseAsRDF () return NS_OK; } -// -// find the first node below aParentNode that is a TEXT_NODE, -// and return it -// + nsresult -nsLivemarkLoadListener::FindTextInNode (nsIDOMNode *aParentNode, nsAString &aString) +nsLivemarkLoadListener::ParseHTMLFragment(nsAString &aFragString, + nsIDocument* aTargetDocument, + nsIDOMNode **outNode) { - NS_ENSURE_ARG(aParentNode); - + NS_ENSURE_ARG(aTargetDocument); + nsresult rv; + + nsCOMPtr parser; + parser = do_CreateInstance(kCParserCID, &rv); + NS_ENSURE_SUCCESS(rv, rv); + + // create the html fragment sink + nsCOMPtr sink; + sink = do_CreateInstance(NS_HTMLFRAGMENTSINK2_CONTRACTID); + NS_ENSURE_TRUE(sink, NS_ERROR_FAILURE); + nsCOMPtr fragSink(do_QueryInterface(sink)); + NS_ENSURE_TRUE(fragSink, NS_ERROR_FAILURE); + + fragSink->SetTargetDocument(aTargetDocument); + + // parse the fragment + parser->SetContentSink(sink); + parser->Parse(aFragString, (void*)0, NS_LITERAL_CSTRING("text/html"), PR_FALSE, PR_TRUE, eDTDMode_fragment); + // get the fragment node + nsCOMPtr contextfrag; + rv = fragSink->GetFragment(getter_AddRefs(contextfrag)); + NS_ENSURE_SUCCESS(rv, rv); + rv = CallQueryInterface (contextfrag, outNode); + + return rv; +} + - nsCOMPtr childNode; - rv = aParentNode->GetFirstChild(getter_AddRefs(childNode)); - if (!childNode) return NS_OK; +// find all of the text and CDATA nodes below aNode and return them as a string +nsresult +nsLivemarkLoadListener::FindTextInChildNodes (nsIDOMNode *aNode, nsAString &aString) +{ + NS_ENSURE_ARG(aNode); + + nsresult rv; + + nsCOMPtr aDoc; + aNode->GetOwnerDocument(getter_AddRefs(aDoc)); + nsCOMPtr trav = do_QueryInterface(aDoc, &rv); if (NS_FAILED(rv)) return rv; - PRUint16 nodeType = 0; - do { - rv = childNode->GetNodeType(&nodeType); - if (nodeType == nsIDOMNode::TEXT_NODE || - nodeType == nsIDOMNode::CDATA_SECTION_NODE) - break; - - nsCOMPtr temp; - rv = childNode->GetNextSibling(getter_AddRefs(temp)); - if (NS_FAILED(rv)) return rv; - childNode = temp; - } while (childNode); - - if (nodeType == nsIDOMNode::TEXT_NODE || - nodeType == nsIDOMNode::CDATA_SECTION_NODE) - { - nsCOMPtr charTextNode = do_QueryInterface(childNode); - if (charTextNode) - return charTextNode->GetData(aString); + nsCOMPtr walker; + rv = trav->CreateTreeWalker(aNode, + nsIDOMNodeFilter::SHOW_TEXT | nsIDOMNodeFilter::SHOW_CDATA_SECTION, + nsnull, PR_TRUE, getter_AddRefs(walker)); + if (NS_FAILED(rv)) return rv; + + nsCOMPtr currentNode; + walker->GetCurrentNode(getter_AddRefs(currentNode)); + nsCOMPtr charTextNode; + nsAutoString tempString; + while (currentNode) { + charTextNode = do_QueryInterface(currentNode); + if (charTextNode) { + charTextNode->GetData(tempString); + aString.Append(tempString); + } + walker->NextNode(getter_AddRefs(currentNode)); + } + + if (aString.IsEmpty()) { + return NS_ERROR_FAILURE; + } else { + return NS_OK; } - - return NS_ERROR_FAILURE; } nsresult @@ -620,12 +668,59 @@ nsLivemarkLoadListener::TryParseAsSimpleRSS () rv = childNode->GetNodeName (childNname); if (childNname.Equals(NS_LITERAL_STRING("title"))) { - rv = FindTextInNode (childNode, titleStr); + if (isAtom) { + /* Atom titles can contain HTML, so we need to find and remove it */ + nsCOMPtr titleElem = do_QueryInterface(childNode); + if (!titleElem) break; // out of while(childNode) loop + + nsAutoString titleMode; // Atom 0.3 only + nsAutoString titleType; + titleElem->GetAttribute(NS_LITERAL_STRING("type"), titleType); + titleElem->GetAttribute(NS_LITERAL_STRING("mode"), titleMode); + + /* No one does this in except standards pedants making test feeds, + * Atom 0.3 is deprecated, and RFC 4287 doesn't allow it + */ + if (titleMode.EqualsLiteral("base64")) { + break; // out of while(childNode) loop + } + + /* Cover Atom 0.3 and RFC 4287 together */ + if (titleType.EqualsLiteral("text") || + titleType.EqualsLiteral("text/plain") || + titleType.IsEmpty()) + { + rv = FindTextInChildNodes(childNode, titleStr); + } else if (titleType.EqualsLiteral("html") || // RFC 4287 + (titleType.EqualsLiteral("text/html") && // Atom 0.3 + !titleMode.EqualsLiteral("xml")) || + titleMode.EqualsLiteral("escaped")) // Atom 0.3 + { + nsAutoString escapedHTMLStr; + rv = FindTextInChildNodes(childNode, escapedHTMLStr); + if (NS_FAILED(rv)) break; // out of while(childNode) loop + + nsCOMPtr<nsIDOMNode> newNode; + nsCOMPtr<nsIDocument> doc = do_QueryInterface(xmldoc); + ParseHTMLFragment(escapedHTMLStr, doc, getter_AddRefs(newNode)); + rv = FindTextInChildNodes(newNode, titleStr); + + }else if (titleType.EqualsLiteral("xhtml") || // RFC 4287 + titleType.EqualsLiteral("application/xhtml") || // Atom 0.3 + titleMode.EqualsLiteral("xml")) // Atom 0.3 + { + rv = FindTextInChildNodes(childNode, titleStr); + } else { + break; // out of while(childNode) loop + } + } else { + rv = FindTextInChildNodes(childNode, titleStr); + } if (NS_FAILED(rv)) break; } else if (childNname.Equals(NS_LITERAL_STRING("pubDate")) || childNname.Equals(NS_LITERAL_STRING("updated"))) { - rv = FindTextInNode (childNode, dateStr); + rv = FindTextInChildNodes (childNode, dateStr); if (NS_FAILED(rv)) break; } else if (!isAtom && childNname.Equals(NS_LITERAL_STRING("guid"))) { nsCOMPtr<nsIDOMElement> linkElem = do_QueryInterface(childNode); @@ -636,7 +731,7 @@ nsLivemarkLoadListener::TryParseAsSimpleRSS () // Ignore failures. isPermaLink defaults to true. if (!isPermaLink.Equals(NS_LITERAL_STRING("false"))) { // in node's TEXT - rv = FindTextInNode (childNode, linkStr); + rv = FindTextInChildNodes (childNode, linkStr); if (NS_FAILED(rv)) break; } } else if (childNname.Equals(NS_LITERAL_STRING("link"))) { @@ -672,7 +767,7 @@ nsLivemarkLoadListener::TryParseAsSimpleRSS () } } else if (linkStr.IsEmpty()) { // in node's TEXT - rv = FindTextInNode (childNode, linkStr); + rv = FindTextInChildNodes (childNode, linkStr); if (NS_FAILED(rv)) break; } } @@ -687,6 +782,11 @@ nsLivemarkLoadListener::TryParseAsSimpleRSS () if (!childNode || NS_FAILED(rv)) break; } + // Clean up whitespace + titleStr.CompressWhitespace(); + linkStr.Trim("\b\t\r\n "); + dateStr.CompressWhitespace(); + if (titleStr.IsEmpty() && !dateStr.IsEmpty()) titleStr.Assign(dateStr);