Patch from sayrer@gmail.com Change text search method, handle html/xhtml, trim whitespace bug=321036 r=annie.sullivan@gmail.com

This commit is contained in:
annie.sullivan%gmail.com 2006-01-27 21:58:25 +00:00
Родитель ce64f51e07
Коммит 670ce54d8a
2 изменённых файлов: 137 добавлений и 36 удалений

Просмотреть файл

@ -72,6 +72,7 @@ REQUIRES = xpcom \
xpconnect \
js \
nkcache \
widget \
$(NULL)
LOCAL_INCLUDES = -I$(srcdir)/../../build

Просмотреть файл

@ -21,6 +21,7 @@
*
* Contributor(s):
* Annie Sullivan <annie.sullivan@gmail.com> (modified to work with places)
* Robert Sayre <sayrer@gmail.com>
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
@ -70,6 +71,17 @@
#include "nsIDOMCharacterData.h"
#include "nsIDOMNodeList.h"
#include "nsIDOM3Node.h"
#include "nsIDOMDocumentTraversal.h"
#include "nsIDOMTreeWalker.h"
#include "nsIDOMNodeFilter.h"
#include "nsIDOMDocumentFragment.h"
#include "nsIParser.h"
#include "nsParserCIID.h"
#include "nsIFragmentContentSink.h"
#include "nsIContentSink.h"
#include "nsIDocument.h"
static NS_DEFINE_CID(kCParserCID, NS_PARSER_CID);
// These are defined in nsLivemarkService.cpp
extern nsIRDFResource *kLMRDF_type;
@ -135,7 +147,8 @@ protected:
// helpers
NS_METHOD HandleRDFItem (nsIRDFDataSource *aDS, nsIRDFResource *itemResource,
nsIRDFResource *aLinkResource, nsIRDFResource *aTitleResource);
NS_METHOD FindTextInNode (nsIDOMNode *aParentNode, nsAString &aString);
NS_METHOD FindTextInChildNodes (nsIDOMNode *aNode, nsAString &aString);
NS_METHOD ParseHTMLFragment(nsAString &aFragString, nsIDocument* aTargetDocument, nsIDOMNode **outNode);
PRBool IsLinkValid(const PRUnichar *aURI);
@ -412,44 +425,79 @@ nsLivemarkLoadListener::TryParseAsRDF ()
return NS_OK;
}
//
// find the first node below aParentNode that is a TEXT_NODE,
// and return it
//
nsresult
nsLivemarkLoadListener::FindTextInNode (nsIDOMNode *aParentNode, nsAString &aString)
nsLivemarkLoadListener::ParseHTMLFragment(nsAString &aFragString,
nsIDocument* aTargetDocument,
nsIDOMNode **outNode)
{
NS_ENSURE_ARG(aParentNode);
NS_ENSURE_ARG(aTargetDocument);
nsresult rv;
nsCOMPtr<nsIParser> parser;
parser = do_CreateInstance(kCParserCID, &rv);
NS_ENSURE_SUCCESS(rv, rv);
// create the html fragment sink
nsCOMPtr<nsIContentSink> sink;
sink = do_CreateInstance(NS_HTMLFRAGMENTSINK2_CONTRACTID);
NS_ENSURE_TRUE(sink, NS_ERROR_FAILURE);
nsCOMPtr<nsIFragmentContentSink> fragSink(do_QueryInterface(sink));
NS_ENSURE_TRUE(fragSink, NS_ERROR_FAILURE);
fragSink->SetTargetDocument(aTargetDocument);
// parse the fragment
parser->SetContentSink(sink);
parser->Parse(aFragString, (void*)0, NS_LITERAL_CSTRING("text/html"), PR_FALSE, PR_TRUE, eDTDMode_fragment);
// get the fragment node
nsCOMPtr<nsIDOMDocumentFragment> contextfrag;
rv = fragSink->GetFragment(getter_AddRefs(contextfrag));
NS_ENSURE_SUCCESS(rv, rv);
rv = CallQueryInterface (contextfrag, outNode);
return rv;
}
nsCOMPtr<nsIDOMNode> childNode;
rv = aParentNode->GetFirstChild(getter_AddRefs(childNode));
if (!childNode) return NS_OK;
// find all of the text and CDATA nodes below aNode and return them as a string
nsresult
nsLivemarkLoadListener::FindTextInChildNodes (nsIDOMNode *aNode, nsAString &aString)
{
NS_ENSURE_ARG(aNode);
nsresult rv;
nsCOMPtr<nsIDOMDocument> aDoc;
aNode->GetOwnerDocument(getter_AddRefs(aDoc));
nsCOMPtr<nsIDOMDocumentTraversal> trav = do_QueryInterface(aDoc, &rv);
if (NS_FAILED(rv)) return rv;
PRUint16 nodeType = 0;
do {
rv = childNode->GetNodeType(&nodeType);
if (nodeType == nsIDOMNode::TEXT_NODE ||
nodeType == nsIDOMNode::CDATA_SECTION_NODE)
break;
nsCOMPtr<nsIDOMNode> temp;
rv = childNode->GetNextSibling(getter_AddRefs(temp));
if (NS_FAILED(rv)) return rv;
childNode = temp;
} while (childNode);
if (nodeType == nsIDOMNode::TEXT_NODE ||
nodeType == nsIDOMNode::CDATA_SECTION_NODE)
{
nsCOMPtr<nsIDOMCharacterData> charTextNode = do_QueryInterface(childNode);
if (charTextNode)
return charTextNode->GetData(aString);
nsCOMPtr<nsIDOMTreeWalker> walker;
rv = trav->CreateTreeWalker(aNode,
nsIDOMNodeFilter::SHOW_TEXT | nsIDOMNodeFilter::SHOW_CDATA_SECTION,
nsnull, PR_TRUE, getter_AddRefs(walker));
if (NS_FAILED(rv)) return rv;
nsCOMPtr<nsIDOMNode> currentNode;
walker->GetCurrentNode(getter_AddRefs(currentNode));
nsCOMPtr<nsIDOMCharacterData> charTextNode;
nsAutoString tempString;
while (currentNode) {
charTextNode = do_QueryInterface(currentNode);
if (charTextNode) {
charTextNode->GetData(tempString);
aString.Append(tempString);
}
walker->NextNode(getter_AddRefs(currentNode));
}
if (aString.IsEmpty()) {
return NS_ERROR_FAILURE;
} else {
return NS_OK;
}
return NS_ERROR_FAILURE;
}
nsresult
@ -620,12 +668,59 @@ nsLivemarkLoadListener::TryParseAsSimpleRSS ()
rv = childNode->GetNodeName (childNname);
if (childNname.Equals(NS_LITERAL_STRING("title"))) {
rv = FindTextInNode (childNode, titleStr);
if (isAtom) {
/* Atom titles can contain HTML, so we need to find and remove it */
nsCOMPtr<nsIDOMElement> titleElem = do_QueryInterface(childNode);
if (!titleElem) break; // out of while(childNode) loop
nsAutoString titleMode; // Atom 0.3 only
nsAutoString titleType;
titleElem->GetAttribute(NS_LITERAL_STRING("type"), titleType);
titleElem->GetAttribute(NS_LITERAL_STRING("mode"), titleMode);
/* No one does this in <title> except standards pedants making test feeds,
* Atom 0.3 is deprecated, and RFC 4287 doesn't allow it
*/
if (titleMode.EqualsLiteral("base64")) {
break; // out of while(childNode) loop
}
/* Cover Atom 0.3 and RFC 4287 together */
if (titleType.EqualsLiteral("text") ||
titleType.EqualsLiteral("text/plain") ||
titleType.IsEmpty())
{
rv = FindTextInChildNodes(childNode, titleStr);
} else if (titleType.EqualsLiteral("html") || // RFC 4287
(titleType.EqualsLiteral("text/html") && // Atom 0.3
!titleMode.EqualsLiteral("xml")) ||
titleMode.EqualsLiteral("escaped")) // Atom 0.3
{
nsAutoString escapedHTMLStr;
rv = FindTextInChildNodes(childNode, escapedHTMLStr);
if (NS_FAILED(rv)) break; // out of while(childNode) loop
nsCOMPtr<nsIDOMNode> newNode;
nsCOMPtr<nsIDocument> doc = do_QueryInterface(xmldoc);
ParseHTMLFragment(escapedHTMLStr, doc, getter_AddRefs(newNode));
rv = FindTextInChildNodes(newNode, titleStr);
}else if (titleType.EqualsLiteral("xhtml") || // RFC 4287
titleType.EqualsLiteral("application/xhtml") || // Atom 0.3
titleMode.EqualsLiteral("xml")) // Atom 0.3
{
rv = FindTextInChildNodes(childNode, titleStr);
} else {
break; // out of while(childNode) loop
}
} else {
rv = FindTextInChildNodes(childNode, titleStr);
}
if (NS_FAILED(rv)) break;
} else if (childNname.Equals(NS_LITERAL_STRING("pubDate")) ||
childNname.Equals(NS_LITERAL_STRING("updated")))
{
rv = FindTextInNode (childNode, dateStr);
rv = FindTextInChildNodes (childNode, dateStr);
if (NS_FAILED(rv)) break;
} else if (!isAtom && childNname.Equals(NS_LITERAL_STRING("guid"))) {
nsCOMPtr<nsIDOMElement> linkElem = do_QueryInterface(childNode);
@ -636,7 +731,7 @@ nsLivemarkLoadListener::TryParseAsSimpleRSS ()
// Ignore failures. isPermaLink defaults to true.
if (!isPermaLink.Equals(NS_LITERAL_STRING("false"))) {
// in node's TEXT
rv = FindTextInNode (childNode, linkStr);
rv = FindTextInChildNodes (childNode, linkStr);
if (NS_FAILED(rv)) break;
}
} else if (childNname.Equals(NS_LITERAL_STRING("link"))) {
@ -672,7 +767,7 @@ nsLivemarkLoadListener::TryParseAsSimpleRSS ()
}
} else if (linkStr.IsEmpty()) {
// in node's TEXT
rv = FindTextInNode (childNode, linkStr);
rv = FindTextInChildNodes (childNode, linkStr);
if (NS_FAILED(rv)) break;
}
}
@ -687,6 +782,11 @@ nsLivemarkLoadListener::TryParseAsSimpleRSS ()
if (!childNode || NS_FAILED(rv)) break;
}
// Clean up whitespace
titleStr.CompressWhitespace();
linkStr.Trim("\b\t\r\n ");
dateStr.CompressWhitespace();
if (titleStr.IsEmpty() && !dateStr.IsEmpty())
titleStr.Assign(dateStr);