/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ /* This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ /* * nsIContentSerializer implementation that can be used with an * nsIDocumentEncoder to convert a DOM into plaintext in a nice way * (eg for copy/paste as plaintext). */ #include "nsPlainTextSerializer.h" #include "nsLWBrkCIID.h" #include "nsIServiceManager.h" #include "nsGkAtoms.h" #include "nsNameSpaceManager.h" #include "nsTextFragment.h" #include "nsContentUtils.h" #include "nsReadableUtils.h" #include "nsUnicharUtils.h" #include "nsCRT.h" #include "mozilla/dom/Element.h" #include "mozilla/Preferences.h" #include "mozilla/BinarySearch.h" #include "nsComputedDOMStyle.h" #include "nsIDocument.h" using namespace mozilla; using namespace mozilla::dom; #define PREF_STRUCTS "converter.html2txt.structs" #define PREF_HEADER_STRATEGY "converter.html2txt.header_strategy" static const int32_t kTabSize=4; static const int32_t kIndentSizeHeaders = 2; /* Indention of h1, if mHeaderStrategy = 1 or = 2. Indention of other headers is derived from that. XXX center h1? */ static const int32_t kIndentIncrementHeaders = 2; /* If mHeaderStrategy = 1, indent h(x+1) this many columns more than h(x) */ static const int32_t kIndentSizeList = kTabSize; // Indention of non-first lines of ul and ol static const int32_t kIndentSizeDD = kTabSize; // Indention of
, etc. I mean it! Don't make me smack you! return NS_OK; } if (mTagStackIndex < TagStackSize) { mTagStack[mTagStackIndex++] = aTag; } if (mIgnoreAboveIndex != (uint32_t)kNotFound) { return NS_OK; } // Reset this so thatdoesn't affect the whitespace // above randoms below it. mHasWrittenCiteBlockquote = mHasWrittenCiteBlockquote && aTag == nsGkAtoms::pre; bool isInCiteBlockquote = false; // XXX special-caseso that we don't add additional // newlines before the text. if (aTag == nsGkAtoms::blockquote) { nsAutoString value; nsresult rv = GetAttributeValue(nsGkAtoms::type, value); isInCiteBlockquote = NS_SUCCEEDED(rv) && value.EqualsIgnoreCase("cite"); } if (mLineBreakDue && !isInCiteBlockquote) EnsureVerticalSpace(mFloatingLines); // Check if this tag's content that should not be output if ((aTag == nsGkAtoms::noscript && !(mFlags & nsIDocumentEncoder::OutputNoScriptContent)) || ((aTag == nsGkAtoms::iframe || aTag == nsGkAtoms::noframes) && !(mFlags & nsIDocumentEncoder::OutputNoFramesContent))) { // Ignore everything that follows the current tag in // question until a matching end tag is encountered. mIgnoreAboveIndex = mTagStackIndex - 1; return NS_OK; } if (aTag == nsGkAtoms::body) { // Try to figure out here whether we have a // preformatted style attribute. // // Trigger on the presence of a "pre-wrap" in the // style attribute. That's a very simplistic way to do // it, but better than nothing. // Also set mWrapColumn to the value given there // (which arguably we should only do if told to do so). nsAutoString style; int32_t whitespace; if (NS_SUCCEEDED(GetAttributeValue(nsGkAtoms::style, style)) && (kNotFound != (whitespace = style.Find("white-space:")))) { if (kNotFound != style.Find("pre-wrap", true, whitespace)) { #ifdef DEBUG_preformatted printf("Set mPreFormatted based on style pre-wrap\n"); #endif mPreFormatted = true; int32_t widthOffset = style.Find("width:"); if (widthOffset >= 0) { // We have to search for the ch before the semicolon, // not for the semicolon itself, because nsString::ToInteger() // considers 'c' to be a valid numeric char (even if radix=10) // but then gets confused if it sees it next to the number // when the radix specified was 10, and returns an error code. int32_t semiOffset = style.Find("ch", false, widthOffset+6); int32_t length = (semiOffset > 0 ? semiOffset - widthOffset - 6 : style.Length() - widthOffset); nsAutoString widthstr; style.Mid(widthstr, widthOffset+6, length); nsresult err; int32_t col = widthstr.ToInteger(&err); if (NS_SUCCEEDED(err)) { mWrapColumn = (uint32_t)col; #ifdef DEBUG_preformatted printf("Set wrap column to %d based on style\n", mWrapColumn); #endif } } } else if (kNotFound != style.Find("pre", true, whitespace)) { #ifdef DEBUG_preformatted printf("Set mPreFormatted based on style pre\n"); #endif mPreFormatted = true; mWrapColumn = 0; } } else { /* See comment at end of function. */ mInWhitespace = true; mPreFormatted = false; } return NS_OK; } // Keep this in sync with DoCloseContainer! if (!DoOutput()) { return NS_OK; } if (aTag == nsGkAtoms::p) EnsureVerticalSpace(1); else if (aTag == nsGkAtoms::pre) { if (GetLastBool(mIsInCiteBlockquote)) EnsureVerticalSpace(0); else if (mHasWrittenCiteBlockquote) { EnsureVerticalSpace(0); mHasWrittenCiteBlockquote = false; } else EnsureVerticalSpace(1); } else if (aTag == nsGkAtoms::tr) { PushBool(mHasWrittenCellsForRow, false); } else if (aTag == nsGkAtoms::td || aTag == nsGkAtoms::th) { // We must make sure that the content of two table cells get a // space between them. // To make the separation between cells most obvious and // importable, we use a TAB. if (GetLastBool(mHasWrittenCellsForRow)) { // Bypass |Write| so that the TAB isn't compressed away. AddToLine(MOZ_UTF16("\t"), 1); mInWhitespace = true; } else if (mHasWrittenCellsForRow.IsEmpty()) { // We don't always see a(nor a ) before the
if we're // copying part of a table PushBool(mHasWrittenCellsForRow, true); // will never be popped } else { SetLastBool(mHasWrittenCellsForRow, true); } } else if (aTag == nsGkAtoms::ul) { // Indent here to support nested lists, which aren't included in li :-( EnsureVerticalSpace(mULCount + mOLStackIndex == 0 ? 1 : 0); // Must end the current line before we change indention mIndent += kIndentSizeList; mULCount++; } else if (aTag == nsGkAtoms::ol) { EnsureVerticalSpace(mULCount + mOLStackIndex == 0 ? 1 : 0); if (mFlags & nsIDocumentEncoder::OutputFormatted) { // Must end the current line before we change indention if (mOLStackIndex < OLStackSize) { nsAutoString startAttr; int32_t startVal = 1; if (NS_SUCCEEDED(GetAttributeValue(nsGkAtoms::start, startAttr))) { nsresult rv = NS_OK; startVal = startAttr.ToInteger(&rv); if (NS_FAILED(rv)) startVal = 1; } mOLStack[mOLStackIndex++] = startVal; } } else { mOLStackIndex++; } mIndent += kIndentSizeList; // see ul } else if (aTag == nsGkAtoms::li && (mFlags & nsIDocumentEncoder::OutputFormatted)) { if (mTagStackIndex > 1 && IsInOL()) { if (mOLStackIndex > 0) { nsAutoString valueAttr; if (NS_SUCCEEDED(GetAttributeValue(nsGkAtoms::value, valueAttr))) { nsresult rv = NS_OK; int32_t valueAttrVal = valueAttr.ToInteger(&rv); if (NS_SUCCEEDED(rv)) mOLStack[mOLStackIndex-1] = valueAttrVal; } // This is what nsBulletFrame does for OLs: mInIndentString.AppendInt(mOLStack[mOLStackIndex-1]++, 10); } else { mInIndentString.Append(char16_t('#')); } mInIndentString.Append(char16_t('.')); } else { static char bulletCharArray[] = "*o+#"; uint32_t index = mULCount > 0 ? (mULCount - 1) : 3; char bulletChar = bulletCharArray[index % 4]; mInIndentString.Append(char16_t(bulletChar)); } mInIndentString.Append(char16_t(' ')); } else if (aTag == nsGkAtoms::dl) { EnsureVerticalSpace(1); } else if (aTag == nsGkAtoms::dt) { EnsureVerticalSpace(0); } else if (aTag == nsGkAtoms::dd) { EnsureVerticalSpace(0); mIndent += kIndentSizeDD; } else if (aTag == nsGkAtoms::span) { ++mSpanLevel; } else if (aTag == nsGkAtoms::blockquote) { // Push PushBool(mIsInCiteBlockquote, isInCiteBlockquote); if (isInCiteBlockquote) { EnsureVerticalSpace(0); mCiteQuoteLevel++; } else { EnsureVerticalSpace(1); mIndent += kTabSize; // Check for some maximum value? } } else if (aTag == nsGkAtoms::q) { Write(NS_LITERAL_STRING("\"")); } // Else make sure we'll separate block level tags, // even if we're about to leave, before doing any other formatting. else if (nsContentUtils::IsHTMLBlock(aTag)) { EnsureVerticalSpace(0); } ////////////////////////////////////////////////////////////// if (!(mFlags & nsIDocumentEncoder::OutputFormatted)) { return NS_OK; } ////////////////////////////////////////////////////////////// // The rest of this routine is formatted output stuff, // which we should skip if we're not formatted: ////////////////////////////////////////////////////////////// // Push on stack bool currentNodeIsConverted = IsCurrentNodeConverted(); if (aTag == nsGkAtoms::h1 || aTag == nsGkAtoms::h2 || aTag == nsGkAtoms::h3 || aTag == nsGkAtoms::h4 || aTag == nsGkAtoms::h5 || aTag == nsGkAtoms::h6) { EnsureVerticalSpace(2); if (mHeaderStrategy == 2) { // numbered mIndent += kIndentSizeHeaders; // Caching int32_t level = HeaderLevel(aTag); // Increase counter for current level mHeaderCounter[level]++; // Reset all lower levels int32_t i; for (i = level + 1; i <= 6; i++) { mHeaderCounter[i] = 0; } // Construct numbers nsAutoString leadup; for (i = 1; i <= level; i++) { leadup.AppendInt(mHeaderCounter[i]); leadup.Append(char16_t('.')); } leadup.Append(char16_t(' ')); Write(leadup); } else if (mHeaderStrategy == 1) { // indent increasingly mIndent += kIndentSizeHeaders; for (int32_t i = HeaderLevel(aTag); i > 1; i--) { // for h(x), run x-1 times mIndent += kIndentIncrementHeaders; } } } else if (aTag == nsGkAtoms::a && !currentNodeIsConverted) { nsAutoString url; if (NS_SUCCEEDED(GetAttributeValue(nsGkAtoms::href, url)) && !url.IsEmpty()) { mURL = url; } } else if (aTag == nsGkAtoms::sup && mStructs && !currentNodeIsConverted) { Write(NS_LITERAL_STRING("^")); } else if (aTag == nsGkAtoms::sub && mStructs && !currentNodeIsConverted) { Write(NS_LITERAL_STRING("_")); } else if (aTag == nsGkAtoms::code && mStructs && !currentNodeIsConverted) { Write(NS_LITERAL_STRING("|")); } else if ((aTag == nsGkAtoms::strong || aTag == nsGkAtoms::b) && mStructs && !currentNodeIsConverted) { Write(NS_LITERAL_STRING("*")); } else if ((aTag == nsGkAtoms::em || aTag == nsGkAtoms::i) && mStructs && !currentNodeIsConverted) { Write(NS_LITERAL_STRING("/")); } else if (aTag == nsGkAtoms::u && mStructs && !currentNodeIsConverted) { Write(NS_LITERAL_STRING("_")); } /* Container elements are always block elements, so we shouldn't output any whitespace immediately after the container tag even if there's extra whitespace there because the HTML is pretty-printed or something. To ensure that happens, tell the serializer we're already in whitespace so it won't output more. */ mInWhitespace = true; return NS_OK; } nsresult nsPlainTextSerializer::DoCloseContainer(nsIAtom* aTag) { if (ShouldReplaceContainerWithPlaceholder(mElement->Tag())) { mIgnoredChildNodeLevel--; return NS_OK; } if (mFlags & nsIDocumentEncoder::OutputRaw) { // Raw means raw. Don't even think about doing anything fancy // here like indenting, adding line breaks or any other // characters such as list item bullets, quote characters // around , etc. I mean it! Don't make me smack you! return NS_OK; } if (mTagStackIndex > 0) { --mTagStackIndex; } if (mTagStackIndex >= mIgnoreAboveIndex) { if (mTagStackIndex == mIgnoreAboveIndex) { // We're dealing with the close tag whose matching // open tag had set the mIgnoreAboveIndex value. // Reset mIgnoreAboveIndex before discarding this tag. mIgnoreAboveIndex = (uint32_t)kNotFound; } return NS_OK; } // End current line if we're ending a block level tag if ((aTag == nsGkAtoms::body) || (aTag == nsGkAtoms::html)) { // We want the output to end with a new line, // but in preformatted areas like text fields, // we can't emit newlines that weren't there. // So add the newline only in the case of formatted output. if (mFlags & nsIDocumentEncoder::OutputFormatted) { EnsureVerticalSpace(0); } else { FlushLine(); } // We won't want to do anything with these in formatted mode either, // so just return now: return NS_OK; } // Keep this in sync with DoOpenContainer! if (!DoOutput()) { return NS_OK; } if (aTag == nsGkAtoms::tr) { PopBool(mHasWrittenCellsForRow); // Should always end a line, but get no more whitespace if (mFloatingLines < 0) mFloatingLines = 0; mLineBreakDue = true; } else if (((aTag == nsGkAtoms::li) || (aTag == nsGkAtoms::dt)) && (mFlags & nsIDocumentEncoder::OutputFormatted)) { // Items that should always end a line, but get no more whitespace if (mFloatingLines < 0) mFloatingLines = 0; mLineBreakDue = true; } else if (aTag == nsGkAtoms::pre) { mFloatingLines = GetLastBool(mIsInCiteBlockquote) ? 0 : 1; mLineBreakDue = true; } else if (aTag == nsGkAtoms::ul) { FlushLine(); mIndent -= kIndentSizeList; if (--mULCount + mOLStackIndex == 0) { mFloatingLines = 1; mLineBreakDue = true; } } else if (aTag == nsGkAtoms::ol) { FlushLine(); // Doing this after decreasing OLStackIndex would be wrong. mIndent -= kIndentSizeList; NS_ASSERTION(mOLStackIndex, "Wrong OLStack level!"); mOLStackIndex--; if (mULCount + mOLStackIndex == 0) { mFloatingLines = 1; mLineBreakDue = true; } } else if (aTag == nsGkAtoms::dl) { mFloatingLines = 1; mLineBreakDue = true; } else if (aTag == nsGkAtoms::dd) { FlushLine(); mIndent -= kIndentSizeDD; } else if (aTag == nsGkAtoms::span) { NS_ASSERTION(mSpanLevel, "Span level will be negative!"); --mSpanLevel; } else if (aTag == nsGkAtoms::div) { if (mFloatingLines < 0) mFloatingLines = 0; mLineBreakDue = true; } else if (aTag == nsGkAtoms::blockquote) { FlushLine(); // Is this needed? // Pop bool isInCiteBlockquote = PopBool(mIsInCiteBlockquote); if (isInCiteBlockquote) { NS_ASSERTION(mCiteQuoteLevel, "CiteQuote level will be negative!"); mCiteQuoteLevel--; mFloatingLines = 0; mHasWrittenCiteBlockquote = true; } else { mIndent -= kTabSize; mFloatingLines = 1; } mLineBreakDue = true; } else if (aTag == nsGkAtoms::q) { Write(NS_LITERAL_STRING("\"")); } else if (nsContentUtils::IsHTMLBlock(aTag) && aTag != nsGkAtoms::script) { // All other blocks get 1 vertical space after them // in formatted mode, otherwise 0. // This is hard. Sometimes 0 is a better number, but // how to know? if (mFlags & nsIDocumentEncoder::OutputFormatted) EnsureVerticalSpace(1); else { if (mFloatingLines < 0) mFloatingLines = 0; mLineBreakDue = true; } } ////////////////////////////////////////////////////////////// if (!(mFlags & nsIDocumentEncoder::OutputFormatted)) { return NS_OK; } ////////////////////////////////////////////////////////////// // The rest of this routine is formatted output stuff, // which we should skip if we're not formatted: ////////////////////////////////////////////////////////////// // Pop the currentConverted stack bool currentNodeIsConverted = IsCurrentNodeConverted(); if (aTag == nsGkAtoms::h1 || aTag == nsGkAtoms::h2 || aTag == nsGkAtoms::h3 || aTag == nsGkAtoms::h4 || aTag == nsGkAtoms::h5 || aTag == nsGkAtoms::h6) { if (mHeaderStrategy) { /*numbered or indent increasingly*/ mIndent -= kIndentSizeHeaders; } if (mHeaderStrategy == 1 /*indent increasingly*/ ) { for (int32_t i = HeaderLevel(aTag); i > 1; i--) { // for h(x), run x-1 times mIndent -= kIndentIncrementHeaders; } } EnsureVerticalSpace(1); } else if (aTag == nsGkAtoms::a && !currentNodeIsConverted && !mURL.IsEmpty()) { nsAutoString temp; temp.AssignLiteral(" <"); temp += mURL; temp.Append(char16_t('>')); Write(temp); mURL.Truncate(); } else if ((aTag == nsGkAtoms::sup || aTag == nsGkAtoms::sub) && mStructs && !currentNodeIsConverted) { Write(kSpace); } else if (aTag == nsGkAtoms::code && mStructs && !currentNodeIsConverted) { Write(NS_LITERAL_STRING("|")); } else if ((aTag == nsGkAtoms::strong || aTag == nsGkAtoms::b) && mStructs && !currentNodeIsConverted) { Write(NS_LITERAL_STRING("*")); } else if ((aTag == nsGkAtoms::em || aTag == nsGkAtoms::i) && mStructs && !currentNodeIsConverted) { Write(NS_LITERAL_STRING("/")); } else if (aTag == nsGkAtoms::u && mStructs && !currentNodeIsConverted) { Write(NS_LITERAL_STRING("_")); } return NS_OK; } bool nsPlainTextSerializer::MustSuppressLeaf() { if (mIgnoredChildNodeLevel > 0) { return true; } if ((mTagStackIndex > 1 && mTagStack[mTagStackIndex-2] == nsGkAtoms::select) || (mTagStackIndex > 0 && mTagStack[mTagStackIndex-1] == nsGkAtoms::select)) { // Don't output the contents of SELECT elements; // Might be nice, eventually, to output just the selected element. // Read more in bug 31994. return true; } if (mTagStackIndex > 0 && (mTagStack[mTagStackIndex-1] == nsGkAtoms::script || mTagStack[mTagStackIndex-1] == nsGkAtoms::style)) { // Don't output the contents of