gecko-dev/netwerk/streamconv/converters/mozTXTToHTMLConv.cpp

/* -*- Mode: C; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "mozTXTToHTMLConv.h"
#include "nsNetUtil.h"
#include "nsUnicharUtils.h"
#include "nsCRT.h"
#include "nsIExternalProtocolHandler.h"
#include "nsIIOService.h"

#include <algorithm>

#ifdef DEBUG_BenB_Perf
#include "prtime.h"
#include "prinrval.h"
#endif

const double growthRate = 1.2;

// Bug 183111, editor now replaces multiple spaces with leading
// 0xA0's and a single ending space, so need to treat 0xA0's as spaces.
// 0xA0 is the Latin1/Unicode character for "non-breaking space (nbsp)"
// Also recognize the Japanese ideographic space 0x3000 as a space.
static inline bool IsSpace(const char16_t aChar)
{
  return (nsCRT::IsAsciiSpace(aChar) || aChar == 0xA0 || aChar == 0x3000);
}

// Escape Char will take ch, escape it and append the result to
// aStringToAppendTo
void
mozTXTToHTMLConv::EscapeChar(const char16_t ch, nsString& aStringToAppendTo,
                             bool inAttribute)
{
    switch (ch)
    {
    case '<':
      aStringToAppendTo.AppendLiteral("&lt;");
      break;
    case '>':
      aStringToAppendTo.AppendLiteral("&gt;");
      break;
    case '&':
      aStringToAppendTo.AppendLiteral("&amp;");
      break;
    case '"':
      if (inAttribute)
      {
        aStringToAppendTo.AppendLiteral("&quot;");
        break;
      }
      // else fall through
    default:
      aStringToAppendTo += ch;
    }

    return;
}

// EscapeStr takes the passed in string and
// escapes it IN PLACE.
void
mozTXTToHTMLConv::EscapeStr(nsString& aInString, bool inAttribute)
{
  // the replace substring routines
  // don't seem to work if you have a character
  // in the in string that is also in the replacement
  // string! =(
  //aInString.ReplaceSubstring("&", "&amp;");
  //aInString.ReplaceSubstring("<", "&lt;");
  //aInString.ReplaceSubstring(">", "&gt;");
  for (uint32_t i = 0; i < aInString.Length();)
  {
    switch (aInString[i])
    {
    case '<':
      aInString.Cut(i, 1);
      aInString.Insert(NS_LITERAL_STRING("&lt;"), i);
      i += 4; // skip past the integers we just added
      break;
    case '>':
      aInString.Cut(i, 1);
      aInString.Insert(NS_LITERAL_STRING("&gt;"), i);
      i += 4; // skip past the integers we just added
      break;
    case '&':
      aInString.Cut(i, 1);
      aInString.Insert(NS_LITERAL_STRING("&amp;"), i);
      i += 5; // skip past the integers we just added
      break;
    case '"':
      if (inAttribute)
      {
        aInString.Cut(i, 1);
        aInString.Insert(NS_LITERAL_STRING("&quot;"), i);
        i += 6;
        break;
      }
      // else fall through
    default:
      i++;
    }
  }
}

void
mozTXTToHTMLConv::UnescapeStr(const char16_t * aInString, int32_t aStartPos, int32_t aLength, nsString& aOutString)
{
  const char16_t * subString = nullptr;
  for (uint32_t i = aStartPos; int32_t(i) - aStartPos < aLength;)
  {
    int32_t remainingChars = i - aStartPos;
    if (aInString[i] == '&')
    {
      subString = &aInString[i];
      if (!nsCRT::strncmp(subString, MOZ_UTF16("&lt;"), std::min(4, aLength - remainingChars)))
      {
        aOutString.Append(char16_t('<'));
        i += 4;
      }
      else if (!nsCRT::strncmp(subString, MOZ_UTF16("&gt;"), std::min(4, aLength - remainingChars)))
      {
        aOutString.Append(char16_t('>'));
        i += 4;
      }
      else if (!nsCRT::strncmp(subString, MOZ_UTF16("&amp;"), std::min(5, aLength - remainingChars)))
      {
        aOutString.Append(char16_t('&'));
        i += 5;
      }
      else if (!nsCRT::strncmp(subString, MOZ_UTF16("&quot;"), std::min(6, aLength - remainingChars)))
      {
        aOutString.Append(char16_t('"'));
        i += 6;
      }
      else
      {
        aOutString += aInString[i];
        i++;
      }
    }
    else
    {
      aOutString += aInString[i];
      i++;
    }
  }
}

void
mozTXTToHTMLConv::CompleteAbbreviatedURL(const char16_t * aInString, int32_t aInLength,
                                         const uint32_t pos, nsString& aOutString)
{
  NS_ASSERTION(int32_t(pos) < aInLength, "bad args to CompleteAbbreviatedURL, see bug #190851");
  if (int32_t(pos) >= aInLength)
    return;

  if (aInString[pos] == '@')
  {
    // only pre-pend a mailto url if the string contains a .domain in it..
    //i.e. we want to linkify johndoe@foo.com but not "let's meet @8pm"
    nsDependentString inString(aInString, aInLength);
    if (inString.FindChar('.', pos) != kNotFound) // if we have a '.' after the @ sign....
    {
      aOutString.AssignLiteral("mailto:");
      aOutString += aInString;
    }
  }
  else if (aInString[pos] == '.')
  {
    if (ItMatchesDelimited(aInString, aInLength,
                           MOZ_UTF16("www."), 4, LT_IGNORE, LT_IGNORE))
    {
      aOutString.AssignLiteral("http://");
      aOutString += aInString;
    }
    else if (ItMatchesDelimited(aInString,aInLength, MOZ_UTF16("ftp."), 4, LT_IGNORE, LT_IGNORE))
    {
      aOutString.AssignLiteral("ftp://");
      aOutString += aInString;
    }
  }
}

bool
mozTXTToHTMLConv::FindURLStart(const char16_t * aInString, int32_t aInLength,
                               const uint32_t pos, const modetype check,
                               uint32_t& start)
{
  switch(check)
  { // no breaks, because end of blocks is never reached
  case RFC1738:
  {
    if (!nsCRT::strncmp(&aInString[std::max(int32_t(pos - 4), 0)], MOZ_UTF16("<URL:"), 5))
    {
      start = pos + 1;
      return true;
    }
    else
      return false;
  }
  case RFC2396E:
  {
    nsString temp(aInString, aInLength);
    int32_t i = pos <= 0 ? kNotFound : temp.RFindCharInSet(MOZ_UTF16("<>\""), pos - 1);
    if (i != kNotFound && (temp[uint32_t(i)] == '<' ||
                           temp[uint32_t(i)] == '"'))
    {
      start = uint32_t(++i);
      return start < pos;
    }
    else
      return false;
  }
  case freetext:
  {
    int32_t i = pos - 1;
    for (; i >= 0 && (
         nsCRT::IsAsciiAlpha(aInString[uint32_t(i)]) ||
         nsCRT::IsAsciiDigit(aInString[uint32_t(i)]) ||
         aInString[uint32_t(i)] == '+' ||
         aInString[uint32_t(i)] == '-' ||
         aInString[uint32_t(i)] == '.'
         ); i--)
      ;
    if (++i >= 0 && uint32_t(i) < pos && nsCRT::IsAsciiAlpha(aInString[uint32_t(i)]))
    {
      start = uint32_t(i);
      return true;
    }
    else
      return false;
  }
  case abbreviated:
  {
    int32_t i = pos - 1;
    // This disallows non-ascii-characters for email.
    // Currently correct, but revisit later after standards changed.
    bool isEmail = aInString[pos] == (char16_t)'@';
    // These chars mark the start of the URL
    for (; i >= 0
             && aInString[uint32_t(i)] != '>' && aInString[uint32_t(i)] != '<'
             && aInString[uint32_t(i)] != '"' && aInString[uint32_t(i)] != '\''
             && aInString[uint32_t(i)] != '`' && aInString[uint32_t(i)] != ','
             && aInString[uint32_t(i)] != '{' && aInString[uint32_t(i)] != '['
             && aInString[uint32_t(i)] != '(' && aInString[uint32_t(i)] != '|'
             && aInString[uint32_t(i)] != '\\'
             && !IsSpace(aInString[uint32_t(i)])
             && (!isEmail || nsCRT::IsAscii(aInString[uint32_t(i)]))
         ; i--)
      ;
    if
      (
        ++i >= 0 && uint32_t(i) < pos
          &&
          (
            nsCRT::IsAsciiAlpha(aInString[uint32_t(i)]) ||
            nsCRT::IsAsciiDigit(aInString[uint32_t(i)])
          )
      )
    {
      start = uint32_t(i);
      return true;
    }
    else
      return false;
  }
  default:
    return false;
  } //switch
}

bool
mozTXTToHTMLConv::FindURLEnd(const char16_t * aInString, int32_t aInStringLength, const uint32_t pos,
           const modetype check, const uint32_t start, uint32_t& end)
{
  switch(check)
  { // no breaks, because end of blocks is never reached
  case RFC1738:
  case RFC2396E:
  {
    nsString temp(aInString, aInStringLength);

    int32_t i = temp.FindCharInSet(MOZ_UTF16("<>\""), pos + 1);
    if (i != kNotFound && temp[uint32_t(i--)] ==
        (check == RFC1738 || temp[start - 1] == '<' ? '>' : '"'))
    {
      end = uint32_t(i);
      return end > pos;
    }
    return false;
  }
  case freetext:
  case abbreviated:
  {
    uint32_t i = pos + 1;
    bool isEmail = aInString[pos] == (char16_t)'@';
    bool seenOpeningParenthesis = false; // there is a '(' earlier in the URL
    bool seenOpeningSquareBracket = false; // there is a '[' earlier in the URL
    for (; int32_t(i) < aInStringLength; i++)
    {
      // These chars mark the end of the URL
      if (aInString[i] == '>' || aInString[i] == '<' ||
          aInString[i] == '"' || aInString[i] == '`' ||
          aInString[i] == '}' || aInString[i] == '{' ||
          aInString[i] == '|' ||
          (aInString[i] == ')' && !seenOpeningParenthesis) ||
          (aInString[i] == ']' && !seenOpeningSquareBracket) ||
          // Allow IPv6 adresses like http://[1080::8:800:200C:417A]/foo.
          (aInString[i] == '[' && i > 2 &&
           (aInString[i - 1] != '/' || aInString[i - 2] != '/')) ||
          IsSpace(aInString[i]))
          break;
      // Disallow non-ascii-characters for email.
      // Currently correct, but revisit later after standards changed.
      if (isEmail && (
            aInString[i] == '(' || aInString[i] == '\'' ||
            !nsCRT::IsAscii(aInString[i])))
          break;
      if (aInString[i] == '(')
        seenOpeningParenthesis = true;
      if (aInString[i] == '[')
        seenOpeningSquareBracket = true;
    }
    // These chars are allowed in the middle of the URL, but not at end.
    // Technically they are, but are used in normal text after the URL.
    while (--i > pos && (
             aInString[i] == '.' || aInString[i] == ',' || aInString[i] == ';' ||
             aInString[i] == '!' || aInString[i] == '?' || aInString[i] == '-' ||
             aInString[i] == ':' || aInString[i] == '\''
             ))
        ;
    if (i > pos)
    {
      end = i;
      return true;
    }
    return false;
  }
  default:
    return false;
  } //switch
}

void
mozTXTToHTMLConv::CalculateURLBoundaries(const char16_t * aInString, int32_t aInStringLength,
     const uint32_t pos, const uint32_t whathasbeendone,
     const modetype check, const uint32_t start, const uint32_t end,
     nsString& txtURL, nsString& desc,
     int32_t& replaceBefore, int32_t& replaceAfter)
{
  uint32_t descstart = start;
  switch(check)
  {
  case RFC1738:
  {
    descstart = start - 5;
    desc.Append(&aInString[descstart], end - descstart + 2);  // include "<URL:" and ">"
    replaceAfter = end - pos + 1;
  } break;
  case RFC2396E:
  {
    descstart = start - 1;
    desc.Append(&aInString[descstart], end - descstart + 2); // include brackets
    replaceAfter = end - pos + 1;
  } break;
  case freetext:
  case abbreviated:
  {
    descstart = start;
    desc.Append(&aInString[descstart], end - start + 1); // don't include brackets
    replaceAfter = end - pos;
  } break;
  default: break;
  } //switch

  EscapeStr(desc, false);

  txtURL.Append(&aInString[start], end - start + 1);
  txtURL.StripWhitespace();

  // FIX ME
  nsAutoString temp2;
  ScanTXT(&aInString[descstart], pos - descstart, ~kURLs /*prevents loop*/ & whathasbeendone, temp2);
  replaceBefore = temp2.Length();
  return;
}

bool mozTXTToHTMLConv::ShouldLinkify(const nsCString& aURL)
{
  if (!mIOService)
    return false;

  nsAutoCString scheme;
  nsresult rv = mIOService->ExtractScheme(aURL, scheme);
  if(NS_FAILED(rv))
    return false;

  // Get the handler for this scheme.
  nsCOMPtr<nsIProtocolHandler> handler;
  rv = mIOService->GetProtocolHandler(scheme.get(), getter_AddRefs(handler));
  if(NS_FAILED(rv))
    return false;

  // Is it an external protocol handler? If not, linkify it.
  nsCOMPtr<nsIExternalProtocolHandler> externalHandler = do_QueryInterface(handler);
  if (!externalHandler)
   return true; // handler is built-in, linkify it!

  // If external app exists for the scheme then linkify it.
  bool exists;
  rv = externalHandler->ExternalAppExistsForScheme(scheme, &exists);
  return(NS_SUCCEEDED(rv) && exists);
}

bool
mozTXTToHTMLConv::CheckURLAndCreateHTML(
     const nsString& txtURL, const nsString& desc, const modetype mode,
     nsString& outputHTML)
{
  // Create *uri from txtURL
  nsCOMPtr<nsIURI> uri;
  nsresult rv;
  // Lazily initialize mIOService
  if (!mIOService)
  {
    mIOService = do_GetIOService();

    if (!mIOService)
      return false;
  }

  // See if the url should be linkified.
  NS_ConvertUTF16toUTF8 utf8URL(txtURL);
  if (!ShouldLinkify(utf8URL))
    return false;

  // it would be faster if we could just check to see if there is a protocol
  // handler for the url and return instead of actually trying to create a url...
  rv = mIOService->NewURI(utf8URL, nullptr, nullptr, getter_AddRefs(uri));

  // Real work
  if (NS_SUCCEEDED(rv) && uri)
  {
    outputHTML.AssignLiteral("<a class=\"moz-txt-link-");
    switch(mode)
    {
    case RFC1738:
      outputHTML.AppendLiteral("rfc1738");
      break;
    case RFC2396E:
      outputHTML.AppendLiteral("rfc2396E");
      break;
    case freetext:
      outputHTML.AppendLiteral("freetext");
      break;
    case abbreviated:
      outputHTML.AppendLiteral("abbreviated");
      break;
    default: break;
    }
    nsAutoString escapedURL(txtURL);
    EscapeStr(escapedURL, true);

    outputHTML.AppendLiteral("\" href=\"");
    outputHTML += escapedURL;
    outputHTML.AppendLiteral("\">");
    outputHTML += desc;
    outputHTML.AppendLiteral("</a>");
    return true;
  }
  else
    return false;
}

NS_IMETHODIMP mozTXTToHTMLConv::FindURLInPlaintext(const char16_t * aInString, int32_t aInLength, int32_t aPos, int32_t * aStartPos, int32_t * aEndPos)
{
  // call FindURL on the passed in string
  nsAutoString outputHTML; // we'll ignore the generated output HTML

  *aStartPos = -1;
  *aEndPos = -1;

  FindURL(aInString, aInLength, aPos, kURLs, outputHTML, *aStartPos, *aEndPos);

  return NS_OK;
}

bool
mozTXTToHTMLConv::FindURL(const char16_t * aInString, int32_t aInLength, const uint32_t pos,
     const uint32_t whathasbeendone,
     nsString& outputHTML, int32_t& replaceBefore, int32_t& replaceAfter)
{
  enum statetype {unchecked, invalid, startok, endok, success};
  static const modetype ranking[] = {RFC1738, RFC2396E, freetext, abbreviated};

  statetype state[mozTXTToHTMLConv_lastMode + 1]; // 0(=unknown)..lastMode
  /* I don't like this abuse of enums as index for the array,
     but I don't know a better method */

  // Define, which modes to check
  /* all modes but abbreviated are checked for text[pos] == ':',
     only abbreviated for '.', RFC2396E and abbreviated for '@' */
  for (modetype iState = unknown; iState <= mozTXTToHTMLConv_lastMode;
       iState = modetype(iState + 1))
    state[iState] = aInString[pos] == ':' ? unchecked : invalid;
  switch (aInString[pos])
  {
  case '@':
    state[RFC2396E] = unchecked;
    // no break here
  case '.':
    state[abbreviated] = unchecked;
    break;
  case ':':
    state[abbreviated] = invalid;
    break;
  default:
    break;
  }

  // Test, first successful mode wins, sequence defined by |ranking|
  int32_t iCheck = 0;  // the currently tested modetype
  modetype check = ranking[iCheck];
  for (; iCheck < mozTXTToHTMLConv_numberOfModes && state[check] != success;
       iCheck++)
    /* check state from last run.
       If this is the first, check this one, which isn't = success yet */
  {
    check = ranking[iCheck];

    uint32_t start, end;

    if (state[check] == unchecked)
      if (FindURLStart(aInString, aInLength, pos, check, start))
        state[check] = startok;

    if (state[check] == startok)
      if (FindURLEnd(aInString, aInLength, pos, check, start, end))
        state[check] = endok;

    if (state[check] == endok)
    {
      nsAutoString txtURL, desc;
      int32_t resultReplaceBefore, resultReplaceAfter;

      CalculateURLBoundaries(aInString, aInLength, pos, whathasbeendone, check, start, end,
                             txtURL, desc,
                             resultReplaceBefore, resultReplaceAfter);

      if (aInString[pos] != ':')
      {
        nsAutoString temp = txtURL;
        txtURL.SetLength(0);
        CompleteAbbreviatedURL(temp.get(),temp.Length(), pos - start, txtURL);
      }

      if (!txtURL.IsEmpty() && CheckURLAndCreateHTML(txtURL, desc, check,
                                                     outputHTML))
      {
        replaceBefore = resultReplaceBefore;
        replaceAfter = resultReplaceAfter;
        state[check] = success;
      }
    } // if
  } // for
  return state[check] == success;
}

bool
mozTXTToHTMLConv::ItMatchesDelimited(const char16_t * aInString,
    int32_t aInLength, const char16_t* rep, int32_t aRepLen,
    LIMTYPE before, LIMTYPE after)
{

  // this little method gets called a LOT. I found we were spending a
  // lot of time just calculating the length of the variable "rep"
  // over and over again every time we called it. So we're now passing
  // an integer in here.
  int32_t textLen = aInLength;

  if
    (
      ((before == LT_IGNORE && (after == LT_IGNORE || after == LT_DELIMITER))
        && textLen < aRepLen) ||
      ((before != LT_IGNORE || (after != LT_IGNORE && after != LT_DELIMITER))
        && textLen < aRepLen + 1) ||
      (before != LT_IGNORE && after != LT_IGNORE && after != LT_DELIMITER
        && textLen < aRepLen + 2)
    )
    return false;

  char16_t text0 = aInString[0];
  char16_t textAfterPos = aInString[aRepLen + (before == LT_IGNORE ? 0 : 1)];

  if
    (
      (before == LT_ALPHA
        && !nsCRT::IsAsciiAlpha(text0)) ||
      (before == LT_DIGIT
        && !nsCRT::IsAsciiDigit(text0)) ||
      (before == LT_DELIMITER
        &&
        (
          nsCRT::IsAsciiAlpha(text0) ||
          nsCRT::IsAsciiDigit(text0) ||
          text0 == *rep
        )) ||
      (after == LT_ALPHA
        && !nsCRT::IsAsciiAlpha(textAfterPos)) ||
      (after == LT_DIGIT
        && !nsCRT::IsAsciiDigit(textAfterPos)) ||
      (after == LT_DELIMITER
        &&
        (
          nsCRT::IsAsciiAlpha(textAfterPos) ||
          nsCRT::IsAsciiDigit(textAfterPos) ||
          textAfterPos == *rep
        )) ||
        !Substring(Substring(aInString, aInString+aInLength),
                   (before == LT_IGNORE ? 0 : 1),
                   aRepLen).Equals(Substring(rep, rep+aRepLen),
                                   nsCaseInsensitiveStringComparator())
    )
    return false;

  return true;
}

uint32_t
mozTXTToHTMLConv::NumberOfMatches(const char16_t * aInString, int32_t aInStringLength,
     const char16_t* rep, int32_t aRepLen, LIMTYPE before, LIMTYPE after)
{
  uint32_t result = 0;

  for (int32_t i = 0; i < aInStringLength; i++)
  {
    const char16_t * indexIntoString = &aInString[i];
    if (ItMatchesDelimited(indexIntoString, aInStringLength - i, rep, aRepLen, before, after))
      result++;
  }
  return result;
}


// NOTE: the converted html for the phrase is appended to aOutString
// tagHTML and attributeHTML are plain ASCII (literal strings, in fact)
bool
mozTXTToHTMLConv::StructPhraseHit(const char16_t * aInString, int32_t aInStringLength, bool col0,
     const char16_t* tagTXT, int32_t aTagTXTLen,
     const char* tagHTML, const char* attributeHTML,
     nsString& aOutString, uint32_t& openTags)
{
  /* We're searching for the following pattern:
     LT_DELIMITER - "*" - ALPHA -
     [ some text (maybe more "*"-pairs) - ALPHA ] "*" - LT_DELIMITER.
     <strong> is only inserted, if existence of a pair could be verified
     We use the first opening/closing tag, if we can choose */

  const char16_t * newOffset = aInString;
  int32_t newLength = aInStringLength;
  if (!col0) // skip the first element?
  {
    newOffset = &aInString[1];
    newLength = aInStringLength - 1;
  }

  // opening tag
  if
    (
      ItMatchesDelimited(aInString, aInStringLength, tagTXT, aTagTXTLen,
           (col0 ? LT_IGNORE : LT_DELIMITER), LT_ALPHA) // is opening tag
        && NumberOfMatches(newOffset, newLength, tagTXT, aTagTXTLen,
              LT_ALPHA, LT_DELIMITER)  // remaining closing tags
              > openTags
    )
  {
    openTags++;
    aOutString.Append('<');
    aOutString.AppendASCII(tagHTML);
    aOutString.Append(char16_t(' '));
    aOutString.AppendASCII(attributeHTML);
    aOutString.AppendLiteral("><span class=\"moz-txt-tag\">");
    aOutString.Append(tagTXT);
    aOutString.AppendLiteral("</span>");
    return true;
  }

  // closing tag
  else if (openTags > 0
       && ItMatchesDelimited(aInString, aInStringLength, tagTXT, aTagTXTLen, LT_ALPHA, LT_DELIMITER))
  {
    openTags--;
    aOutString.AppendLiteral("<span class=\"moz-txt-tag\">");
    aOutString.Append(tagTXT);
    aOutString.AppendLiteral("</span></");
    aOutString.AppendASCII(tagHTML);
    aOutString.Append(char16_t('>'));
    return true;
  }

  return false;
}


bool
mozTXTToHTMLConv::SmilyHit(const char16_t * aInString, int32_t aLength, bool col0,
         const char* tagTXT, const char* imageName,
         nsString& outputHTML, int32_t& glyphTextLen)
{
  if ( !aInString || !tagTXT || !imageName )
      return false;

  int32_t tagLen = strlen(tagTXT);

  uint32_t delim = (col0 ? 0 : 1) + tagLen;

  if
    (
      (col0 || IsSpace(aInString[0]))
        &&
        (
          aLength <= int32_t(delim) ||
          IsSpace(aInString[delim]) ||
          (aLength > int32_t(delim + 1)
            &&
            (
              aInString[delim] == '.' ||
              aInString[delim] == ',' ||
              aInString[delim] == ';' ||
              aInString[delim] == '8' ||
              aInString[delim] == '>' ||
              aInString[delim] == '!' ||
              aInString[delim] == '?'
            )
            && IsSpace(aInString[delim + 1]))
        )
        && ItMatchesDelimited(aInString, aLength, NS_ConvertASCIItoUTF16(tagTXT).get(), tagLen,
                              col0 ? LT_IGNORE : LT_DELIMITER, LT_IGNORE)
	        // Note: tests at different pos for LT_IGNORE and LT_DELIMITER
    )
  {
    if (!col0)
    {
      outputHTML.Truncate();
      outputHTML.Append(char16_t(' '));
    }

    outputHTML.AppendLiteral("<span class=\""); // <span class="
    AppendASCIItoUTF16(imageName, outputHTML);  // e.g. smiley-frown
    outputHTML.AppendLiteral("\" title=\"");    // " title="
    AppendASCIItoUTF16(tagTXT, outputHTML);     // smiley tooltip
    outputHTML.AppendLiteral("\"><span>");      // "><span>
    AppendASCIItoUTF16(tagTXT, outputHTML);     // original text
    outputHTML.AppendLiteral("</span></span>"); // </span></span>
    glyphTextLen = (col0 ? 0 : 1) + tagLen;
    return true;
  }

  return false;
}

// the glyph is appended to aOutputString instead of the original string...
bool
mozTXTToHTMLConv::GlyphHit(const char16_t * aInString, int32_t aInLength, bool col0,
         nsString& aOutputString, int32_t& glyphTextLen)
{
  char16_t text0 = aInString[0];
  char16_t text1 = aInString[1];
  char16_t firstChar = (col0 ? text0 : text1);

  // temporary variable used to store the glyph html text
  nsAutoString outputHTML;
  bool bTestSmilie;
  bool bArg = false;
  int i;

  // refactor some of this mess to avoid code duplication and speed execution a bit
  // there are two cases that need to be tried one after another. To avoid a lot of
  // duplicate code, rolling into a loop

  i = 0;
  while ( i < 2 )
  {
    bTestSmilie = false;
    if ( !i && (firstChar == ':' || firstChar == ';' || firstChar == '=' || firstChar == '>' || firstChar == '8' || firstChar == 'O'))
    {
        // first test passed

        bTestSmilie = true;
        bArg = col0;
    }
    if ( i && col0 && ( text1 == ':' || text1 == ';' || text1 == '=' || text1 == '>' || text1 == '8' || text1 == 'O' ) )
    {
        // second test passed

        bTestSmilie = true;
        bArg = false;
    }
    if ( bTestSmilie && (
          SmilyHit(aInString, aInLength, bArg,
                   ":-)",
                   "moz-smiley-s1", // smile
                   outputHTML, glyphTextLen) ||

          SmilyHit(aInString, aInLength, bArg,
                   ":)",
                   "moz-smiley-s1", // smile
                   outputHTML, glyphTextLen) ||

          SmilyHit(aInString, aInLength, bArg,
                   ":-D",
                   "moz-smiley-s5", // laughing
                   outputHTML, glyphTextLen) ||

          SmilyHit(aInString, aInLength, bArg,
                   ":-(",
                   "moz-smiley-s2", // frown
                   outputHTML, glyphTextLen) ||

          SmilyHit(aInString, aInLength, bArg,
                   ":(",
                   "moz-smiley-s2", // frown
                   outputHTML, glyphTextLen) ||

          SmilyHit(aInString, aInLength, bArg,
                   ":-[",
                   "moz-smiley-s6", // embarassed
                   outputHTML, glyphTextLen) ||

          SmilyHit(aInString, aInLength, bArg,
                   ";-)",
                   "moz-smiley-s3", // wink
                   outputHTML, glyphTextLen) ||

          SmilyHit(aInString, aInLength, col0,
                   ";)",
                   "moz-smiley-s3", // wink
                   outputHTML, glyphTextLen) ||

          SmilyHit(aInString, aInLength, bArg,
                   ":-\\",
                   "moz-smiley-s7", // undecided
                   outputHTML, glyphTextLen) ||

          SmilyHit(aInString, aInLength, bArg,
                   ":-P",
                   "moz-smiley-s4", // tongue
                   outputHTML, glyphTextLen) ||

          SmilyHit(aInString, aInLength, bArg,
                   ";-P",
                   "moz-smiley-s4", // tongue
                   outputHTML, glyphTextLen) ||

          SmilyHit(aInString, aInLength, bArg,
                   "=-O",
                   "moz-smiley-s8", // surprise
                   outputHTML, glyphTextLen) ||

          SmilyHit(aInString, aInLength, bArg,
                   ":-*",
                   "moz-smiley-s9", // kiss
                   outputHTML, glyphTextLen) ||

          SmilyHit(aInString, aInLength, bArg,
                   ">:o",
                   "moz-smiley-s10", // yell
                   outputHTML, glyphTextLen) ||

          SmilyHit(aInString, aInLength, bArg,
                   ">:-o",
                   "moz-smiley-s10", // yell
                   outputHTML, glyphTextLen) ||

          SmilyHit(aInString, aInLength, bArg,
                   "8-)",
                   "moz-smiley-s11", // cool
                   outputHTML, glyphTextLen) ||

          SmilyHit(aInString, aInLength, bArg,
                   ":-$",
                   "moz-smiley-s12", // money
                   outputHTML, glyphTextLen) ||

          SmilyHit(aInString, aInLength, bArg,
                   ":-!",
                   "moz-smiley-s13", // foot
                   outputHTML, glyphTextLen) ||

          SmilyHit(aInString, aInLength, bArg,
                   "O:-)",
                   "moz-smiley-s14", // innocent
                   outputHTML, glyphTextLen) ||

          SmilyHit(aInString, aInLength, bArg,
                   ":'(",
                   "moz-smiley-s15", // cry
                   outputHTML, glyphTextLen) ||

          SmilyHit(aInString, aInLength, bArg,
                   ":-X",
                   "moz-smiley-s16", // sealed
                   outputHTML, glyphTextLen)
        )
    )
    {
        aOutputString.Append(outputHTML);
        return true;
    }
    i++;
  }
  if (text0 == '\f')
  {
      aOutputString.AppendLiteral("<span class='moz-txt-formfeed'></span>");
      glyphTextLen = 1;
      return true;
  }
  if (text0 == '+' || text1 == '+')
  {
    if (ItMatchesDelimited(aInString, aInLength,
                           MOZ_UTF16(" +/-"), 4,
                           LT_IGNORE, LT_IGNORE))
    {
      aOutputString.AppendLiteral(" &plusmn;");
      glyphTextLen = 4;
      return true;
    }
    if (col0 && ItMatchesDelimited(aInString, aInLength,
                                   MOZ_UTF16("+/-"), 3,
                                   LT_IGNORE, LT_IGNORE))
    {
      aOutputString.AppendLiteral("&plusmn;");
      glyphTextLen = 3;
      return true;
    }
  }

  // x^2  =>  x<sup>2</sup>,   also handle powers x^-2,  x^0.5
  // implement regular expression /[\dA-Za-z\)\]}]\^-?\d+(\.\d+)*[^\dA-Za-z]/
  if
    (
      text1 == '^'
      &&
      (
        nsCRT::IsAsciiDigit(text0) || nsCRT::IsAsciiAlpha(text0) ||
        text0 == ')' || text0 == ']' || text0 == '}'
      )
      &&
      (
        (2 < aInLength && nsCRT::IsAsciiDigit(aInString[2])) ||
        (3 < aInLength && aInString[2] == '-' && nsCRT::IsAsciiDigit(aInString[3]))
      )
    )
  {
    // Find first non-digit
    int32_t delimPos = 3;  // skip "^" and first digit (or '-')
    for (; delimPos < aInLength
           &&
           (
             nsCRT::IsAsciiDigit(aInString[delimPos]) ||
             (aInString[delimPos] == '.' && delimPos + 1 < aInLength &&
               nsCRT::IsAsciiDigit(aInString[delimPos + 1]))
           );
         delimPos++)
      ;

    if (delimPos < aInLength && nsCRT::IsAsciiAlpha(aInString[delimPos]))
    {
      return false;
    }

    outputHTML.Truncate();
    outputHTML += text0;
    outputHTML.AppendLiteral(
      "<sup class=\"moz-txt-sup\">"
      "<span style=\"display:inline-block;width:0;height:0;overflow:hidden\">"
      "^</span>");

    aOutputString.Append(outputHTML);
    aOutputString.Append(&aInString[2], delimPos - 2);
    aOutputString.AppendLiteral("</sup>");

    glyphTextLen = delimPos /* - 1 + 1 */ ;
    return true;
  }
  /*
   The following strings are not substituted:
   |TXT   |HTML     |Reason
   +------+---------+----------
    ->     &larr;    Bug #454
    =>     &lArr;    dito
    <-     &rarr;    dito
    <=     &rArr;    dito
    (tm)   &trade;   dito
    1/4    &frac14;  is triggered by 1/4 Part 1, 2/4 Part 2, ...
    3/4    &frac34;  dito
    1/2    &frac12;  similar
  */
  return false;
}

/***************************************************************************
  Library-internal Interface
****************************************************************************/

mozTXTToHTMLConv::mozTXTToHTMLConv()
{
}

mozTXTToHTMLConv::~mozTXTToHTMLConv()
{
}

NS_IMPL_ISUPPORTS(mozTXTToHTMLConv,
                  mozITXTToHTMLConv,
                  nsIStreamConverter,
                  nsIStreamListener,
                  nsIRequestObserver)

int32_t
mozTXTToHTMLConv::CiteLevelTXT(const char16_t *line,
				    uint32_t& logLineStart)
{
  int32_t result = 0;
  int32_t lineLength = NS_strlen(line);

  bool moreCites = true;
  while (moreCites)
  {
    /* E.g. the following lines count as quote:

       > text
       //#ifdef QUOTE_RECOGNITION_AGGRESSIVE
       >text
       //#ifdef QUOTE_RECOGNITION_AGGRESSIVE
           > text
       ] text
       USER> text
       USER] text
       //#endif

       logLineStart is the position of "t" in this example
    */
    uint32_t i = logLineStart;

#ifdef QUOTE_RECOGNITION_AGGRESSIVE
    for (; int32_t(i) < lineLength && IsSpace(line[i]); i++)
      ;
    for (; int32_t(i) < lineLength && nsCRT::IsAsciiAlpha(line[i])
                                   && nsCRT::IsUpper(line[i])   ; i++)
      ;
    if (int32_t(i) < lineLength && (line[i] == '>' || line[i] == ']'))
#else
    if (int32_t(i) < lineLength && line[i] == '>')
#endif
    {
      i++;
      if (int32_t(i) < lineLength && line[i] == ' ')
        i++;
      // sendmail/mbox
      // Placed here for performance increase
      const char16_t * indexString = &line[logLineStart];
           // here, |logLineStart < lineLength| is always true
      uint32_t minlength = std::min(uint32_t(6), NS_strlen(indexString));
      if (Substring(indexString,
                    indexString+minlength).Equals(Substring(NS_LITERAL_STRING(">From "), 0, minlength),
                                                  nsCaseInsensitiveStringComparator()))
        //XXX RFC2646
        moreCites = false;
      else
      {
        result++;
        logLineStart = i;
      }
    }
    else
      moreCites = false;
  }

  return result;
}

void
mozTXTToHTMLConv::ScanTXT(const char16_t * aInString, int32_t aInStringLength, uint32_t whattodo, nsString& aOutString)
{
  bool doURLs = 0 != (whattodo & kURLs);
  bool doGlyphSubstitution = 0 != (whattodo & kGlyphSubstitution);
  bool doStructPhrase = 0 != (whattodo & kStructPhrase);

  uint32_t structPhrase_strong = 0;  // Number of currently open tags
  uint32_t structPhrase_underline = 0;
  uint32_t structPhrase_italic = 0;
  uint32_t structPhrase_code = 0;

  nsAutoString outputHTML;  // moved here for performance increase

  for(uint32_t i = 0; int32_t(i) < aInStringLength;)
  {
    if (doGlyphSubstitution)
    {
      int32_t glyphTextLen;
      if (GlyphHit(&aInString[i], aInStringLength - i, i == 0, aOutString, glyphTextLen))
      {
        i += glyphTextLen;
        continue;
      }
    }

    if (doStructPhrase)
    {
      const char16_t * newOffset = aInString;
      int32_t newLength = aInStringLength;
      if (i > 0 ) // skip the first element?
      {
        newOffset = &aInString[i-1];
        newLength = aInStringLength - i + 1;
      }

      switch (aInString[i]) // Performance increase
      {
      case '*':
        if (StructPhraseHit(newOffset, newLength, i == 0,
                            MOZ_UTF16("*"), 1,
                            "b", "class=\"moz-txt-star\"",
                            aOutString, structPhrase_strong))
        {
          i++;
          continue;
        }
        break;
      case '/':
        if (StructPhraseHit(newOffset, newLength, i == 0,
                            MOZ_UTF16("/"), 1,
                            "i", "class=\"moz-txt-slash\"",
                            aOutString, structPhrase_italic))
        {
          i++;
          continue;
        }
        break;
      case '_':
        if (StructPhraseHit(newOffset, newLength, i == 0,
                            MOZ_UTF16("_"), 1,
                            "span" /* <u> is deprecated */,
                            "class=\"moz-txt-underscore\"",
                            aOutString, structPhrase_underline))
        {
          i++;
          continue;
        }
        break;
      case '|':
        if (StructPhraseHit(newOffset, newLength, i == 0,
                            MOZ_UTF16("|"), 1,
                            "code", "class=\"moz-txt-verticalline\"",
                            aOutString, structPhrase_code))
        {
          i++;
          continue;
        }
        break;
      }
    }

    if (doURLs)
    {
      switch (aInString[i])
      {
      case ':':
      case '@':
      case '.':
        if ( (i == 0 || ((i > 0) && aInString[i - 1] != ' ')) && aInString[i +1] != ' ') // Performance increase
        {
          int32_t replaceBefore;
          int32_t replaceAfter;
          if (FindURL(aInString, aInStringLength, i, whattodo,
                      outputHTML, replaceBefore, replaceAfter)
                  && structPhrase_strong + structPhrase_italic +
                       structPhrase_underline + structPhrase_code == 0
                       /* workaround for bug #19445 */ )
          {
            aOutString.Cut(aOutString.Length() - replaceBefore, replaceBefore);
            aOutString += outputHTML;
            i += replaceAfter + 1;
            continue;
          }
        }
        break;
      } //switch
    }

    switch (aInString[i])
    {
    // Special symbols
    case '<':
    case '>':
    case '&':
      EscapeChar(aInString[i], aOutString, false);
      i++;
      break;
    // Normal characters
    default:
      aOutString += aInString[i];
      i++;
      break;
    }
  }
}

void
mozTXTToHTMLConv::ScanHTML(nsString& aInString, uint32_t whattodo, nsString &aOutString)
{
  // some common variables we were recalculating
  // every time inside the for loop...
  int32_t lengthOfInString = aInString.Length();
  const char16_t * uniBuffer = aInString.get();

#ifdef DEBUG_BenB_Perf
  PRTime parsing_start = PR_IntervalNow();
#endif

  // Look for simple entities not included in a tags and scan them.
  /* Skip all tags ("<[...]>") and content in an a tag ("<a[...]</a>")
     or in a tag ("<!--[...]-->").
     Unescape the rest (text between tags) and pass it to ScanTXT. */
  for (int32_t i = 0; i < lengthOfInString;)
  {
    if (aInString[i] == '<')  // html tag
    {
      uint32_t start = uint32_t(i);
      if (nsCRT::ToLower((char)aInString[uint32_t(i) + 1]) == 'a')
           // if a tag, skip until </a>
      {
        i = aInString.Find("</a>", true, i);
        if (i == kNotFound)
          i = lengthOfInString;
        else
          i += 4;
      }
      else if (aInString[uint32_t(i) + 1] == '!' && aInString[uint32_t(i) + 2] == '-' &&
        aInString[uint32_t(i) + 3] == '-')
          //if out-commended code, skip until -->
      {
        i = aInString.Find("-->", false, i);
        if (i == kNotFound)
          i = lengthOfInString;
        else
          i += 3;

      }
      else  // just skip tag (attributes etc.)
      {
        i = aInString.FindChar('>', i);
        if (i == kNotFound)
          i = lengthOfInString;
        else
          i++;
      }
      aOutString.Append(&uniBuffer[start], uint32_t(i) - start);
    }
    else
    {
      uint32_t start = uint32_t(i);
      i = aInString.FindChar('<', i);
      if (i == kNotFound)
        i = lengthOfInString;

      nsString tempString;
      tempString.SetCapacity(uint32_t((uint32_t(i) - start) * growthRate));
      UnescapeStr(uniBuffer, start, uint32_t(i) - start, tempString);
      ScanTXT(tempString.get(), tempString.Length(), whattodo, aOutString);
    }
  }

#ifdef DEBUG_BenB_Perf
  printf("ScanHTML time:    %d ms\n", PR_IntervalToMilliseconds(PR_IntervalNow() - parsing_start));
#endif
}

/****************************************************************************
  XPCOM Interface
*****************************************************************************/

NS_IMETHODIMP
mozTXTToHTMLConv::Convert(nsIInputStream *aFromStream,
                          const char *aFromType,
                          const char *aToType,
                          nsISupports *aCtxt, nsIInputStream **_retval)
{
  return NS_ERROR_NOT_IMPLEMENTED;
}

NS_IMETHODIMP
mozTXTToHTMLConv::AsyncConvertData(const char *aFromType,
                                   const char *aToType,
                                   nsIStreamListener *aListener, nsISupports *aCtxt) {
  return NS_ERROR_NOT_IMPLEMENTED;
}

NS_IMETHODIMP
mozTXTToHTMLConv::OnDataAvailable(nsIRequest* request, nsISupports *ctxt,
                                 nsIInputStream *inStr, uint64_t sourceOffset,
                                 uint32_t count)
{
  return NS_ERROR_NOT_IMPLEMENTED;
}

NS_IMETHODIMP
mozTXTToHTMLConv::OnStartRequest(nsIRequest* request, nsISupports *ctxt)
{
  return NS_ERROR_NOT_IMPLEMENTED;
}

NS_IMETHODIMP
mozTXTToHTMLConv::OnStopRequest(nsIRequest* request, nsISupports *ctxt,
                                nsresult aStatus)
{
  return NS_ERROR_NOT_IMPLEMENTED;
}

NS_IMETHODIMP
mozTXTToHTMLConv::CiteLevelTXT(const char16_t *line, uint32_t *logLineStart,
				uint32_t *_retval)
{
   if (!logLineStart || !_retval || !line)
     return NS_ERROR_NULL_POINTER;
   *_retval = CiteLevelTXT(line, *logLineStart);
   return NS_OK;
}

NS_IMETHODIMP
mozTXTToHTMLConv::ScanTXT(const char16_t *text, uint32_t whattodo,
			   char16_t **_retval)
{
  NS_ENSURE_ARG(text);

  // FIX ME!!!
  nsString outString;
  int32_t inLength = NS_strlen(text);
  // by setting a large capacity up front, we save time
  // when appending characters to the output string because we don't
  // need to reallocate and re-copy the characters already in the out String.
  NS_ASSERTION(inLength, "ScanTXT passed 0 length string");
  if (inLength == 0) {
    *_retval = NS_strdup(text);
    return NS_OK;
  }

  outString.SetCapacity(uint32_t(inLength * growthRate));
  ScanTXT(text, inLength, whattodo, outString);

  *_retval = ToNewUnicode(outString);
  return *_retval ? NS_OK : NS_ERROR_OUT_OF_MEMORY;
}

NS_IMETHODIMP
mozTXTToHTMLConv::ScanHTML(const char16_t *text, uint32_t whattodo,
			    char16_t **_retval)
{
  NS_ENSURE_ARG(text);

  // FIX ME!!!
  nsString outString;
  nsString inString (text); // look at this nasty extra copy of the entire input buffer!
  outString.SetCapacity(uint32_t(inString.Length() * growthRate));

  ScanHTML(inString, whattodo, outString);
  *_retval = ToNewUnicode(outString);
  return *_retval ? NS_OK : NS_ERROR_OUT_OF_MEMORY;
}

nsresult
MOZ_NewTXTToHTMLConv(mozTXTToHTMLConv** aConv)
{
    NS_PRECONDITION(aConv != nullptr, "null ptr");
    if (!aConv)
      return NS_ERROR_NULL_POINTER;

    *aConv = new mozTXTToHTMLConv();
    if (!*aConv)
      return NS_ERROR_OUT_OF_MEMORY;

    NS_ADDREF(*aConv);
    //    return (*aConv)->Init();
    return NS_OK;
}