gecko-dev/xpcom/io/nsEscape.cpp

/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

//  First checked in on 98/12/03 by John R. McMullen, derived from net.h/mkparse.c.

#include "nsEscape.h"
#include "nsMemory.h"
#include "nsCRT.h"
#include "nsReadableUtils.h"

const int netCharType[256] =
/*  Bit 0       xalpha      -- the alphas
**  Bit 1       xpalpha     -- as xalpha but
**                             converts spaces to plus and plus to %2B
**  Bit 3 ...   path        -- as xalphas but doesn't escape '/'
*/
  /* 0 1 2 3 4 5 6 7 8 9 A B C D E F */
  {  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,   /* 0x */
     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,   /* 1x */
     0,0,0,0,0,0,0,0,0,0,7,4,0,7,7,4,   /* 2x   !"#$%&'()*+,-./  */
     7,7,7,7,7,7,7,7,7,7,0,0,0,0,0,0,   /* 3x  0123456789:;<=>?  */
     0,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,   /* 4x  @ABCDEFGHIJKLMNO  */
     /* bits for '@' changed from 7 to 0 so '@' can be escaped   */
     /* in usernames and passwords in publishing.                */
     7,7,7,7,7,7,7,7,7,7,7,0,0,0,0,7,   /* 5X  PQRSTUVWXYZ[\]^_  */
     0,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,   /* 6x  `abcdefghijklmno  */
     7,7,7,7,7,7,7,7,7,7,7,0,0,0,0,0,   /* 7X  pqrstuvwxyz{\}~  DEL */
     0, };

/* decode % escaped hex codes into character values
 */
#define UNHEX(C) \
    ((C >= '0' && C <= '9') ? C - '0' : \
     ((C >= 'A' && C <= 'F') ? C - 'A' + 10 : \
     ((C >= 'a' && C <= 'f') ? C - 'a' + 10 : 0)))


#define IS_OK(C) (netCharType[((unsigned int) (C))] & (flags))
#define HEX_ESCAPE '%'

//----------------------------------------------------------------------------------------
static char* nsEscapeCount(
  const char * str,
  nsEscapeMask flags,
  size_t* out_len)
//----------------------------------------------------------------------------------------
{
  if (!str)
    return 0;

  size_t i, len = 0, charsToEscape = 0;
  static const char hexChars[] = "0123456789ABCDEF";

  const unsigned char* src = (const unsigned char *) str;
  while (*src)
  {
    len++;
    if (!IS_OK(*src++))
      charsToEscape++;
  }

  // calculate how much memory should be allocated
  // original length + 2 bytes for each escaped character + terminating '\0'
  // do the sum in steps to check for overflow
  size_t dstSize = len + 1 + charsToEscape;
  if (dstSize <= len)
    return 0;
  dstSize += charsToEscape;
  if (dstSize < len)
    return 0;

  // fail if we need more than 4GB
  // size_t is likely to be long unsigned int but nsMemory::Alloc(size_t)
  // calls NS_Alloc_P(size_t) which calls PR_Malloc(uint32_t), so there is
  // no chance to allocate more than 4GB using nsMemory::Alloc()
  if (dstSize > UINT32_MAX)
    return 0;

  char* result = (char *)nsMemory::Alloc(dstSize);
  if (!result)
    return 0;

  unsigned char* dst = (unsigned char *) result;
  src = (const unsigned char *) str;
  if (flags == url_XPAlphas)
  {
    for (i = 0; i < len; i++)
    {
      unsigned char c = *src++;
      if (IS_OK(c))
        *dst++ = c;
      else if (c == ' ')
        *dst++ = '+'; /* convert spaces to pluses */
      else
      {
        *dst++ = HEX_ESCAPE;
        *dst++ = hexChars[c >> 4];  /* high nibble */
        *dst++ = hexChars[c & 0x0f];  /* low nibble */
      }
    }
  }
  else
  {
    for (i = 0; i < len; i++)
    {
      unsigned char c = *src++;
      if (IS_OK(c))
        *dst++ = c;
      else
      {
        *dst++ = HEX_ESCAPE;
        *dst++ = hexChars[c >> 4];  /* high nibble */
        *dst++ = hexChars[c & 0x0f];  /* low nibble */
      }
    }
  }

  *dst = '\0';     /* tack on eos */
  if(out_len)
    *out_len = dst - (unsigned char *) result;
  return result;
}

//----------------------------------------------------------------------------------------
char* nsEscape(const char * str, nsEscapeMask flags)
//----------------------------------------------------------------------------------------
{
  if(!str)
    return nullptr;
  return nsEscapeCount(str, flags, nullptr);
}

//----------------------------------------------------------------------------------------
char* nsUnescape(char * str)
//----------------------------------------------------------------------------------------
{
  nsUnescapeCount(str);
  return str;
}

//----------------------------------------------------------------------------------------
int32_t nsUnescapeCount(char * str)
//----------------------------------------------------------------------------------------
{
  char *src = str;
  char *dst = str;
  static const char hexChars[] = "0123456789ABCDEFabcdef";

  char c1[] = " ";
  char c2[] = " ";
  char* const pc1 = c1;
  char* const pc2 = c2;

  if (!*src) {
    // A null string was passed in.  Nothing to escape.
    // Returns early as the string might not actually be mutable with
    // length 0.
    return 0;
  }

  while (*src)
  {
    c1[0] = *(src+1);
    if (*(src+1) == '\0')
      c2[0] = '\0';
    else
      c2[0] = *(src+2);

    if (*src != HEX_ESCAPE || PL_strpbrk(pc1, hexChars) == 0 ||
        PL_strpbrk(pc2, hexChars) == 0 )
      *dst++ = *src++;
    else
    {
      src++; /* walk over escape */
      if (*src)
      {
        *dst = UNHEX(*src) << 4;
        src++;
      }
      if (*src)
      {
        *dst = (*dst + UNHEX(*src));
        src++;
      }
      dst++;
    }
  }

  *dst = 0;
  return (int)(dst - str);

} /* NET_UnEscapeCnt */


char *
nsEscapeHTML(const char * string)
{
  char *rv = nullptr;
  /* XXX Hardcoded max entity len. The +1 is for the trailing null. */
  uint32_t len = strlen(string);
  if (len >= (UINT32_MAX / 6))
    return nullptr;

  rv = (char *)NS_Alloc( (6 * len) + 1 );
  char *ptr = rv;

  if(rv)
  {
    for(; *string != '\0'; string++)
    {
      if(*string == '<')
      {
        *ptr++ = '&';
        *ptr++ = 'l';
        *ptr++ = 't';
        *ptr++ = ';';
      }
      else if(*string == '>')
      {
        *ptr++ = '&';
        *ptr++ = 'g';
        *ptr++ = 't';
        *ptr++ = ';';
      }
      else if(*string == '&')
      {
        *ptr++ = '&';
        *ptr++ = 'a';
        *ptr++ = 'm';
        *ptr++ = 'p';
        *ptr++ = ';';
      }
      else if (*string == '"')
      {
        *ptr++ = '&';
        *ptr++ = 'q';
        *ptr++ = 'u';
        *ptr++ = 'o';
        *ptr++ = 't';
        *ptr++ = ';';
      }
      else if (*string == '\'')
      {
        *ptr++ = '&';
        *ptr++ = '#';
        *ptr++ = '3';
        *ptr++ = '9';
        *ptr++ = ';';
      }
      else
      {
        *ptr++ = *string;
      }
    }
    *ptr = '\0';
  }

  return(rv);
}

char16_t *
nsEscapeHTML2(const char16_t *aSourceBuffer, int32_t aSourceBufferLen)
{
  // Calculate the length, if the caller didn't.
  if (aSourceBufferLen < 0) {
    aSourceBufferLen = NS_strlen(aSourceBuffer);
  }

  /* XXX Hardcoded max entity len. */
  if (uint32_t(aSourceBufferLen) >=
      ((UINT32_MAX - sizeof(char16_t)) / (6 * sizeof(char16_t))) )
    return nullptr;

  char16_t *resultBuffer = (char16_t *)nsMemory::Alloc(aSourceBufferLen *
    6 * sizeof(char16_t) + sizeof(char16_t('\0')));
  char16_t *ptr = resultBuffer;

  if (resultBuffer) {
    int32_t i;

    for(i = 0; i < aSourceBufferLen; i++) {
      if(aSourceBuffer[i] == '<') {
        *ptr++ = '&';
        *ptr++ = 'l';
        *ptr++ = 't';
        *ptr++ = ';';
      } else if(aSourceBuffer[i] == '>') {
        *ptr++ = '&';
        *ptr++ = 'g';
        *ptr++ = 't';
        *ptr++ = ';';
      } else if(aSourceBuffer[i] == '&') {
        *ptr++ = '&';
        *ptr++ = 'a';
        *ptr++ = 'm';
        *ptr++ = 'p';
        *ptr++ = ';';
      } else if (aSourceBuffer[i] == '"') {
        *ptr++ = '&';
        *ptr++ = 'q';
        *ptr++ = 'u';
        *ptr++ = 'o';
        *ptr++ = 't';
        *ptr++ = ';';
      } else if (aSourceBuffer[i] == '\'') {
        *ptr++ = '&';
        *ptr++ = '#';
        *ptr++ = '3';
        *ptr++ = '9';
        *ptr++ = ';';
      } else {
        *ptr++ = aSourceBuffer[i];
      }
    }
    *ptr = 0;
  }

  return resultBuffer;
}

//----------------------------------------------------------------------------------------

const int EscapeChars[256] =
/*   0    1    2    3    4    5    6    7    8    9    A    B    C    D    E    F */
{
     0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  /* 0x */
     0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  /* 1x */
     0,1023,   0, 512,1023,   0,1023,   0,1023,1023,1023,1023,1023,1023, 953, 784,  /* 2x   !"#$%&'()*+,-./  */
  1023,1023,1023,1023,1023,1023,1023,1023,1023,1023,1008,1008,   0,1008,   0, 768,  /* 3x  0123456789:;<=>?  */
  1008,1023,1023,1023,1023,1023,1023,1023,1023,1023,1023,1023,1023,1023,1023,1023,  /* 4x  @ABCDEFGHIJKLMNO  */
  1023,1023,1023,1023,1023,1023,1023,1023,1023,1023,1023, 896, 896, 896, 896,1023,  /* 5x  PQRSTUVWXYZ[\]^_  */
     0,1023,1023,1023,1023,1023,1023,1023,1023,1023,1023,1023,1023,1023,1023,1023,  /* 6x  `abcdefghijklmno  */
  1023,1023,1023,1023,1023,1023,1023,1023,1023,1023,1023, 896,1012, 896,1023,   0,  /* 7x  pqrstuvwxyz{|}~   */
     0    /* 8x  DEL               */
};

#define NO_NEED_ESC(C) (EscapeChars[((unsigned int) (C))] & (flags))

//----------------------------------------------------------------------------------------

/* returns an escaped string */

/* use the following flags to specify which
   part of an URL you want to escape:

   esc_Scheme        =     1
   esc_Username      =     2
   esc_Password      =     4
   esc_Host          =     8
   esc_Directory     =    16
   esc_FileBaseName  =    32
   esc_FileExtension =    64
   esc_Param         =   128
   esc_Query         =   256
   esc_Ref           =   512
*/

/* by default this function will not escape parts of a string
   that already look escaped, which means it already includes
   a valid hexcode. This is done to avoid multiple escapes of
   a string. Use the following flags to force escaping of a
   string:

   esc_Forced        =  1024
*/

bool NS_EscapeURL(const char *part,
                  int32_t partLen,
                  uint32_t flags,
                  nsACString &result)
{
  if (!part) {
    NS_NOTREACHED("null pointer");
    return false;
  }

  int i = 0;
  static const char hexChars[] = "0123456789ABCDEF";
  if (partLen < 0)
    partLen = strlen(part);
  bool forced = !!(flags & esc_Forced);
  bool ignoreNonAscii = !!(flags & esc_OnlyASCII);
  bool ignoreAscii = !!(flags & esc_OnlyNonASCII);
  bool writing = !!(flags & esc_AlwaysCopy);
  bool colon = !!(flags & esc_Colon);

  const unsigned char* src = (const unsigned char *) part;

  char tempBuffer[100];
  unsigned int tempBufferPos = 0;

  bool previousIsNonASCII = false;
  for (i = 0; i < partLen; i++)
  {
    unsigned char c = *src++;

    // if the char has not to be escaped or whatever follows % is
    // a valid escaped string, just copy the char.
    //
    // Also the % will not be escaped until forced
    // See bugzilla bug 61269 for details why we changed this
    //
    // And, we will not escape non-ascii characters if requested.
    // On special request we will also escape the colon even when
    // not covered by the matrix.
    // ignoreAscii is not honored for control characters (C0 and DEL)
    //
    // And, we should escape the '|' character when it occurs after any
    // non-ASCII character as it may be part of a multi-byte character.
    //
    // 0x20..0x7e are the valid ASCII characters. We also escape spaces
    // (0x20) since they are not legal in URLs.
    if ((NO_NEED_ESC(c) || (c == HEX_ESCAPE && !forced)
         || (c > 0x7f && ignoreNonAscii)
         || (c > 0x20 && c < 0x7f && ignoreAscii))
        && !(c == ':' && colon)
        && !(previousIsNonASCII && c == '|' && !ignoreNonAscii))
    {
      if (writing)
        tempBuffer[tempBufferPos++] = c;
    }
    else /* do the escape magic */
    {
      if (!writing)
      {
        result.Append(part, i);
        writing = true;
      }
      tempBuffer[tempBufferPos++] = HEX_ESCAPE;
      tempBuffer[tempBufferPos++] = hexChars[c >> 4]; /* high nibble */
      tempBuffer[tempBufferPos++] = hexChars[c & 0x0f]; /* low nibble */
    }

    if (tempBufferPos >= sizeof(tempBuffer) - 4)
    {
      NS_ASSERTION(writing, "should be writing");
      tempBuffer[tempBufferPos] = '\0';
      result += tempBuffer;
      tempBufferPos = 0;
    }

    previousIsNonASCII = (c > 0x7f);
  }
  if (writing) {
    tempBuffer[tempBufferPos] = '\0';
    result += tempBuffer;
  }
  return writing;
}

#define ISHEX(c) memchr(hexChars, c, sizeof(hexChars)-1)

bool NS_UnescapeURL(const char *str, int32_t len, uint32_t flags, nsACString &result)
{
  if (!str) {
    NS_NOTREACHED("null pointer");
    return false;
  }

  if (len < 0)
    len = strlen(str);

  bool ignoreNonAscii = !!(flags & esc_OnlyASCII);
  bool ignoreAscii = !!(flags & esc_OnlyNonASCII);
  bool writing = !!(flags & esc_AlwaysCopy);
  bool skipControl = !!(flags & esc_SkipControl);

  static const char hexChars[] = "0123456789ABCDEFabcdef";

  const char *last = str;
  const char *p = str;

  for (int i=0; i<len; ++i, ++p) {
    //printf("%c [i=%d of len=%d]\n", *p, i, len);
    if (*p == HEX_ESCAPE && i < len-2) {
      unsigned char *p1 = ((unsigned char *) p) + 1;
      unsigned char *p2 = ((unsigned char *) p) + 2;
      if (ISHEX(*p1) && ISHEX(*p2) &&
          ((*p1 < '8' && !ignoreAscii) || (*p1 >= '8' && !ignoreNonAscii)) &&
          !(skipControl &&
            (*p1 < '2' || (*p1 == '7' && (*p2 == 'f' || *p2 == 'F'))))) {
        //printf("- p1=%c p2=%c\n", *p1, *p2);
        writing = true;
        if (p > last) {
          //printf("- p=%p, last=%p\n", p, last);
          result.Append(last, p - last);
          last = p;
        }
        char u = (UNHEX(*p1) << 4) + UNHEX(*p2);
        //printf("- u=%c\n", u);
        result.Append(u);
        i += 2;
        p += 2;
        last += 3;
      }
    }
  }
  if (writing && last < str + len)
    result.Append(last, str + len - last);

  return writing;
}