pjs/intl/unicharutil/util/nsUnicharUtils.cpp

/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/* ***** BEGIN LICENSE BLOCK *****
 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
 *
 * The contents of this file are subject to the Mozilla Public License Version
 * 1.1 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the
 * License.
 *
 * The Original Code is Unicode case conversion helpers.
 *
 * The Initial Developer of the Original Code is
 * Netscape Communications Corp..
 * Portions created by the Initial Developer are Copyright (C) 2002
 * the Initial Developer. All Rights Reserved.
 *
 * Contributor(s):
 *   Alec Flett <alecf@netscape.com>
 *   Benjamin Smedberg <benjamin@smedbergs.us>
 *   Ben Turner <mozilla@songbirdnest.com>
 *
 * Alternatively, the contents of this file may be used under the terms of
 * either the GNU General Public License Version 2 or later (the "GPL"), or
 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
 * in which case the provisions of the GPL or the LGPL are applicable instead
 * of those above. If you wish to allow use of your version of this file only
 * under the terms of either the GPL or the LGPL, and not to allow others to
 * use your version of this file under the terms of the MPL, indicate your
 * decision by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL or the LGPL. If you do not delete
 * the provisions above, a recipient may use your version of this file under
 * the terms of any one of the MPL, the GPL or the LGPL.
 *
 * ***** END LICENSE BLOCK ***** */

#include "nsUnicharUtils.h"
#include "nsUnicharUtilCIID.h"

#include "nsCRT.h"
#include "nsICaseConversion.h"
#include "nsServiceManagerUtils.h"
#include "nsXPCOMStrings.h"
#include "casetable.h"
#include "nsUTF8Utils.h"

#include <ctype.h>

// Column indices into gUpperToTitle.  The table is a flat array of
// (uppercase, titlecase) pairs, so pair i is read as
// gUpperToTitle[(i*2)+kUpperIdx] and gUpperToTitle[(i*2)+kTitleIdx].
enum {
  kUpperIdx =0,
  kTitleIdx
};

// Column indices into the compressed case tables that nsCompressedMap reads
// through mTable (gToUpper and gToLower).  Every table entry is three units
// wide.
enum {
  // First character covered by the entry.
  kLowIdx =0,
  // High byte: offset from the first to the last covered character.
  // Low byte: n, where only every nth character from the first is mapped.
  kSizeEveryIdx,
  // Value added to a mapped character to obtain the converted character.
  kDiffIdx
};

// ASCII classification helpers.  Several of these expand their argument more
// than once, so never pass an expression with side effects.
#define IS_ASCII(u)       ((u) < 0x80)
#define IS_ASCII_UPPER(u) (('A' <= (u)) && ( (u) <= 'Z' ))
#define IS_ASCII_LOWER(u) (('a' <= (u)) && ( (u) <= 'z'))
#define IS_ASCII_ALPHA(u) (IS_ASCII_UPPER(u) || IS_ASCII_LOWER(u))
#define IS_ASCII_SPACE(u) ( ' ' == (u) )

// Tests one bit of the gCaseBlocks bitmap: (u)>>13 selects the word covering
// a run of 0x2000 code units, and bits 8..12 of u select the bit for the
// 256-code-unit block inside that run.  A clear bit makes the macro true, and
// callers then return the character unchanged without consulting the case
// maps.
#define IS_NOCASE_CHAR(u)  (0==(1&(gCaseBlocks[(u)>>13]>>(0x001F&((u)>>8)))))

// Size of the per-map result cache (nsCompressedMap::mCache) and the mask used
// to turn a character into a cache slot (aChar & CASE_MAP_CACHE_MASK).  The
// mask must stay equal to CASE_MAP_CACHE_SIZE - 1.

// Changing these numbers may break UTF-8 caching.  Be careful!
#define CASE_MAP_CACHE_SIZE 0x100
#define CASE_MAP_CACHE_MASK 0xFF

// Maps characters through one of the compressed case tables.
//
// mTable holds mSize entries of three PRUnichar units each:
//   kLowIdx       - first character covered by the entry
//   kSizeEveryIdx - high byte: offset from the first to the last covered
//                   character; low byte: n, where only every nth character
//                   counted from the first one is mapped (0 disables the
//                   "every nth" test, so the whole range is mapped)
//   kDiffIdx      - value added to a mapped character to get its result
//
// mCache is a direct-mapped cache indexed by the low bits of the character.
// Each slot packs the source character into the high 16 bits and the mapped
// result into the low 16 bits.  mLastBase remembers the table offset of the
// entry that satisfied the most recent successful binary search.
//
// This struct is aggregate-initialized with only { table, size }, so the
// member order must not change and no constructor may be added; mCache and
// mLastBase rely on being zero-initialized.
struct nsCompressedMap {
  const PRUnichar *mTable;
  PRUint32 mSize;
  PRUint32 mCache[CASE_MAP_CACHE_SIZE];
  PRUint32 mLastBase;

  // Returns the mapping of aChar according to mTable, or aChar itself when
  // the table has no mapping for it.
  PRUnichar Map(PRUnichar aChar)
  {
    // We don't need explicit locking here since the cached values are int32s,
    // which are read and written atomically.  The following code is threadsafe
    // because we never access bits from mCache directly -- we always first
    // read the entire entry into a local variable and then mask off the bits
    // we're interested in.

    // Check the 256-entry cache first and bail with our answer if we can.
    PRUint32 cachedData = mCache[aChar & CASE_MAP_CACHE_MASK];
    if (aChar == ((cachedData >> 16) & 0x0000FFFF))
      return cachedData & 0x0000FFFF;

    // Now try the last index we looked up, storing it into a local variable
    // for thread-safety.
    PRUint32 base = mLastBase;
    PRUnichar res = 0;

    // Does this character fit in the slot?
    if ((aChar <= ((mTable[base+kSizeEveryIdx] >> 8) +
                   mTable[base+kLowIdx])) &&
        (mTable[base+kLowIdx] <= aChar)) {

      // This character uses the same base as our last lookup, so the
      // conversion is easy.
      if (((mTable[base+kSizeEveryIdx] & 0x00FF) > 0) &&
          (0 != ((aChar - mTable[base+kLowIdx]) %
                 (mTable[base+kSizeEveryIdx] & 0x00FF))))
      {
        // aChar sits in a gap between mapped characters.
        res = aChar;
      } else {
        res = aChar + mTable[base+kDiffIdx];
      }

    } else {
      // Do the full lookup.
      res = this->Lookup(0, mSize/2, mSize-1, aChar);
    }

    // Cache the result and return.  Widen aChar before shifting: it would
    // otherwise be promoted to a signed int, and shifting a value >= 0x8000
    // left by 16 bits overflows that int.
    mCache[aChar & CASE_MAP_CACHE_MASK] =
        ((static_cast<PRUint32>(aChar) << 16) & 0xFFFF0000) |
        (0x0000FFFF & res);
    return res;
  }

  // Takes as arguments the left bound, middle, right bound, and character to
  // search for.  Executes a binary search over the table entries (the bounds
  // are entry indices, not offsets into mTable).  Returns the mapped
  // character, or aChar unchanged when no entry maps it.
  PRUnichar Lookup(PRUint32 l,
                   PRUint32 m,
                   PRUint32 r,
                   PRUnichar aChar)
  {
    PRUint32 base = m*3; // Every line in the table is 3 units wide.

    // Is aChar past the top of the current table entry?  (The upper byte of
    // the 'every' entry contains the offset to the end of this entry.)
    if (aChar > ((mTable[base+kSizeEveryIdx] >> 8) +
                  mTable[base+kLowIdx]))
    {
      if (l > m || l == r)
        return aChar;
      // Advance one round.
      PRUint32 newm = (m+r+1)/2;
      if (newm == m)
        newm++;
      return this->Lookup(m+1, newm, r, aChar);

    // Is aChar below the bottom of the current table entry?
    } else if (mTable[base+kLowIdx] > aChar) {
      // m == 0 means aChar lies below the very first entry, so there is
      // nothing left to search.  Without this guard the unsigned expressions
      // (l+m-1) and (m-1) below would wrap around and the next round would
      // read far outside mTable.
      if (r < m || l == r || m == 0)
        return aChar;
      // Advance one round
      PRUint32 newm = (l+m-1)/2;
      if(newm == m)
        newm++;
      return this->Lookup(l, newm, m-1, aChar);

    // We've found the entry aChar should live in.
    } else {
      // Determine if aChar falls in a gap.  (The lower byte of the 'every'
      // entry contains n for which every nth character from the base is a
      // character of interest.)
      if (((mTable[base+kSizeEveryIdx] & 0x00FF) > 0) &&
          (0 != ((aChar - mTable[base+kLowIdx]) %
                 (mTable[base+kSizeEveryIdx] & 0x00FF))))
      {
        return aChar;
      }
      // If aChar doesn't fall in the gap, cache and convert.
      mLastBase = base;
      return aChar + mTable[base+kDiffIdx];
    }
  }
};

// Map instance backed by the gToUpper table.  Only mTable and mSize are
// given initializers; the remaining members (mCache, mLastBase) are left to
// zero-initialization.
static nsCompressedMap gUpperMap = {
  reinterpret_cast<const PRUnichar*>(&gToUpper[0]),
  gToUpperItems
};

// Map instance backed by the gToLower table; initialized the same way as
// gUpperMap.
static nsCompressedMap gLowerMap = {
  reinterpret_cast<const PRUnichar*>(&gToLower[0]),
  gToLowerItems
};

// We want ToLowerCase(PRUnichar) and ToLowerCaseASCII(PRUnichar) to be fast
// when they're called from within the case-insensitive comparators, so we
// define inlined versions.
static NS_ALWAYS_INLINE PRUnichar
ToLowerCase_inline(PRUnichar aChar)
{
  // ASCII is answered straight from the small lookup table.
  if (IS_ASCII(aChar))
    return gASCIIToLower[aChar];

  // Characters in a block flagged as caseless map to themselves; everything
  // else goes through the compressed lower-case map.
  return IS_NOCASE_CHAR(aChar) ? aChar : gLowerMap.Map(aChar);
}

// Lower-cases aChar only when it is ASCII; any other character is returned
// untouched.
static NS_ALWAYS_INLINE PRUnichar
ToLowerCaseASCII_inline(const PRUnichar aChar)
{
  return IS_ASCII(aChar) ? gASCIIToLower[aChar] : aChar;
}

// Lower-cases aString in place.
void
ToLowerCase(nsAString& aString)
{
  // Fetch the writable buffer first, then the length, and convert the buffer
  // onto itself.
  PRUnichar *chars = aString.BeginWriting();
  const PRUint32 count = aString.Length();
  ToLowerCase(chars, chars, count);
}

void
ToLowerCase(const nsAString& aSource,
            nsAString& aDest)
{
  const PRUnichar *in;
  PRUnichar *out;
  PRUint32 len = NS_StringGetData(aSource, &in);
  NS_StringGetMutableData(aDest, len, &out);
  NS_ASSERTION(out, "Uh...");
  ToLowerCase(in, out, len);
}

// Out-of-line entry point for the ASCII-only lower-casing helper.
PRUnichar
ToLowerCaseASCII(const PRUnichar aChar)
{
  const PRUnichar lowered = ToLowerCaseASCII_inline(aChar);
  return lowered;
}

// Upper-cases aString in place.
void
ToUpperCase(nsAString& aString)
{
  // Fetch the writable buffer first, then the length, and convert the buffer
  // onto itself.
  PRUnichar *chars = aString.BeginWriting();
  const PRUint32 count = aString.Length();
  ToUpperCase(chars, chars, count);
}

void
ToUpperCase(const nsAString& aSource,
            nsAString& aDest)
{
  const PRUnichar *in;
  PRUnichar *out;
  PRUint32 len = NS_StringGetData(aSource, &in);
  NS_StringGetMutableData(aDest, len, &out);
  NS_ASSERTION(out, "Uh...");
  ToUpperCase(in, out, len);
}

#ifdef MOZILLA_INTERNAL_API

// Orders two UTF-16 buffers.  Buffers of different length are ordered by
// length alone (the longer one is greater); only equal-length buffers are
// compared character by character, ignoring case.
PRInt32
nsCaseInsensitiveStringComparator::operator()(const PRUnichar* lhs,
                                              const PRUnichar* rhs,
                                              PRUint32 lLength,
                                              PRUint32 rLength) const
{
  if (lLength != rLength)
    return (lLength > rLength) ? 1 : -1;

  return CaseInsensitiveCompare(lhs, rhs, lLength);
}

// Orders two UTF-8 buffers by delegating to the UTF-8 overload of
// CaseInsensitiveCompare; the lengths are byte counts.
PRInt32
nsCaseInsensitiveUTF8StringComparator::operator()(const char* lhs,
                                                  const char* rhs,
                                                  PRUint32 lLength,
                                                  PRUint32 rLength) const
{
  const PRInt32 order = CaseInsensitiveCompare(lhs, rhs, lLength, rLength);
  return order;
}

// Orders two UTF-16 buffers, folding case for ASCII characters only.
// Buffers of different length are ordered by length alone.
PRInt32
nsASCIICaseInsensitiveStringComparator::operator()(const PRUnichar* lhs,
                                                   const PRUnichar* rhs,
                                                   PRUint32 lLength,
                                                   PRUint32 rLength) const
{
  if (lLength != rLength)
    return (lLength > rLength) ? 1 : -1;

  for (PRUint32 i = 0; i < rLength; ++i) {
    const PRUnichar left = lhs[i];
    const PRUnichar right = rhs[i];

    // Identical code units need no folding.
    if (left == right)
      continue;

    const PRUnichar foldedLeft = ToLowerCaseASCII_inline(left);
    const PRUnichar foldedRight = ToLowerCaseASCII_inline(right);

    if (foldedLeft > foldedRight)
      return 1;
    if (foldedRight > foldedLeft)
      return -1;
  }

  return 0;
}

#endif // MOZILLA_INTERNAL_API

// Out-of-line entry point for single-character lower-casing.
PRUnichar
ToLowerCase(PRUnichar aChar)
{
  const PRUnichar lowered = ToLowerCase_inline(aChar);
  return lowered;
}

void
ToLowerCase(const PRUnichar *aIn, PRUnichar *aOut, PRUint32 aLen)
{
  for (PRUint32 i = 0; i < aLen; i++) {
    aOut[i] = ToLowerCase(aIn[i]);
  }
}

// Returns the upper-case form of aChar, or aChar itself when it has none.
PRUnichar
ToUpperCase(PRUnichar aChar)
{
  // ASCII: only 'a'..'z' change, and they sit 0x20 above their upper-case
  // counterparts.
  if (IS_ASCII(aChar))
    return IS_ASCII_LOWER(aChar) ? aChar - 0x20 : aChar;

  // Characters in a block flagged as caseless map to themselves.
  if (IS_NOCASE_CHAR(aChar))
    return aChar;

  return gUpperMap.Map(aChar);
}

void
ToUpperCase(const PRUnichar *aIn, PRUnichar *aOut, PRUint32 aLen)
{
  for (PRUint32 i = 0; i < aLen; i++) {
    aOut[i] = ToUpperCase(aIn[i]);
  }
}

// Returns the title-case form of aChar.  For most characters this is the
// upper-case form; gUpperToTitle lists the exceptions and is only consulted
// for characters in the range U+01C0..U+01FF.
PRUnichar
ToTitleCase(PRUnichar aChar)
{
  if (IS_ASCII(aChar))
    return ToUpperCase(aChar);

  if (IS_NOCASE_CHAR(aChar))
    return aChar;

  // An upper-case character that has a distinct title-case form (such as
  // U+01F1 DZ) is deliberately returned unchanged.
  if ((aChar & 0xFFC0) == 0x01C0) {
    for (PRUint32 row = 0; row < gUpperToTitleItems; ++row) {
      if (gUpperToTitle[(row*2)+kUpperIdx] == aChar)
        return aChar;
    }
  }

  const PRUnichar upper = gUpperMap.Map(aChar);

  // Outside the special range the upper-case form is the answer.
  if ((upper & 0xFFC0) != 0x01C0)
    return upper;

  // Inside it, swap the upper-case form for its title-case partner if the
  // table lists one.
  for (PRUint32 row = 0; row < gUpperToTitleItems; ++row) {
    if (gUpperToTitle[(row*2)+kUpperIdx] == upper)
      return gUpperToTitle[(row*2)+kTitleIdx];
  }

  return upper;
}

// Compares the first len UTF-16 code units of a and b, ignoring case.
// Returns -1, 0 or 1 depending on whether a sorts before, equal to or after b.
PRInt32
CaseInsensitiveCompare(const PRUnichar *a,
                       const PRUnichar *b,
                       PRUint32 len)
{
  NS_ASSERTION(a && b, "Do not pass in invalid pointers!");

  for (PRUint32 i = 0; i < len; ++i) {
    const PRUnichar left = a[i];
    const PRUnichar right = b[i];

    // Only pay for case folding when the raw code units differ.
    if (left == right)
      continue;

    const PRUnichar foldedLeft = ToLowerCase_inline(left);
    const PRUnichar foldedRight = ToLowerCase_inline(right);
    if (foldedLeft != foldedRight)
      return (foldedLeft < foldedRight) ? -1 : 1;
  }

  return 0;
}

// Decodes the UTF-8 sequence starting at aStr and returns its codepoint,
// lower-cased when it is encoded in one to three bytes.  On success aNext is
// set to the byte following the sequence.
//
// Returns PRUint32(-1) and leaves aNext untouched when the lead byte is not
// recognized or when the sequence would run past aEnd.  The caller must
// already have checked that aStr < aEnd.
//
// NOTE(review): only the lead byte is classified.  Continuation bytes are not
// checked for the 10XXXXXX form and overlong encodings are not rejected.
static NS_ALWAYS_INLINE PRUint32
GetLowerUTF8Codepoint(const char* aStr, const char* aEnd, const char **aNext)
{
  // Work on unsigned bytes so that widening them never sign-extends.
  const unsigned char *bytes = reinterpret_cast<const unsigned char*>(aStr);
  const unsigned char lead = bytes[0];

  // One byte: plain ASCII, answered from the lookup table.
  if (UTF8traits::isASCII(lead)) {
    *aNext = aStr + 1;
    return gASCIIToLower[lead];
  }

  // Two bytes: 110XXXXX 10XXXXXX.  Always inside the BMP.
  if (UTF8traits::is2byte(lead) && NS_LIKELY(aStr + 1 < aEnd)) {
    PRUint16 ch = PRUint16(((lead & 0x1F) << 6) |
                            (bytes[1] & 0x3F));

    if (!IS_NOCASE_CHAR(ch))
      ch = gLowerMap.Map(ch);

    *aNext = aStr + 2;
    return ch;
  }

  // Three bytes: 1110XXXX 10XXXXXX 10XXXXXX.  Exactly fills 16 bits.
  if (UTF8traits::is3byte(lead) && NS_LIKELY(aStr + 2 < aEnd)) {
    PRUint16 ch = PRUint16(((lead & 0x0F) << 12) |
                           ((bytes[1] & 0x3F) << 6) |
                            (bytes[2] & 0x3F));

    if (!IS_NOCASE_CHAR(ch))
      ch = gLowerMap.Map(ch);

    *aNext = aStr + 3;
    return ch;
  }

  // Four bytes: 11110XXX 10XXXXXX 10XXXXXX 10XXXXXX.  Unless the sequence is
  // overlong the codepoint lies outside the BMP, so no case conversion is
  // attempted.
  if (UTF8traits::is4byte(lead) && NS_LIKELY(aStr + 3 < aEnd)) {
    const PRUint32 ch = ((lead & 0x07) << 18) |
                        ((bytes[1] & 0x3F) << 12) |
                        ((bytes[2] & 0x3F) << 6) |
                         (bytes[3] & 0x3F);

    *aNext = aStr + 4;
    return ch;
  }

  // Unrecognized lead byte, or the sequence is cut off by aEnd.
  return PRUint32(-1);
}

// Compares two UTF-8 buffers codepoint by codepoint, ignoring case.  The
// lengths are byte counts.  Returns 1, 0 or -1.  A sequence that cannot be
// decoded on either side makes the function return -1.
PRInt32 CaseInsensitiveCompare(const char *aLeft,
                               const char *aRight,
                               PRUint32 aLeftBytes,
                               PRUint32 aRightBytes)
{
  const char *const leftStop = aLeft + aLeftBytes;
  const char *const rightStop = aRight + aRightBytes;
  const PRUint32 kDecodeFailed = PRUint32(-1);

  while (aLeft < leftStop && aRight < rightStop) {
    // Each successful decode advances the corresponding cursor.
    const PRUint32 leftPoint = GetLowerUTF8Codepoint(aLeft, leftStop, &aLeft);
    if (NS_UNLIKELY(leftPoint == kDecodeFailed))
      return -1;

    const PRUint32 rightPoint =
        GetLowerUTF8Codepoint(aRight, rightStop, &aRight);
    if (NS_UNLIKELY(rightPoint == kDecodeFailed))
      return -1;

    // Both codepoints are already lower-cased, so compare them directly.
    if (leftPoint != rightPoint)
      return (leftPoint > rightPoint) ? 1 : -1;
  }

  // One buffer is exhausted; whichever still has bytes left sorts after the
  // other.
  if (aLeft < leftStop)
    return 1;

  return (aRight < rightStop) ? -1 : 0;
}

// Decodes one UTF-8 character from each side and reports whether the two are
// equal ignoring case.
//
// On success *aErr is PR_FALSE and *aLeftNext / *aRightNext point just past
// the decoded characters.  If either side cannot be decoded, *aErr is set to
// PR_TRUE and PR_FALSE is returned; the right side is not decoded at all when
// the left side fails.
PRBool
CaseInsensitiveUTF8CharsEqual(const char* aLeft, const char* aRight,
                              const char* aLeftEnd, const char* aRightEnd,
                              const char** aLeftNext, const char** aRightNext,
                              PRBool* aErr)
{
  NS_ASSERTION(aLeftNext, "Out pointer shouldn't be null.");
  NS_ASSERTION(aRightNext, "Out pointer shouldn't be null.");
  NS_ASSERTION(aErr, "Out pointer shouldn't be null.");
  NS_ASSERTION(aLeft < aLeftEnd, "aLeft must be less than aLeftEnd.");
  NS_ASSERTION(aRight < aRightEnd, "aRight must be less than aRightEnd.");

  const PRUint32 kDecodeFailed = PRUint32(-1);

  const PRUint32 leftPoint =
      GetLowerUTF8Codepoint(aLeft, aLeftEnd, aLeftNext);
  if (NS_UNLIKELY(leftPoint == kDecodeFailed)) {
    *aErr = PR_TRUE;
    return PR_FALSE;
  }

  const PRUint32 rightPoint =
      GetLowerUTF8Codepoint(aRight, aRightEnd, aRightNext);
  if (NS_UNLIKELY(rightPoint == kDecodeFailed)) {
    *aErr = PR_TRUE;
    return PR_FALSE;
  }

  // Both sides decoded cleanly.
  *aErr = PR_FALSE;

  return leftPoint == rightPoint;
}