gecko-dev/intl/uconv/ucvlatin/nsUnicodeToTSCII.cpp

/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim:expandtab:shiftwidth=2:tabstop=2:
 */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "nsUnicodeToTSCII.h"
#include "nsMemory.h"
#include "tamil.h"

/*
 *  TSCII is an 8-bit encoding consisting of:
 *  0x00..0x7F:       ASCII
 *  0x80..0x90, 0x95..0x9F, 0xAB..0xFE:
 *                    Tamil letters and glyphs
 *  0xA1..0xA5, 0xAA: Tamil combining letters (after the base character)
 *  0xA6..0xA8:       Tamil combining letters (before the base character)
 *  0x91..0x94:       Punctuation
 *  0xA9:             Symbols
 */

//----------------------------------------------------------------------
// Class nsUnicodeToTSCII [implementation]

// nsISupports boilerplate for nsUnicodeToTSCII, listing nsIUnicodeEncoder
// as its single interface.
NS_IMPL_ISUPPORTS1(nsUnicodeToTSCII, nsIUnicodeEncoder)

/*
 * During Unicode to TSCII conversion, mBuffer holds output that has been
 * decided on but not yet written (one byte, or in one case two bytes),
 * because the next input character may still combine with it.
 * This can be:
 *   0x00                     Nothing pending.
 *   0xB8..0xC9, 0x83..0x86   A consonant.
 *   0xEC, 0x8A               A consonant with VIRAMA sign (final or joining).
 *   0x87, 0xC38A             Two consonants combined through a VIRAMA sign.
 */

// Direct single-byte mapping from the Tamil Unicode block to TSCII.
// Convert() indexes this table with (ch - UNI_TAMIL_START); the trailing
// comment on each row gives the Unicode range that row covers.
// A value of 0 means "no direct single-byte mapping" and is handled
// separately by Convert() (two-part vowel signs or a no-mapping error).
static const uint8_t UnicharToTSCII[] =
{
     0,    0,    0, 0xb7,    0, 0xab, 0xac, 0xfe, // 0x0B80..0x0B87
  0xae, 0xaf, 0xb0,    0,    0,    0, 0xb1, 0xb2, // 0x0B88..0x0B8F
  0xb3,    0, 0xb4, 0xb5, 0xb6, 0xb8,    0,    0, // 0x0B90..0x0B97
     0, 0xb9, 0xba,    0, 0x83,    0, 0xbb, 0xbc, // 0x0B98..0x0B9F
     0,    0,    0, 0xbd, 0xbe,    0,    0,    0, // 0x0BA0..0x0BA7
  0xbf, 0xc9, 0xc0,    0,    0,    0, 0xc1, 0xc2, // 0x0BA8..0x0BAF
  0xc3, 0xc8, 0xc4, 0xc7, 0xc6, 0xc5,    0, 0x84, // 0x0BB0..0x0BB7
  0x85, 0x86,    0,    0,    0,    0, 0xa1, 0xa2, // 0x0BB8..0x0BBF
  0xa3, 0xa4, 0xa5,    0,    0,    0, 0xa6, 0xa7, // 0x0BC0..0x0BC7
  0xa8,    0,    0,    0,    0,    0,    0,    0, // 0x0BC8..0x0BCF
     0,    0,    0,    0,    0,    0,    0, 0xaa, // 0x0BD0..0x0BD7
     0,    0,    0,    0,    0,    0,    0,    0, // 0x0BD8..0x0BDF
     0,    0,    0,    0,    0,    0, 0x80, 0x81, // 0x0BE0..0x0BE7
  0x8d, 0x8e, 0x8f, 0x90, 0x95, 0x96, 0x97, 0x98, // 0x0BE8..0x0BEF
  0x9d, 0x9e, 0x9f,    0,    0,    0,    0,    0, // 0x0BF0..0x0BF7
     0,    0,    0,    0,    0,    0,    0,    0  // 0x0BF8..0x0BFF
};

// TSCII byte emitted for a buffered consonant followed by UNI_VOWELSIGN_U.
// Convert() indexes it with (consonant - TSC_KA), and only for consonants
// that satisfy IS_TSC_CONSONANT1.
static const uint8_t consonant_with_u[] =
{
  0xcc, 0x99, 0xcd, 0x9a, 0xce, 0xcf, 0xd0, 0xd1, 0xd2,
  0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb
};

// TSCII byte emitted for a buffered consonant followed by UNI_VOWELSIGN_UU.
// Convert() indexes it with (consonant - TSC_KA), and only for consonants
// that satisfy IS_TSC_CONSONANT1.
static const uint8_t consonant_with_uu[] =
{
  0xdc, 0x9b, 0xdd, 0x9c, 0xde, 0xdf, 0xe0, 0xe1, 0xe2,
  0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb
};

// TSCII byte emitted for a buffered consonant followed by UNI_VIRAMA.
// Convert() indexes it with (consonant - TSC_KA) for consonants that
// satisfy IS_TSC_CONSONANT1; other consonants use (consonant + 5) instead.
static const uint8_t consonant_with_virama[18] =
{
  0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2, 0xf3, 0xf4,
  0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd
};


// Modified implementation of Unicode to TSCII converter in glibc by
// Bruno Haible.  My modifications are based on Unicode 3.0 chap. 9 and
// the code chart for Tamil.
//
// Converts the UTF-16 code units in aSrc to TSCII bytes in aDest.
//
//   aSrc         input code units.
//   aSrcLength   in:  number of code units available in aSrc.
//                out: number of code units consumed.
//   aDest        output buffer.
//   aDestLength  in:  capacity of aDest in bytes.
//                out: number of bytes written.
//
// Returns
//   NS_OK                   all input was consumed and any pending output
//                           was flushed.
//   NS_OK_UENC_MOREOUTPUT   aDest is too small; pending state stays in
//                           mBuffer so the caller can continue with the
//                           remaining input and a fresh output buffer.
//   NS_ERROR_UENC_NOMAPPING a character has no TSCII equivalent; the
//                           consumed count includes that character so the
//                           caller can identify and skip it.
//
// TSCII stores glyphs in visual order, so a consonant is held back in
// mBuffer until the following character shows whether it has to be
// combined, reordered, or simply emitted.
NS_IMETHODIMP
nsUnicodeToTSCII::Convert(const PRUnichar * aSrc, int32_t * aSrcLength,
                          char * aDest, int32_t * aDestLength)
{
  const PRUnichar * src = aSrc;
  const PRUnichar * srcEnd = aSrc + *aSrcLength;
  char * dest = aDest;
  char * destEnd = dest + *aDestLength;

  // The loop condition guarantees room for one output byte per pass;
  // paths that write more than one byte check the space explicitly.
  while (src < srcEnd && dest < destEnd) {
    PRUnichar ch = *src;
    if (mBuffer) {
      // Attempt to combine the last character with this one.
      uint32_t last = mBuffer;

      // last : consonant
      if (IS_TSC_CONSONANT(last)) {
        if (ch == UNI_VOWELSIGN_U && IS_TSC_CONSONANT1(last)) {
          *dest++ = consonant_with_u[last - TSC_KA];
          mBuffer = 0;
          ++src;
          continue;
        }

        if (ch == UNI_VOWELSIGN_UU && IS_TSC_CONSONANT1(last)) {
          *dest++ = consonant_with_uu[last - TSC_KA];
          mBuffer = 0;
          ++src;
          continue;
        }

        // reorder. vowel sign goes to the left of consonant
        if (IS_UNI_LEFT_VOWELSIGN(ch)) {
          if (dest + 2 > destEnd)
            goto error_more_output;
          *dest++ = TSC_LEFT_VOWELSIGN(ch);
          *dest++ = last;
          mBuffer = 0;
          ++src;
          continue;
        }

        // split and reorder. consonant goes bet. two parts
        if (IS_UNI_2PARTS_VOWELSIGN(ch)) {
          if (dest + 3 > destEnd)
            goto error_more_output;
          *dest++ = TSC_LEFT_VOWEL_PART(ch);
          *dest++ = last;
          *dest++ = TSC_RIGHT_VOWEL_PART(ch);
          mBuffer = 0;
          ++src;
          continue;
        }

        // Virama
        if (ch == UNI_VIRAMA) {
          // consonant KA can form a conjunct with consonant SSA(SHA).
          // buffer dead consonant 'K' for the now.
          if (last == TSC_KA) {
            mBuffer = TSC_KA_DEAD;
          }
          // SA can form a conjunct when followed by 'RA'.
          // buffer dead consonant 'S' for the now.
          else if (last == TSC_SA) {
            mBuffer = TSC_SA_DEAD;
          }
          else {
            *dest++ = IS_TSC_CONSONANT1(last) ?
              consonant_with_virama[last - TSC_KA] : last + 5;
            mBuffer = 0;
          }
          ++src;
          continue;
        }

        // consonant TA forms a ligature with vowel 'I' or 'II'.
        if (last == TSC_TA && (ch == UNI_VOWELSIGN_I || ch == UNI_VOWELSIGN_II)) {
          *dest++ = ch - (UNI_VOWELSIGN_I - TSC_TI_LIGA);
          mBuffer = 0;
          ++src;
          continue;
        }
      }
      else if (last == TSC_KA_DEAD) {
        // Kd + SSA =  K.SSA
        if (ch == UNI_SSA) {
          mBuffer = TSC_KSSA;
          ++src;
          continue;
        }
      }
      else if (last == TSC_SA_DEAD) {
        // Sd + RA = S.RA. Buffer RA + Sd.
        // (RA sits in the high byte, dead SA in the low byte.)
        if (ch == UNI_RA) {
          mBuffer = 0xc38a;
          ++src;
          continue;
        }
      }
      else if (last == TSC_KSSA) {
        if (ch == UNI_VIRAMA) {
          *dest++ = (char) TSC_KSSA_DEAD;
          mBuffer = 0;
          ++src;
          continue;
        }

        // vowel splitting/reordering should be done around conjuncts as well.
        // reorder. vowel sign goes to the left of consonant
        if (IS_UNI_LEFT_VOWELSIGN(ch)) {
          if (dest + 2 > destEnd)
            goto error_more_output;
          *dest++ = TSC_LEFT_VOWELSIGN(ch);
          *dest++ = last;
          mBuffer = 0;
          ++src;
          continue;
        }

        // split and reorder. consonant goes bet. two parts
        if (IS_UNI_2PARTS_VOWELSIGN(ch)) {
          if (dest + 3 > destEnd)
            goto error_more_output;
          *dest++ = TSC_LEFT_VOWEL_PART(ch);
          *dest++ = last;
          *dest++ = TSC_RIGHT_VOWEL_PART(ch);
          mBuffer = 0;
          ++src;
          continue;
        }
      }
      else {
        NS_ASSERTION(last == 0xc38a, "No other value can be buffered");
        if (ch == UNI_VOWELSIGN_II) {
          *dest++ = (char) TSC_SRII_LIGA;
          mBuffer = 0;
          ++src;
          continue;
        }

        // No ligature after all: emit the dead SA now and keep RA pending.
        // src is deliberately NOT advanced, so that ch is examined again
        // on the next pass with RA as the buffered consonant (it may be a
        // vowel sign or virama that combines with RA, or any other
        // character that must not be lost).
        *dest++ = (char) TSC_SA_DEAD;
        mBuffer = TSC_RA;
        continue;
      }

      /* Output the buffered character; ch did not combine with it and is
         processed again on the next pass (src is not advanced).  */
      if (last >> 8) {
        // Two pending bytes are stored second-byte-high, so the low byte
        // goes out first.
        if (dest + 2 >  destEnd)
          goto error_more_output;
        *dest++ = last & 0xff;
        *dest++ = (last >> 8) & 0xff;
      }
      else
        *dest++ = last & 0xff;
      mBuffer = 0;
      continue;
    }

    if (ch < 0x80)   // Plain ASCII character.
      *dest++ = (char)ch;
    else if (IS_UNI_TAMIL(ch)) {
      uint8_t t = UnicharToTSCII[ch - UNI_TAMIL_START];

      if (t != 0) {
          // Consonants are held back: the next character may combine.
          if (IS_TSC_CONSONANT(t))
            mBuffer = (uint32_t) t;
          else
            *dest++ = t;
      }
      else if (IS_UNI_2PARTS_VOWELSIGN(ch)) {
          // actually this is an illegal sequence.
          if (dest + 2 > destEnd)
            goto error_more_output;

          *dest++ = TSC_LEFT_VOWEL_PART(ch);
          *dest++ = TSC_RIGHT_VOWEL_PART(ch);
      }
      else {
        // Report the unmappable character as consumed so that the caller
        // can locate it at the end of the consumed range.
        *aSrcLength = src - aSrc + 1;
        *aDestLength = dest - aDest;
        return NS_ERROR_UENC_NOMAPPING;
      }
    }
    else if (ch == 0x00A9)
      *dest++ = (char)ch;
    else if (IS_UNI_SINGLE_QUOTE(ch))
      *dest++ = ch - UNI_LEFT_SINGLE_QUOTE + TSC_LEFT_SINGLE_QUOTE;
    else if (IS_UNI_DOUBLE_QUOTE(ch))
      *dest++ = ch - UNI_LEFT_DOUBLE_QUOTE + TSC_LEFT_DOUBLE_QUOTE;
    else {
      // Same contract as above: the offending character counts as consumed.
      *aSrcLength = src - aSrc + 1;
      *aDestLength = dest - aDest;
      return NS_ERROR_UENC_NOMAPPING;
    }

    /* Now that we wrote the output increment the input pointer.  */
    ++src;
  }

  // flush the buffer
  if (mBuffer >> 8) {
    // Write out the pending pair. The low byte (dead SA) precedes the
    // high byte (RA), matching the order used inside the loop.
    if (dest + 2 > destEnd)
      goto error_more_output;
    *dest++ = mBuffer & 0xff;
    *dest++ = (mBuffer >> 8) & 0xff;
    mBuffer = 0;
  }
  else if (mBuffer) {
    // Write out the last character, a single byte.
    if (dest >= destEnd)
      goto error_more_output;
    *dest++ = mBuffer & 0xff;
    mBuffer = 0;
  }

  *aSrcLength = src - aSrc;
  *aDestLength = dest - aDest;
  // The loop also ends when aDest fills up; unconsumed input then means
  // the caller has to come back with more output space.
  return (src < srcEnd) ? NS_OK_UENC_MOREOUTPUT : NS_OK;

error_more_output:
  *aSrcLength = src - aSrc;
  *aDestLength = dest - aDest;
  return NS_OK_UENC_MOREOUTPUT;
}

// Writes out whatever is still pending in mBuffer at the end of a
// conversion.
//
//   aDest        output buffer.
//   aDestLength  in:  capacity of aDest in bytes.
//                out: number of bytes written (0, 1 or 2).
//
// Returns NS_OK on success (including when nothing was pending) and
// NS_OK_UENC_MOREOUTPUT, with mBuffer left untouched, when aDest cannot
// hold the pending bytes.
NS_IMETHODIMP
nsUnicodeToTSCII::Finish(char* aDest, int32_t* aDestLength)
{
  if (!mBuffer) {
    *aDestLength = 0;
    return NS_OK;
  }

  // A non-zero high byte means two bytes are pending.
  const int32_t pendingBytes = (mBuffer >> 8) ? 2 : 1;
  if (*aDestLength < pendingBytes) {
    *aDestLength = 0;
    return NS_OK_UENC_MOREOUTPUT;
  }

  // The two-byte value keeps the byte that must come first (dead SA) in
  // its low byte and the following RA in its high byte, so the low byte is
  // written first. Convert() emits the pair in the same order when it has
  // to split it.
  *aDest++ = mBuffer & 0xff;
  if (pendingBytes == 2)
    *aDest++ = (mBuffer >> 8) & 0xff;

  mBuffer = 0;
  *aDestLength = pendingBytes;
  return NS_OK;
}

//================================================================
// Discards any pending output so the next Convert() call starts from a
// clean state. Always returns NS_OK.
NS_IMETHODIMP
nsUnicodeToTSCII::Reset()
{
  mBuffer = 0;  // nothing pending
  return NS_OK;
}

// Reports an output buffer size, in bytes, for converting aSrcLength
// code units: twice the input length. aSrc itself is not inspected.
// Always returns NS_OK.
NS_IMETHODIMP
nsUnicodeToTSCII::GetMaxLength(const PRUnichar * aSrc, int32_t aSrcLength,
                                 int32_t * aDestLength)
{
  // Some Tamil letters can be decomposed into 2 glyphs in TSCII, hence
  // two output bytes are budgeted per input code unit.
  const int32_t bytesPerUnit = 2;
  *aDestLength = bytesPerUnit * aSrcLength;
  return NS_OK;
}


// No-op for the plain TSCII encoder: all three arguments are ignored and
// nothing is stored. Always returns NS_OK.
NS_IMETHODIMP
nsUnicodeToTSCII::SetOutputErrorBehavior(int32_t aBehavior,
                                           nsIUnicharEncoder *aEncoder,
                                           PRUnichar aChar)
{
  // Intentionally empty.
  return NS_OK;
}


// same as the mapping of the C1(0x80-0x9f) part of  Windows-1252 to Unicode
//
// nsUnicodeToTamilTTF::Convert() looks up TSCII bytes 0x80..0x9f here,
// using (byte & 0x7f) as the index, to obtain the 16-bit value written to
// the output; the table therefore has 32 entries.
const static PRUnichar gTSCIIToTTF[] = {
  0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
  0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F,
  0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
  0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178
};

//----------------------------------------------------------------------
// Class nsUnicodeToTamilTTF [implementation]
//
// nsISupports boilerplate: nsUnicodeToTamilTTF derives from
// nsUnicodeToTSCII and lists no additional interfaces of its own here.
NS_IMPL_ISUPPORTS_INHERITED0(nsUnicodeToTamilTTF, nsUnicodeToTSCII)

// Converts UTF-16 input to the 16-bit pseudo-Unicode encoding used by
// TSCII-based Tamil fonts: the input is first encoded to TSCII by
// nsUnicodeToTSCII::Convert(), then each TSCII byte is widened to two
// output bytes (high byte first).
//
//   aSrc         input code units.
//   aSrcLength   in:  number of code units available.
//                out: number consumed, as reported by the TSCII converter.
//   aDest        output buffer.
//   aDestLength  in:  capacity of aDest in bytes.
//                out: number of bytes written.
//
// Returns the status of the TSCII conversion (NS_OK,
// NS_OK_UENC_MOREOUTPUT or NS_ERROR_UENC_NOMAPPING), or
// NS_ERROR_OUT_OF_MEMORY when the intermediate buffer cannot be allocated.
// Output produced before a no-mapping error is still delivered.
NS_IMETHODIMP
nsUnicodeToTamilTTF::Convert(const PRUnichar * aSrc,
                             int32_t * aSrcLength, char * aDest,
                             int32_t * aDestLength)
{
  int32_t maxLen;
  GetMaxLength(aSrc, *aSrcLength, &maxLen);
  NS_ASSERTION(maxLen  <= *aDestLength, "insufficient dest. buffer size");

  // Never produce more than aDest can hold. The assertion above is only a
  // diagnostic; when the caller's buffer is smaller than the worst case the
  // intermediate buffer is limited accordingly, and the TSCII converter
  // reports the shortfall through its return value.
  int32_t usable = *aDestLength < 0 ? 0 : *aDestLength;
  if (usable > maxLen)
    usable = maxLen;

  // TSCII converter is a single byte encoder and takes half the space
  // taken by TamilTTF encoder.
  int32_t medLen = usable / 2;

  char *med = mStaticBuffer;
  if (medLen > CHAR_BUFFER_SIZE) {
    med = (char *) nsMemory::Alloc(medLen);
    if (!med)
      return NS_ERROR_OUT_OF_MEMORY;
  }

  nsresult rv = nsUnicodeToTSCII::Convert(aSrc, aSrcLength, med, &medLen);

  // A no-mapping error still leaves valid output for the characters that
  // precede the offending one (medLen reflects it), so only bail out on
  // other failures.
  if (NS_FAILED(rv) && rv != NS_ERROR_UENC_NOMAPPING) {
    if (med != mStaticBuffer)
      nsMemory::Free(med);
    return rv;
  }

  // widen 8bit TSCII to pseudo-Unicode font encoding of TSCII-Tamil font
  int32_t j = 0;
  for (int32_t i = 0; i < medLen; i++) {
    // Only C1 part(0x80-0x9f) needs to be mapped as if they're
    // Windows-1252 (CP1252).
    PRUnichar ucs2 = (med[i] & 0xe0) == 0x80 ?
                     gTSCIIToTTF[med[i] & 0x7f] : uint8_t(med[i]);
    // A lot of TSCII fonts are still based on TSCII 1.6 so that
    // they have Tamil vowel 'I' at 0xad instead of 0xfe.
    if (ucs2 == 0xfe) ucs2 = 0xad;
    aDest[j++] = uint8_t((ucs2 & 0xff00) >> 8);
    aDest[j++] = uint8_t(ucs2 & 0x00ff);
  }

  *aDestLength = j;

  if (med != mStaticBuffer)
    nsMemory::Free(med);

  // Propagate the converter's status instead of unconditionally claiming
  // success.
  return rv;
}

// Reports an output buffer size, in bytes, for converting aSrcLength
// code units: four bytes for each of (aSrcLength + 1) units. aSrc itself
// is not inspected. Always returns NS_OK.
NS_IMETHODIMP
nsUnicodeToTamilTTF::GetMaxLength(const PRUnichar * aSrc, int32_t aSrcLength, int32_t * aDestLength)
{
  // Up to two presentation forms per Tamil character, each widened to a
  // 16-bit value: 2 * 2 bytes.
  const int32_t bytesPerUnit = 4;
  *aDestLength = bytesPerUnit * (aSrcLength + 1);

  return NS_OK;
}

// Records how unmappable characters should be handled.
//
//   aBehavior  error behavior selector, stored in mErrBehavior.
//   aEncoder   fallback encoder, stored in mErrEncoder; must be non-null
//              when aBehavior is kOnError_CallBack.
//   aChar      character stored in mErrChar.
//
// Returns NS_ERROR_NULL_POINTER (storing nothing) for the callback
// behavior without an encoder, NS_OK otherwise.
NS_IMETHODIMP
nsUnicodeToTamilTTF::SetOutputErrorBehavior(int32_t aBehavior,
                                            nsIUnicharEncoder *aEncoder,
                                            PRUnichar aChar)
{
  const bool callbackWithoutEncoder =
    (aBehavior == kOnError_CallBack) && (aEncoder == nullptr);
  if (callbackWithoutEncoder) {
    return NS_ERROR_NULL_POINTER;
  }

  mErrBehavior = aBehavior;
  mErrEncoder = aEncoder;
  mErrChar = aChar;
  return NS_OK;
}