pjs/intl/uconv/ucvcn/nsGBKToUnicode.cpp

/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* ***** BEGIN LICENSE BLOCK *****
 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
 *
 * The contents of this file are subject to the Mozilla Public License Version
 * 1.1 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the
 * License.
 *
 * The Original Code is mozilla.org code.
 *
 * The Initial Developer of the Original Code is
 * Netscape Communications Corporation.
 * Portions created by the Initial Developer are Copyright (C) 1998
 * the Initial Developer. All Rights Reserved.
 *
 * Contributor(s):
 *
 * Alternatively, the contents of this file may be used under the terms of
 * either of the GNU General Public License Version 2 or later (the "GPL"),
 * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
 * in which case the provisions of the GPL or the LGPL are applicable instead
 * of those above. If you wish to allow use of your version of this file only
 * under the terms of either the GPL or the LGPL, and not to allow others to
 * use your version of this file under the terms of the MPL, indicate your
 * decision by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL or the LGPL. If you do not delete
 * the provisions above, a recipient may use your version of this file under
 * the terms of any one of the MPL, the GPL or the LGPL.
 *
 * ***** END LICENSE BLOCK ***** */
/**
 * A character set converter from GBK to Unicode.
 *
 *
 * @created         07/Sept/1999
 * @author  Yueheng Xu, Yueheng.Xu@intel.com
 */

#include "nsGBKToUnicode.h"
#include "nsUCvCnDll.h"
#include "gbku.h"


//------------------------------------------------------------
// nsGBKUnique2BytesToUnicode
//------------------------------------------------------------
// Table-driven decoder used as the GBK "extension" decoder: it is
// instantiated by nsGBKToUnicode::CreateExtensionDecoder() and fed the
// two-byte sequences that the common GBK mapping could not resolve.
class nsGBKUnique2BytesToUnicode : public nsTableDecoderSupport
{
public:
  nsGBKUnique2BytesToUnicode();
  virtual ~nsGBKUnique2BytesToUnicode() {}
};

// Mapping table for the GBK-only two-byte code points; the contents are
// pulled in verbatim from the gbkuniq2b.ut data file.
static const PRUint16 g_utGBKUnique2Bytes[] = {
#include "gbkuniq2b.ut"
};

// Binds the base table decoder to the two-byte charset scanner and the
// GBK-unique mapping table above.
// NOTE(review): the meaning of the trailing `1` argument is defined by
// nsTableDecoderSupport, which is not visible here -- confirm before changing.
nsGBKUnique2BytesToUnicode::nsGBKUnique2BytesToUnicode()
  : nsTableDecoderSupport(u2BytesCharset,
                          nsnull,
                          (uMappingTable*) &g_utGBKUnique2Bytes,
                          1)
{}

//------------------------------------------------------------
// nsGB18030Unique2BytesToUnicode
//------------------------------------------------------------
// Table-driven decoder used as the GB18030 "extension" decoder: it is
// instantiated by nsGB18030ToUnicode::CreateExtensionDecoder() and handles
// the two-byte sequences that the common GBK mapping could not resolve.
class nsGB18030Unique2BytesToUnicode : public nsTableDecoderSupport
{
public:
  nsGB18030Unique2BytesToUnicode();
  virtual ~nsGB18030Unique2BytesToUnicode() {}
};

// Mapping table for the GB18030-only two-byte code points; the contents are
// pulled in verbatim from the gb18030uniq2b.ut data file.
static const PRUint16 g_utGB18030Unique2Bytes[] = {
#include "gb18030uniq2b.ut"
};

// Binds the base table decoder to the two-byte charset scanner and the
// GB18030-unique two-byte mapping table above.
// NOTE(review): the meaning of the trailing `1` argument is defined by
// nsTableDecoderSupport, which is not visible here -- confirm before changing.
nsGB18030Unique2BytesToUnicode::nsGB18030Unique2BytesToUnicode()
  : nsTableDecoderSupport(u2BytesCharset,
                          nsnull,
                          (uMappingTable*) &g_utGB18030Unique2Bytes,
                          1)
{}

//------------------------------------------------------------
// nsGB18030Unique4BytesToUnicode
//------------------------------------------------------------
// Table-driven decoder for GB18030 four-byte sequences: it is instantiated
// by nsGB18030ToUnicode::Create4BytesDecoder() and used for four-byte
// sequences whose first byte is outside the surrogate (0x90-0xFE) range.
class nsGB18030Unique4BytesToUnicode : public nsTableDecoderSupport
{
public:
  nsGB18030Unique4BytesToUnicode();
  virtual ~nsGB18030Unique4BytesToUnicode() {}
};

// Mapping table for GB18030 four-byte code points; the contents are pulled
// in verbatim from the gb180304bytes.ut data file.
static const PRUint16 g_utGB18030Unique4Bytes[] = {
#include "gb180304bytes.ut"
};

// Binds the base table decoder to the GB18030 four-byte charset scanner and
// the four-byte mapping table above.
// NOTE(review): the meaning of the trailing `1` argument is defined by
// nsTableDecoderSupport, which is not visible here -- confirm before changing.
nsGB18030Unique4BytesToUnicode::nsGB18030Unique4BytesToUnicode()
  : nsTableDecoderSupport(u4BytesGB18030Charset,
                          nsnull,
                          (uMappingTable*) &g_utGB18030Unique4Bytes,
                          1)
{}


//----------------------------------------------------------------------
// Class nsGBKToUnicode [implementation]

//----------------------------------------------------------------------
// Subclassing of nsTablesDecoderSupport class [implementation]

// Byte-range predicates shared by the decoder routines in this file.
// Every range is inclusive at both ends.

// Lead byte of any multi-byte (two- or four-byte) sequence: 0x81-0xFE.
#define LEGAL_GBK_MULTIBYTE_FIRST_BYTE(c) (UINT8_IN_RANGE(0x81, (c), 0xFE))

// Lead byte of a four-byte sequence that ConvertNoBuff routes to
// DecodeToSurrogate instead of the four-byte table decoder: 0x90-0xFE.
#define FIRST_BYTE_IS_SURROGATE(c) (UINT8_IN_RANGE(0x90, (c), 0xFE))

// Trail byte of a two-byte sequence: 0x40-0x7E or 0x80-0xFE.
#define LEGAL_GBK_2BYTE_SECOND_BYTE(c) \
      (UINT8_IN_RANGE(0x40, (c), 0x7E)|| UINT8_IN_RANGE(0x80, (c), 0xFE))

// Bytes two, three and four of a four-byte sequence:
// [0x30-0x39] [0x81-0xFE] [0x30-0x39].
#define LEGAL_GBK_4BYTE_SECOND_BYTE(c) (UINT8_IN_RANGE(0x30, (c), 0x39))
#define LEGAL_GBK_4BYTE_THIRD_BYTE(c) (UINT8_IN_RANGE(0x81, (c), 0xFE))
#define LEGAL_GBK_4BYTE_FORTH_BYTE(c) (UINT8_IN_RANGE(0x30, (c), 0x39))

/**
 * Decode a run of GBK / GB18030 bytes into UTF-16 code units.
 *
 * @param aSrc        input bytes
 * @param aSrcLength  [in] number of bytes available at aSrc;
 *                    [out] number of bytes actually consumed
 * @param aDest       output buffer
 * @param aDestLength [in] capacity of aDest in PRUnichar units;
 *                    [out] number of PRUnichar units written
 * @return NS_OK when all input was consumed,
 *         NS_OK_UDEC_MOREOUTPUT when aDest filled up before the input ended,
 *         NS_OK_UDEC_MOREINPUT when the input ends inside a multi-byte
 *         sequence.
 *
 * Byte sequences that cannot be mapped produce UCS2_NO_MAPPING.
 */
NS_IMETHODIMP nsGBKToUnicode::ConvertNoBuff(const char* aSrc,
                                            PRInt32 * aSrcLength,
                                            PRUnichar *aDest,
                                            PRInt32 * aDestLength)
{
  PRInt32 i=0;
  PRInt32 iSrcLength = (*aSrcLength);
  PRInt32 iDestlen = 0;
  nsresult rv=NS_OK;
  *aSrcLength = 0;

  for (i=0;i<iSrcLength;i++)
  {
    // Every iteration writes at least one PRUnichar, so stop as soon as the
    // output buffer is full.
    if ( iDestlen >= (*aDestLength) )
    {
      rv = NS_OK_UDEC_MOREOUTPUT;
      break;
    }
    // The valid range for the 1st byte is [0x81,0xFE]
    if(LEGAL_GBK_MULTIBYTE_FIRST_BYTE(*aSrc))
    {
      if(i+1 >= iSrcLength)
      {
        rv = NS_OK_UDEC_MOREINPUT;
        break;
      }
      // To make sure, the second byte has to be checked as well.
      // In GBK, the second byte range is [0x40,0x7E] and [0x80,0XFE]
      if(LEGAL_GBK_2BYTE_SECOND_BYTE(aSrc[1]))
      {
        // Valid GBK code
        *aDest = mUtil.GBKCharToUnicode(aSrc[0], aSrc[1]);
        if(UCS2_NO_MAPPING == *aDest)
        {
          // We cannot map in the common mapping, let's call the
          // delegate 2 byte decoder to decode the gbk or gb18030 unique
          // 2 byte mapping
          if(! TryExtensionDecoder(aSrc, aDest))
          {
            *aDest = UCS2_NO_MAPPING;
          }
        }
        aSrc += 2;
        i++;
      }
      else if (LEGAL_GBK_4BYTE_SECOND_BYTE(aSrc[1]))
      {
        // from the first 2 bytes, it looks like a 4 byte GB18030
        if(i+3 >= iSrcLength)  // make sure we got 4 bytes
        {
          rv = NS_OK_UDEC_MOREINPUT;
          break;
        }
        // 4 byte pattern:
        // [0x81-0xfe][0x30-0x39][0x81-0xfe][0x30-0x39]

        if (LEGAL_GBK_4BYTE_THIRD_BYTE(aSrc[2]) &&
            LEGAL_GBK_4BYTE_FORTH_BYTE(aSrc[3]))
        {
           if ( ! FIRST_BYTE_IS_SURROGATE(aSrc[0]))
           {
             // let's call the delegated 4 byte gb18030 converter to convert it
             if(! Try4BytesDecoder(aSrc, aDest))
               *aDest = UCS2_NO_MAPPING;
           } else {
             // Supplementary mapping: DecodeToSurrogate() writes TWO
             // PRUnichars (aDest[0] and aDest[1]), so two free slots are
             // required. The loop-top check only guarantees one.
             if ( (iDestlen+1) < (*aDestLength) )
             {
               if(DecodeToSurrogate(aSrc, aDest))
               {
                 // surrogate pair: account for the extra PRUnichar here, the
                 // common tail of the loop accounts for the other one
                 iDestlen++;
                 aDest++;
               }  else {
                 *aDest = UCS2_NO_MAPPING;
               }
             } else if ( (*aDestLength) < 2 ) {
               // The caller's buffer can never hold a surrogate pair; emit a
               // replacement character so the conversion still makes
               // progress instead of asking for more output forever.
               NS_ASSERTION((*aDestLength) >= 2,
                            "output buffer cannot hold a surrogate pair");
               *aDest = UCS2_NO_MAPPING;
             } else {
               // Only one slot left: leave this sequence unconsumed and let
               // the caller come back with a fresh output buffer.
               rv = NS_OK_UDEC_MOREOUTPUT;
               break;
             }
           }
        } else {
          // NOTE(review): a malformed four-byte sequence still consumes all
          // four bytes below, although byte 2 is an ASCII digit and bytes
          // 3-4 may belong to following characters. Consider resynchronizing
          // on the second byte instead.
          *aDest = UCS2_NO_MAPPING;
        }
        aSrc += 4;
        i+=3;
      }
      else if ((PRUint8) aSrc[0] == (PRUint8)0xA0 )
      {
        // stand-alone (not followed by a valid second byte) 0xA0 !
        // treat it as valid a la Netscape 4.x
        *aDest = CAST_CHAR_TO_UNICHAR(*aSrc);
        aSrc++;
      } else {
        // Invalid GBK code point (second byte should be 0x40 or higher)
        *aDest = UCS2_NO_MAPPING;
        aSrc++;
      }
    } else {
      if(IS_ASCII(*aSrc))
      {
        // The source is an ASCII
        *aDest = CAST_CHAR_TO_UNICHAR(*aSrc);
        aSrc++;
      } else {
        if(IS_GBK_EURO(*aSrc)) {
          *aDest = UCS2_EURO;
        } else {
          *aDest = UCS2_NO_MAPPING;
        }
        aSrc++;
      }
    }
    // Common tail: one PRUnichar was produced by the branch above.
    iDestlen++;
    aDest++;
    // i indexes the last byte of the sequence just consumed.
    *aSrcLength = i+1;
  }
  *aDestLength = iDestlen;
  return rv;
}


// Instantiates the GBK-specific two-byte table decoder and stores it in
// mExtensionDecoder. TryExtensionDecoder() calls this when
// mExtensionDecoder is still null.
void nsGBKToUnicode::CreateExtensionDecoder()
{
  mExtensionDecoder =
      new nsGBKUnique2BytesToUnicode();
}
// Plain GBK has no four-byte decoder: the member is left null, which makes
// Try4BytesDecoder() return PR_FALSE.
void nsGBKToUnicode::Create4BytesDecoder()
{
  m4BytesDecoder = nsnull;
}
// Instantiates the GB18030-specific two-byte table decoder and stores it in
// mExtensionDecoder.
void nsGB18030ToUnicode::CreateExtensionDecoder()
{
  mExtensionDecoder =
      new nsGB18030Unique2BytesToUnicode();
}
// Instantiates the GB18030 four-byte table decoder and stores it in
// m4BytesDecoder.
void nsGB18030ToUnicode::Create4BytesDecoder()
{
  m4BytesDecoder =
      new nsGB18030Unique4BytesToUnicode();
}
/**
 * Decode a four-byte GB18030 sequence whose first byte is in 0x90-0xFE into
 * a UTF-16 surrogate pair.
 *
 * @param aSrc  pointer to (at least) four input bytes
 * @param aOut  receives two PRUnichars on success (high surrogate in
 *              aOut[0], low surrogate in aOut[1]); untouched on failure
 * @return PR_TRUE when a surrogate pair was written, PR_FALSE when any byte
 *         is outside its legal range or the sequence addresses a code point
 *         beyond U+10FFFF.
 */
PRBool nsGB18030ToUnicode::DecodeToSurrogate(const char* aSrc, PRUnichar* aOut)
{
  NS_ASSERTION(FIRST_BYTE_IS_SURROGATE(aSrc[0]),       "illegal first byte");
  NS_ASSERTION(LEGAL_GBK_4BYTE_SECOND_BYTE(aSrc[1]),   "illegal second byte");
  NS_ASSERTION(LEGAL_GBK_4BYTE_THIRD_BYTE(aSrc[2]),    "illegal third byte");
  NS_ASSERTION(LEGAL_GBK_4BYTE_FORTH_BYTE(aSrc[3]),    "illegal forth byte");
  // Re-check at runtime as well: the assertions above only fire in debug
  // builds.
  if(! FIRST_BYTE_IS_SURROGATE(aSrc[0]))
    return PR_FALSE;
  if(! LEGAL_GBK_4BYTE_SECOND_BYTE(aSrc[1]))
    return PR_FALSE;
  if(! LEGAL_GBK_4BYTE_THIRD_BYTE(aSrc[2]))
    return PR_FALSE;
  if(! LEGAL_GBK_4BYTE_FORTH_BYTE(aSrc[3]))
    return PR_FALSE;

  // Rebase every byte to the start of its legal range.
  PRUint8 a1 = (PRUint8) aSrc[0];
  PRUint8 a2 = (PRUint8) aSrc[1];
  PRUint8 a3 = (PRUint8) aSrc[2];
  PRUint8 a4 = (PRUint8) aSrc[3];
  a1 -= (PRUint8)0x90;
  a2 -= (PRUint8)0x30;
  a3 -= (PRUint8)0x81;
  a4 -= (PRUint8)0x30;
  // Mixed-radix linear index (radices 10, 126, 10 for bytes 2-4). The two
  // 10-bit halves of idx become the surrogates, i.e. idx is the code point
  // minus 0x10000.
  PRUint32 idx = (((a1 * 10 + a2 ) * 126 + a3) * 10) + a4;

  // Only 0..0xFFFFF addresses U+10000..U+10FFFF. Lead bytes up to 0xFE can
  // yield larger values, which would otherwise wrap under the 10-bit mask
  // below and decode to a valid-looking but wrong character.
  if (idx > 0x000FFFFF)
    return PR_FALSE;

  *aOut++ = 0xD800 | (0x000003FF & (idx >> 10));
  *aOut = 0xDC00 | (0x000003FF & idx);

  return PR_TRUE;
}
// Decode one two-byte sequence with the charset-specific extension decoder,
// creating that decoder on first use.
//
// aSrc points at the two input bytes and aOut receives one PRUnichar.
// Returns PR_TRUE when the delegate's Convert() reports success, PR_FALSE
// when it fails or when no decoder could be created.
PRBool nsGBKToUnicode::TryExtensionDecoder(const char* aSrc, PRUnichar* aOut)
{
  if (!mExtensionDecoder) {
    CreateExtensionDecoder();
  }
  NS_ASSERTION(mExtensionDecoder, "cannot creqte 2 bytes unique converter");
  if (!mExtensionDecoder) {
    return PR_FALSE;
  }

  // Start from a clean decoder state on every call.
  nsresult status = mExtensionDecoder->Reset();
  NS_ASSERTION(NS_SUCCEEDED(status), "2 bytes unique conversoin reset failed");

  PRInt32 consumed = 2;
  PRInt32 produced = 1;
  status = mExtensionDecoder->Convert(aSrc, &consumed, aOut, &produced);
  NS_ASSERTION(NS_FAILED(status) || ((consumed==2) && (produced == 1)),
     "some strange conversion result");

  // On failure the caller substitutes the replacement character itself.
  return NS_SUCCEEDED(status) ? PR_TRUE : PR_FALSE;
}
// The plain GBK decoder never produces surrogate pairs: this always reports
// failure and leaves aOut untouched. nsGB18030ToUnicode supplies the real
// implementation.
PRBool nsGBKToUnicode::DecodeToSurrogate(const char* aSrc, PRUnichar* aOut)
{
  return PR_FALSE;
}
// Decode one four-byte sequence with the four-byte delegate decoder,
// creating that decoder on first use.
//
// aSrc points at the four input bytes and aOut receives one PRUnichar.
// Returns PR_TRUE when the delegate's Convert() reports success, PR_FALSE
// when it fails or when there is no four-byte decoder (Create4BytesDecoder()
// may leave m4BytesDecoder null).
PRBool nsGBKToUnicode::Try4BytesDecoder(const char* aSrc, PRUnichar* aOut)
{
  if (!m4BytesDecoder) {
    Create4BytesDecoder();
  }
  if (!m4BytesDecoder) {
    return PR_FALSE;
  }

  // Start from a clean decoder state on every call.
  nsresult status = m4BytesDecoder->Reset();
  NS_ASSERTION(NS_SUCCEEDED(status), "4 bytes unique conversoin reset failed");

  PRInt32 consumed = 4;
  PRInt32 produced = 1;
  status = m4BytesDecoder->Convert(aSrc, &consumed, aOut, &produced);
  NS_ASSERTION(NS_FAILED(status) || ((consumed==4) && (produced == 1)),
     "some strange conversion result");

  // On failure the caller substitutes the replacement character itself.
  return NS_SUCCEEDED(status) ? PR_TRUE : PR_FALSE;
}